From a0126429a16c79db4097a30f28e2c4f70fe6dda3 Mon Sep 17 00:00:00 2001 From: Alexander Miroshnichenko Date: Wed, 30 Jul 2025 18:07:07 +0300 Subject: [PATCH] sys-kernel/hardened-kernel: version update to 6.15.8 Signed-off-by: Alexander Miroshnichenko --- sys-kernel/hardened-kernel/Manifest | 28 +- .../files/linux-6.12.amd64.config | 7124 --- ...d-from-bcachefs-for-upstream-69a5a13.patch | 23844 --------- ..._bcachefs-revert-6.14-backport-fixes.patch | 128 - ...rry-pick-updates-from-master-17227e8.patch | 41408 ---------------- ..._openpax-cherry-pick-updates-fb1be96.patch | 720 - ...4.amd64.config => linux-6.15.amd64.config} | 211 +- ...1189_restrict-fs-causes-bpf-verifier.patch | 0 .../hardened-kernel-6.12.19.ebuild | 145 - ...8.ebuild => hardened-kernel-6.15.8.ebuild} | 77 +- 10 files changed, 159 insertions(+), 73526 deletions(-) delete mode 100644 sys-kernel/hardened-kernel/files/linux-6.12.amd64.config delete mode 100644 sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-upd-from-bcachefs-for-upstream-69a5a13.patch delete mode 100644 sys-kernel/hardened-kernel/files/linux-6.14/1190_bcachefs-revert-6.14-backport-fixes.patch delete mode 100644 sys-kernel/hardened-kernel/files/linux-6.14/1191_bcachefs-cherry-pick-updates-from-master-17227e8.patch delete mode 100644 sys-kernel/hardened-kernel/files/linux-6.14/1199_openpax-cherry-pick-updates-fb1be96.patch rename sys-kernel/hardened-kernel/files/{linux-6.14.amd64.config => linux-6.15.amd64.config} (98%) rename sys-kernel/hardened-kernel/files/{linux-6.14 => linux-6.15}/1189_restrict-fs-causes-bpf-verifier.patch (100%) delete mode 100644 sys-kernel/hardened-kernel/hardened-kernel-6.12.19.ebuild rename sys-kernel/hardened-kernel/{hardened-kernel-6.14.8.ebuild => hardened-kernel-6.15.8.ebuild} (64%) diff --git a/sys-kernel/hardened-kernel/Manifest b/sys-kernel/hardened-kernel/Manifest index 4b01522..baacc0a 100644 --- a/sys-kernel/hardened-kernel/Manifest +++ b/sys-kernel/hardened-kernel/Manifest @@ -1,20 +1,10 @@ -DIST genpatches-6.12-23.base.tar.xz 1435492 BLAKE2B ca65b4ead188bb8c561e47dd7aca29c2cb10d98ed28e78113cedd1bf9d9bf2a380bf12a807bcfc3cce3976621355e087cb8a2a5a06857660401eea0e9156830f SHA512 82fc23bb6e04227bcea2d29336d5a46a6e7f1649244b9ceae2869fac65e3f785e7512ea8d1e32f34281d48c76831223cc5c8b448452d2dd036445773a1329c6d -DIST genpatches-6.12-23.experimental.tar.xz 78500 BLAKE2B f7c0bbe38f90fe3c203725c83bae75f105de44ecc3b4bb5d262056936cc472f8678d50900587e51fd388ed54d95fefa624ba86642f5d12bfc650f0bb4a2a0e37 SHA512 9738997ec9056d66a0e56fb21bc1d6f06c198394993d2960c13acf29821b0f6f1e8b6637abca0abdd3e57ff25b734286a309d991c9614fe6b9ee1f8de59e25fc -DIST genpatches-6.12-23.extras.tar.xz 4056 BLAKE2B dc27e7f57ea95e678f08d3b6f791a26cec5b51e2204f3d527538f3c54333c8f25194981cdc68b7812973ee8baa95e0d5c575be26e918b25c160178d3bcf80769 SHA512 c7d92cc303dde284b5c1f31b87081167a1a8645e5611a65780d09ebc49f9cc2ded94007d10e1764d90e0d25e31fa73095227d381977c1ba13714654a328ac77f -DIST genpatches-6.14-9.base.tar.xz 751104 BLAKE2B 54247d1f3e1639761408bd622efd9ecb1311ec87f5b231ab6e243829b2ef0ab828b7743b38599b655684229875fb07127c931c2bb1de65c05318d54b832ba7a9 SHA512 de7fff5b69767c1fbe7d3dabc97be4777f22c90a47eb137a8a69756ed0fca36a9b962650215ec91b985ad35057bcca0e2a824c71b4d3cde0100e2b7e8e8edceb -DIST genpatches-6.14-9.experimental.tar.xz 79816 BLAKE2B f72de3acdeff2c48e01e488144befceea4e8cd7fbc94b1bd36078b998ca6da3f807db1e2368fb48a8f38fe80627d94583c9c26457f8f560a80642c18ecd437cb SHA512 d678dc235b5e120205e093ddaa86349dc2f2f6613596044a2fb18f94b29da15f4ecbd02fb14a4c61ab1e18b7fb43a494fe8f12d7de01efa45de27ac62bb0406c -DIST genpatches-6.14-9.extras.tar.xz 4056 BLAKE2B 431e8bd76cd1edce40f831c16c9971fd21ebdddb7720bca0028a70c42fdd97d483de920248eff645cb5902684df40b21a7b68ca6e714831b216792c4a2a910e8 SHA512 5e112f31f2b0ec5d25d2d19897ced19b3d3e632d272bac4ae1a27c701235e3c981eb7bd95c176f6a9f9cefbcb0304a1d48b99aea4d091222ac5781ce5dbd4682 -DIST gentoo-kernel-config-g15.tar.gz 5746 BLAKE2B 2baef40e18cbf3bb975362d71ad674604ad80338d1e6914ffa2fc03af26f1b9c9cec66d39a56077693a7327060bc29bd5eb5787549857dd3779e841192cb2016 SHA512 45d609ee25a529988868b8c99c808f4e6abbde34ae501bf35523431ee85c9f3657663d6f938f2036a5f012a55a68965b32fd41fc44d8f6ca606a6fd38f952445 DIST gentoo-kernel-config-g16.tar.gz 5995 BLAKE2B cddb80d45169749c707d87efd186f7a981534aab2479b6c51790008ea61e9f9feac35d0d74b95dc18281e4b81771e09f259a1d9f216f5d7f806fa7cd6aeeb4d1 SHA512 f8114e645e1ab99e45703790b7e43c2fa9ee17b41a2265dccdd9187c122bf8b5a09ba918fbcf094aa899bb959f05d105ed474b75cdfa9a19c4d49fd138825647 -DIST kernel-aarch64-fedora.config.6.12.8-gentoo 288081 BLAKE2B 08273a34c387621d0ccffcc325a0a34b40e0a8fbe78f2429c8a9efc73aa05f8fb563ed53e5fadb25662089f23ebafb61b2d08f91ea00b073e67e702798255e9c SHA512 58ea4f247aa9af6f7535ab5fe44dae2fbf286c7fbceeda86df532125807bbd4c25a89ddeeff4284592efefbaaef5022626abad7f1d1d64976e3040dc6e89251a -DIST kernel-aarch64-fedora.config.6.14.5-gentoo 291637 BLAKE2B 3255e3c098f6c161328633886473ee4cec96799545e9b1a106b1f3fe59fa373407435ee970a9b5d442ceb26869ddc8cf62c962105757630be2fc741a378c4014 SHA512 aded4b58a526b1fd8ece961f04a0885d12fb860bb9e246489242a3060bbebfad904ed1a72935bc6f5a9aabf9b062eccada430772299a958e8393e2980d5b255a -DIST kernel-i686-fedora.config.6.12.8-gentoo 255163 BLAKE2B 7015bbcfb2aed0ba70173dc7e9abf464e167184e2bc8cea6d26623972bbe6b42956241a7d75ff8604d70d5c0202db6e40cdb890abfcea3d0c8e0d00aa869353e SHA512 a1b4b688510a231fe079b4158e8aaddaddc4e719367132668279edcb16e32b6c7f2c449ec196646b0986171dc43a82475255502ae40679e0433de9f9876e0a20 -DIST kernel-i686-fedora.config.6.14.5-gentoo 259410 BLAKE2B c28dfc8cd90f60b57ac80f357ea787bbb68e86e58e21880f643bd5276121f9ff1f6afaf70852694d8bb3a11616c278281d067e248baa439487d7870f76ac7b25 SHA512 d53d840cc1dc2359b3b03198c3416e2f2cbceae1e0555478ab6592d7b280aac07da2a3813c3e3e175160c4674adcb6ec4232b0073fc40b14e64dcc60b278e400 -DIST kernel-ppc64le-fedora.config.6.12.8-gentoo 241851 BLAKE2B e7b8833572348037d7af2ba1f9671e8010276d853e0d85b8a175c0dcc5c212c57c7660be54a7ba2621c427cc8120acbeb1063a1c1a1c293894ebe1d63921b684 SHA512 7a81fe1c4b1d4fb9f2d68846882a8869c0888c8fa764ec41f41d27e61b7a69825ea8ceba2209b40deb7c0e8f4bb2c5d81226a4f28e7ca8aff4788bc7a3292194 -DIST kernel-ppc64le-fedora.config.6.14.5-gentoo 243480 BLAKE2B 1174ca580d5de7db3e94e7a040e655136ce9356c6bccd9c7d6e0c4c6f0cae0b325ac0fd3345e6e493891a4e648ccecf22a0e36adb53f4e028611cf99d1ca4876 SHA512 99adb16c8a84c31442dfae67a828eaa6689fb1b96b4d5fe2505bcf5ae1803e9df742eb239c4524566375eabe64cb93d26390edb4ff651f48cb5f87c27895e2b4 -DIST kernel-x86_64-fedora.config.6.12.8-gentoo 256210 BLAKE2B f14f7de8ae573561824df47cf94c3c0ce52a820456ebd0e618e4c1e7f5454b7d3f6f86c559a3cd98dd94c55aaeed397f3d0cee6b0e37cf6b47d3aedd920a9dea SHA512 ea87b4b45c78888e02d0288dd5844cf2d97a14e251b565c7d6451a0e62fbe0dbef38f46715467af2f869995d6bbc8be61d5b70476a86d607a5bfa27fbaf36e92 -DIST kernel-x86_64-fedora.config.6.14.5-gentoo 260496 BLAKE2B b68058a75bc02afcc3e45371be25d295ccb959efb9047ec394d1d11becea30f3d9007e78da02253ebb8cea41500e0fbb392866b1086c9746cdefdb78cc4edd3c SHA512 2f1e6f112db46bd3765e29cfe0cff1f45991d652c49b520b46b0c5ced4c995e2ef7753c13730b0a918379200cb05f50eaadc827516136177ea5900b4e10d6192 -DIST linux-6.12.tar.xz 147906904 BLAKE2B b2ec2fc69218cacabbbe49f78384a5d259ca581b717617c12b000b16f4a4c59ee348ea886b37147f5f70fb9a7a01c1e2c8f19021078f6b23f5bc62d1c48d5e5e SHA512 a37b1823df7b4f72542f689b65882634740ba0401a42fdcf6601d9efd2e132e5a7650e70450ba76f6cd1f13ca31180f2ccee9d54fe4df89bc0000ade4380a548 -DIST linux-6.14.tar.xz 149408504 BLAKE2B 11835719804b406fe281ea1c276a84dc0cbaa808552ddcca9233d3eaeb1c001d0455c7205379b02de8e8db758c1bae6fe7ceb6697e63e3cf9ae7187dc7a9715e SHA512 71dcaa3772d8d9797c3ae30cae9c582b11a7047a3bbcb8dfd479a4dffb40ff0da74cf3d45175f50cc9992e338bcadd46c9c570f54054ca3bde6661768d3d22eb -DIST linux-hardened-v6.12.19-hardened1.patch 89621 BLAKE2B dcd5dace9b76852547e02ce79f98eb417ebe0290654f6f19d18655d873c868a4e84d72608714e0bf02ae71178726cf69bcee20c38b30b590ef44de9ba7b88470 SHA512 e96e7028303d2d7660d71de2e90a03ea467bafeb3af296c456d859235274d1c92b9f92b093bc3747f1f47d9f0a2ed2e501b05baf22a483b473dc00cb983433ed -DIST linux-hardened-v6.14.8-hardened1.patch 90843 BLAKE2B 9e1d570a0fc91ad249365b2821ffddfa24e822f251a82eede6db827951de799b45bef7223e9c8b7479eba9d130a1205f19750d7e92ffae9784d30563c9bd1789 SHA512 dcf7a7b5456de0d05b9821462d4a1aa20162314092a8a0a855aeac586bdde5d8e7d35b594abd0416c7a11bb0b28f851ab91c36d153565cd76a025e1f8bb81fb9 +DIST kernel-aarch64-fedora.config.6.15.6-gentoo 294308 BLAKE2B 62b4c06f14572cf3ffda30e0d2d3f1d08d9c45b072888e2ffef7d771a88efecd84a463e9b42f6b676f13adc65ec95bfda9ebfa84cb722514bfb7212ffd08d3bc SHA512 058e072cfdfd0314e38f5b538fd82db4140844366f666ba2afd152ca584067ff53d0c63ae9a73be31dec49c69b4301b2a52e328fb4c712ef1f729d806abea950 +DIST kernel-i686-fedora.config.6.15.6-gentoo 261602 BLAKE2B aa195831ccd61cba9c0b7fde51af25d4accc2953c1fdd35c7b007aea4e2a747ce043b0aff90f71aff1b8dd42d22aa99a387fdbcab844edb40f962a0a1d0e0e41 SHA512 4600b2244b34889cbe6014858d012c208878f1815decbc17dd618bcd2a6f37908b7b869286bd2639f6a64ef73c7c199406edd08ed13cc7f4ba9273d76373222d +DIST kernel-ppc64le-fedora.config.6.15.6-gentoo 245484 BLAKE2B b59792e2fe18e0b3ad8459bd5f7dedee3149505be63f15f5ca02b88e95eed0e4dfaf204f95a00761595092163d12967aa8a71ba13103682ded8dd9b70063412b SHA512 ca0915a6be9d0028c4f5d57a145d598fa1c2dc77700bbaeaee5116d611da476e3a3849bbe0bf84935c85946f100d1f0b824c7892651b943986cd8a4f042f5c29 +DIST kernel-x86_64-fedora.config.6.15.6-gentoo 262707 BLAKE2B 4727bbee8f4458c2627d9f808c7baa1df2828844e92a1d6733b7ed1525e76b0b5664f224ef6e1697f949d64602fb2fcbbe913f68c9ddc008d5be01d29c7a847e SHA512 353e804ef2ee63ad8b353a52ea3c905f58668e4162bda6024a0201db1634250eaa782f30a1dd1220a6bc1df364c141167f4e2eacd221f47c022e3a23dc987afb +DIST linux-6.15.tar.xz 151168812 BLAKE2B 465596c6dc053ff3a3966302a906d3edb4f7ee1ef82f8c20b96360196d3414f5b1deeafa67b8340fcdecd3617280ba9b756d7073ad15c707865e256397b4af53 SHA512 d03788ffa8d8ae1b84ef1286bb44a08fc01432e509dfec6cccae5c5a5a47201d378aec2bcd21e6f0bbd1e625b26f47780c06ee9c1cef3775358f98b160923f30 +DIST linux-6.x-sha256sums-20250724.asc 159789 BLAKE2B 9a85b54a784ea9f026ccc7d63d961239f09c0e656a67eac035fac6d7f3eb8bbebd67097a1a38b6d06ab232e79411e6d0ea2cce30eb7972df4cac65fc5f63c664 SHA512 dd79403876b28843987b7685962a9f79f9dae3ae680ff1dd915ec78218c235df4177d1197f8ff9e2a05cb11a3464808dda4c15441626e0fc5b91bbbc217537d0 +DIST linux-gentoo-patches-6.15.8.tar.xz 85336 BLAKE2B 04e7f012c9375fac93fdbfd97a7450ed9022110c56eff2b9b76a856d83c2dab97da983c4c577c7df3f06ec889771f772f281d0ae837b3e07dee30fec79dad110 SHA512 cc51a68444a20c4f106fc16d6f6327d40372faf77fad8e76936a50f97102736166efffc107a405cfe8124b61dbf1370095b811d0172af9003d6e6eaca4effa64 +DIST linux-hardened-v6.15.8-hardened2.patch 90401 BLAKE2B be2f2fb1a4af1231b52f24b7bbcd8ef766cc6ea8883f0a75dafc3ed337e02671265a039c26e8540b8a45013b427a410ae3b393eab57413b07e9d875436a0ddb9 SHA512 44ba90e044a1ce1999c08a68be7023dea3cef3e87411fe6a5c47b10d6ebba01589bf3f166bb22465e08ce209531816723fc2601a339983ea1a8032bf76dbc6b8 +DIST patch-6.15.8.xz 706220 BLAKE2B dea53067591c113c1cc1c5546c6ca1a561199c9b5d36e7b68c0dc712fd0851c879bc24f6bfcf38aa044c0e2dbe565bca4e5f655aa3c48755c5efda8e6fb5e4f2 SHA512 a6341a8c6ca08a0a02598ea2b60dfcf88f99aededda4e3c37bbb8a07fc2d8720d590d9054ef59a8c611f7e4fe1184e3781b6c1a6e0daabac9015597236354158 diff --git a/sys-kernel/hardened-kernel/files/linux-6.12.amd64.config b/sys-kernel/hardened-kernel/files/linux-6.12.amd64.config deleted file mode 100644 index 8f64b79..0000000 --- a/sys-kernel/hardened-kernel/files/linux-6.12.amd64.config +++ /dev/null @@ -1,7124 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. -# Linux/x86 6.12.4 Kernel Configuration -# -CONFIG_CC_VERSION_TEXT="gcc (Gentoo Hardened 14.2.1_p20241116 p3) 14.2.1 20241116" -CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=140201 -CONFIG_CLANG_VERSION=0 -CONFIG_AS_IS_GNU=y -CONFIG_AS_VERSION=24000 -CONFIG_LD_IS_BFD=y -CONFIG_LD_VERSION=24000 -CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107900 -CONFIG_RUSTC_LLVM_VERSION=180107 -CONFIG_CC_CAN_LINK=y -CONFIG_CC_CAN_LINK_STATIC=y -CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y -CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y -CONFIG_TOOLS_SUPPORT_RELR=y -CONFIG_CC_HAS_ASM_INLINE=y -CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y -CONFIG_PAHOLE_VERSION=127 -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_TABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_INIT_ENV_ARG_LIMIT=32 -# CONFIG_COMPILE_TEST is not set -CONFIG_WERROR=y -CONFIG_LOCALVERSION="" -# CONFIG_LOCALVERSION_AUTO is not set -CONFIG_BUILD_SALT="" -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -CONFIG_HAVE_KERNEL_ZSTD=y -# CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -# CONFIG_KERNEL_XZ is not set -# CONFIG_KERNEL_LZO is not set -# CONFIG_KERNEL_LZ4 is not set -CONFIG_KERNEL_ZSTD=y -CONFIG_DEFAULT_INIT="" -CONFIG_DEFAULT_HOSTNAME="gentoo" -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_SYSVIPC_COMPAT=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_WATCH_QUEUE=y -CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_HARDIRQS_SW_RESEND=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_IRQ_MSI_IOMMU=y -CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y -CONFIG_GENERIC_IRQ_RESERVATION_MODE=y -CONFIG_GENERIC_IRQ_STAT_SNAPSHOT=y -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -# end of IRQ subsystem - -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y -CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y -CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y -CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y -CONFIG_CONTEXT_TRACKING=y -CONFIG_CONTEXT_TRACKING_IDLE=y - -# -# Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ_COMMON=y -# CONFIG_HZ_PERIODIC is not set -CONFIG_NO_HZ_IDLE=y -# CONFIG_NO_HZ_FULL is not set -CONFIG_CONTEXT_TRACKING_USER=y -CONFIG_CONTEXT_TRACKING_USER_FORCE=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US=100 -# end of Timers subsystem - -CONFIG_BPF=y -CONFIG_HAVE_EBPF_JIT=y -CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y - -# -# BPF subsystem -# -CONFIG_BPF_SYSCALL=y -CONFIG_BPF_JIT=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_BPF_UNPRIV_DEFAULT_OFF=y -# CONFIG_BPF_PRELOAD is not set -CONFIG_BPF_LSM=y -# end of BPF subsystem - -CONFIG_PREEMPT_BUILD=y -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_PREEMPTION=y -CONFIG_PREEMPT_DYNAMIC=y -CONFIG_SCHED_CORE=y - -# -# CPU/Task time and stats accounting -# -CONFIG_VIRT_CPU_ACCOUNTING=y -# CONFIG_TICK_CPU_ACCOUNTING is not set -CONFIG_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_SCHED_AVG_IRQ=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_PSI=y -# CONFIG_PSI_DEFAULT_DISABLED is not set -# end of CPU/Task time and stats accounting - -CONFIG_CPU_ISOLATION=y - -# -# RCU Subsystem -# -CONFIG_TREE_RCU=y -CONFIG_PREEMPT_RCU=y -# CONFIG_RCU_EXPERT is not set -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU_GENERIC=y -CONFIG_NEED_TASKS_RCU=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_RUDE_RCU=y -CONFIG_TASKS_TRACE_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -# end of RCU Subsystem - -CONFIG_IKCONFIG=m -CONFIG_IKCONFIG_PROC=y -CONFIG_IKHEADERS=m -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_PRINTK_INDEX=y -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y - -# -# Scheduler features -# -# CONFIG_UCLAMP_TASK is not set -# CONFIG_SCHED_ALT is not set -# end of Scheduler features - -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y -CONFIG_CC_HAS_INT128=y -CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5" -CONFIG_GCC10_NO_ARRAY_BOUNDS=y -CONFIG_CC_NO_ARRAY_BOUNDS=y -CONFIG_GCC_NO_STRINGOP_OVERFLOW=y -CONFIG_CC_NO_STRINGOP_OVERFLOW=y -CONFIG_ARCH_SUPPORTS_INT128=y -# CONFIG_NUMA_BALANCING is not set -CONFIG_SLAB_OBJ_EXT=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -# CONFIG_CGROUP_FAVOR_DYNMODS is not set -CONFIG_MEMCG=y -# CONFIG_MEMCG_V1 is not set -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y -CONFIG_GROUP_SCHED_WEIGHT=y -CONFIG_FAIR_GROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_SCHED_MM_CID=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_RDMA=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -# CONFIG_CPUSETS_V1 is not set -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -CONFIG_CGROUP_MISC=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_TIME_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_USER_NS_UNPRIVILEGED=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_SCHED_AUTOGROUP=y -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -# CONFIG_RD_GZIP is not set -# CONFIG_RD_BZIP2 is not set -# CONFIG_RD_LZMA is not set -CONFIG_RD_XZ=y -# CONFIG_RD_LZO is not set -# CONFIG_RD_LZ4 is not set -CONFIG_RD_ZSTD=y -CONFIG_BOOT_CONFIG=y -# CONFIG_BOOT_CONFIG_FORCE is not set -# CONFIG_BOOT_CONFIG_EMBED is not set -CONFIG_INITRAMFS_PRESERVE_MTIME=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_LD_ORPHAN_WARN=y -CONFIG_LD_ORPHAN_WARN_LEVEL="error" -CONFIG_SYSCTL=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -# CONFIG_EXPERT is not set -CONFIG_UID16=y -CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -CONFIG_SYSFS_SYSCALL=y -CONFIG_FHANDLE=y -CONFIG_POSIX_TIMERS=y -CONFIG_PRINTK=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_IO_URING=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_MEMBARRIER=y -CONFIG_KCMP=y -CONFIG_RSEQ=y -CONFIG_CACHESTAT_SYSCALL=y -CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_SELFTEST is not set -# CONFIG_KALLSYMS_ALL is not set -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y -CONFIG_HAVE_PERF_EVENTS=y -CONFIG_GUEST_PERF_EVENTS=y - -# -# Kernel Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -# end of Kernel Performance Events And Counters - -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y - -# -# Kexec and crash features -# -# CONFIG_KEXEC is not set -# CONFIG_KEXEC_FILE is not set -# end of Kexec and crash features -# end of General setup - -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_AUDIT_ARCH=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_PGTABLE_LEVELS=4 -CONFIG_CC_HAS_SANE_STACKPROTECTOR=y - -# -# Processor type and features -# -CONFIG_SMP=y -CONFIG_X86_X2APIC=y -CONFIG_X86_POSTED_MSI=y -CONFIG_X86_MPPARSE=y -CONFIG_X86_CPU_RESCTRL=y -CONFIG_X86_FRED=y -# CONFIG_X86_EXTENDED_PLATFORM is not set -CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y -CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -# CONFIG_PARAVIRT_DEBUG is not set -CONFIG_PARAVIRT_SPINLOCKS=y -CONFIG_X86_HV_CALLBACK_VECTOR=y -# CONFIG_XEN is not set -CONFIG_KVM_GUEST=y -CONFIG_ARCH_CPUIDLE_HALTPOLL=y -CONFIG_PVH=y -CONFIG_PARAVIRT_TIME_ACCOUNTING=y -CONFIG_PARAVIRT_CLOCK=y -# CONFIG_JAILHOUSE_GUEST is not set -# CONFIG_ACRN_GUEST is not set -# CONFIG_INTEL_TDX_GUEST is not set -# CONFIG_MK8 is not set -# CONFIG_MK8SSE3 is not set -# CONFIG_MK10 is not set -# CONFIG_MBARCELONA is not set -# CONFIG_MBOBCAT is not set -# CONFIG_MJAGUAR is not set -# CONFIG_MBULLDOZER is not set -# CONFIG_MPILEDRIVER is not set -# CONFIG_MSTEAMROLLER is not set -# CONFIG_MEXCAVATOR is not set -# CONFIG_MZEN is not set -# CONFIG_MZEN2 is not set -# CONFIG_MZEN3 is not set -# CONFIG_MZEN4 is not set -# CONFIG_MZEN5 is not set -# CONFIG_MPSC is not set -# CONFIG_MATOM is not set -# CONFIG_MCORE2 is not set -# CONFIG_MNEHALEM is not set -# CONFIG_MWESTMERE is not set -# CONFIG_MSILVERMONT is not set -# CONFIG_MGOLDMONT is not set -# CONFIG_MGOLDMONTPLUS is not set -# CONFIG_MSANDYBRIDGE is not set -# CONFIG_MIVYBRIDGE is not set -# CONFIG_MHASWELL is not set -# CONFIG_MBROADWELL is not set -# CONFIG_MSKYLAKE is not set -# CONFIG_MSKYLAKEX is not set -# CONFIG_MCANNONLAKE is not set -# CONFIG_MICELAKE is not set -# CONFIG_MCASCADELAKE is not set -# CONFIG_MCOOPERLAKE is not set -# CONFIG_MTIGERLAKE is not set -# CONFIG_MSAPPHIRERAPIDS is not set -# CONFIG_MROCKETLAKE is not set -# CONFIG_MALDERLAKE is not set -# CONFIG_MRAPTORLAKE is not set -# CONFIG_MMETEORLAKE is not set -# CONFIG_MEMERALDRAPIDS is not set -CONFIG_GENERIC_CPU=y -# CONFIG_MNATIVE_INTEL is not set -# CONFIG_MNATIVE_AMD is not set -CONFIG_X86_64_VERSION=2 -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_HAVE_PAE=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_IA32_FEAT_CTL=y -CONFIG_X86_VMX_FEATURE_NAMES=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_HYGON=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_ZHAOXIN=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -# CONFIG_GART_IOMMU is not set -CONFIG_BOOT_VESA_SUPPORT=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS_RANGE_BEGIN=2 -CONFIG_NR_CPUS_RANGE_END=512 -CONFIG_NR_CPUS_DEFAULT=64 -CONFIG_NR_CPUS=8 -CONFIG_SCHED_CLUSTER=y -CONFIG_SCHED_SMT=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_ACPI_MADT_WAKEUP=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -# CONFIG_X86_MCE_INJECT is not set - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=y -CONFIG_PERF_EVENTS_INTEL_RAPL=y -CONFIG_PERF_EVENTS_INTEL_CSTATE=y -# CONFIG_PERF_EVENTS_AMD_POWER is not set -CONFIG_PERF_EVENTS_AMD_UNCORE=m -# CONFIG_PERF_EVENTS_AMD_BRS is not set -# end of Performance monitoring - -CONFIG_X86_VSYSCALL_EMULATION=y -# CONFIG_X86_IOPL_IOPERM is not set -CONFIG_MICROCODE=y -# CONFIG_MICROCODE_LATE_LOADING is not set -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -# CONFIG_X86_5LEVEL is not set -CONFIG_X86_DIRECT_GBPAGES=y -# CONFIG_X86_CPA_STATISTICS is not set -# CONFIG_AMD_MEM_ENCRYPT is not set -CONFIG_NUMA=y -# CONFIG_AMD_NUMA is not set -CONFIG_X86_64_ACPI_NUMA=y -CONFIG_NODES_SHIFT=2 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -# CONFIG_ARCH_MEMORY_PROBE is not set -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 -CONFIG_X86_PAT=y -CONFIG_X86_UMIP=y -CONFIG_CC_HAS_IBT=y -CONFIG_X86_CET=y -CONFIG_X86_KERNEL_IBT=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -CONFIG_ARCH_PKEY_BITS=4 -# CONFIG_X86_INTEL_TSX_MODE_OFF is not set -# CONFIG_X86_INTEL_TSX_MODE_ON is not set -CONFIG_X86_INTEL_TSX_MODE_AUTO=y -# CONFIG_X86_SGX is not set -# CONFIG_X86_USER_SHADOW_STACK is not set -# CONFIG_INTEL_TDX_HOST is not set -CONFIG_EFI=y -CONFIG_EFI_STUB=y -# CONFIG_EFI_HANDOVER_PROTOCOL is not set -# CONFIG_EFI_MIXED is not set -# CONFIG_HZ_100 is not set -# CONFIG_HZ_250 is not set -CONFIG_HZ_300=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=300 -CONFIG_SCHED_HRTICK=y -CONFIG_ARCH_SUPPORTS_KEXEC=y -CONFIG_ARCH_SUPPORTS_KEXEC_FILE=y -CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY=y -CONFIG_ARCH_SUPPORTS_KEXEC_SIG=y -CONFIG_ARCH_SUPPORTS_KEXEC_SIG_FORCE=y -CONFIG_ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG=y -CONFIG_ARCH_SUPPORTS_KEXEC_JUMP=y -CONFIG_ARCH_SUPPORTS_CRASH_DUMP=y -CONFIG_ARCH_DEFAULT_CRASH_DUMP=y -CONFIG_ARCH_SUPPORTS_CRASH_HOTPLUG=y -CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x1000000 -CONFIG_DYNAMIC_MEMORY_LAYOUT=y -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa -CONFIG_HOTPLUG_CPU=y -# CONFIG_COMPAT_VDSO is not set -# CONFIG_LEGACY_VSYSCALL_XONLY is not set -CONFIG_LEGACY_VSYSCALL_NONE=y -CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="vdso32=0 page_poison=1 page_alloc.shuffle=1 slab_nomerge pti=on" -# CONFIG_CMDLINE_OVERRIDE is not set -# CONFIG_MODIFY_LDT_SYSCALL is not set -# CONFIG_STRICT_SIGALTSTACK_SIZE is not set -CONFIG_HAVE_LIVEPATCH=y -# end of Processor type and features - -CONFIG_CC_HAS_NAMED_AS=y -CONFIG_CC_HAS_NAMED_AS_FIXED_SANITIZERS=y -CONFIG_USE_X86_SEG_SUPPORT=y -CONFIG_CC_HAS_SLS=y -CONFIG_CC_HAS_RETURN_THUNK=y -CONFIG_CC_HAS_ENTRY_PADDING=y -CONFIG_FUNCTION_PADDING_CFI=11 -CONFIG_FUNCTION_PADDING_BYTES=16 -CONFIG_CALL_PADDING=y -CONFIG_HAVE_CALL_THUNKS=y -CONFIG_CALL_THUNKS=y -CONFIG_PREFIX_SYMBOLS=y -CONFIG_CPU_MITIGATIONS=y -CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y -CONFIG_MITIGATION_RETPOLINE=y -CONFIG_MITIGATION_RETHUNK=y -CONFIG_MITIGATION_UNRET_ENTRY=y -CONFIG_MITIGATION_CALL_DEPTH_TRACKING=y -# CONFIG_CALL_THUNKS_DEBUG is not set -CONFIG_MITIGATION_IBPB_ENTRY=y -CONFIG_MITIGATION_IBRS_ENTRY=y -CONFIG_MITIGATION_SRSO=y -CONFIG_MITIGATION_SLS=y -CONFIG_MITIGATION_GDS=y -CONFIG_MITIGATION_RFDS=y -CONFIG_MITIGATION_SPECTRE_BHI=y -CONFIG_MITIGATION_MDS=y -CONFIG_MITIGATION_TAA=y -CONFIG_MITIGATION_MMIO_STALE_DATA=y -CONFIG_MITIGATION_L1TF=y -CONFIG_MITIGATION_RETBLEED=y -CONFIG_MITIGATION_SPECTRE_V1=y -CONFIG_MITIGATION_SPECTRE_V2=y -CONFIG_MITIGATION_SRBDS=y -CONFIG_MITIGATION_SSB=y -CONFIG_ARCH_HAS_ADD_PAGES=y - -# -# Power management and ACPI options -# -CONFIG_ARCH_HIBERNATION_HEADER=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -CONFIG_HIBERNATE_CALLBACKS=y -CONFIG_HIBERNATION=y -CONFIG_HIBERNATION_SNAPSHOT_DEV=y -CONFIG_HIBERNATION_COMP_LZO=y -# CONFIG_HIBERNATION_COMP_LZ4 is not set -CONFIG_HIBERNATION_DEF_COMP="lzo" -CONFIG_PM_STD_PARTITION="" -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -# CONFIG_PM_USERSPACE_AUTOSLEEP is not set -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -# CONFIG_PM_DEBUG is not set -CONFIG_PM_CLK=y -CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y -CONFIG_ENERGY_MODEL=y -CONFIG_ARCH_SUPPORTS_ACPI=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -CONFIG_ACPI_THERMAL_LIB=y -# CONFIG_ACPI_DEBUGGER is not set -CONFIG_ACPI_SPCR_TABLE=y -CONFIG_ACPI_FPDT=y -CONFIG_ACPI_LPIT=y -CONFIG_ACPI_SLEEP=y -CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -CONFIG_ACPI_EC_DEBUGFS=m -CONFIG_ACPI_AC=y -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=y -CONFIG_ACPI_VIDEO=m -CONFIG_ACPI_FAN=y -CONFIG_ACPI_TAD=m -# CONFIG_ACPI_DOCK is not set -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y -CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ACPI_PLATFORM_PROFILE=m -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -# CONFIG_ACPI_TABLE_UPGRADE is not set -# CONFIG_ACPI_DEBUG is not set -CONFIG_ACPI_PCI_SLOT=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -CONFIG_ACPI_BGRT=y -CONFIG_ACPI_NHLT=y -# CONFIG_ACPI_NFIT is not set -CONFIG_ACPI_NUMA=y -# CONFIG_ACPI_HMAT is not set -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -# CONFIG_ACPI_APEI_EINJ is not set -# CONFIG_ACPI_APEI_ERST_DEBUG is not set -# CONFIG_ACPI_DPTF is not set -CONFIG_ACPI_EXTLOG=m -CONFIG_ACPI_CONFIGFS=m -# CONFIG_ACPI_PFRUT is not set -CONFIG_ACPI_PCC=y -# CONFIG_ACPI_FFH is not set -CONFIG_PMIC_OPREGION=y -CONFIG_ACPI_VIOT=y -CONFIG_ACPI_PRMT=y -CONFIG_X86_PM_TIMER=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m -CONFIG_CPU_FREQ_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=y -CONFIG_X86_AMD_PSTATE=y -CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=3 -CONFIG_X86_AMD_PSTATE_UT=m -CONFIG_X86_ACPI_CPUFREQ=y -# CONFIG_X86_ACPI_CPUFREQ_CPB is not set -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -# CONFIG_X86_P4_CLOCKMOD is not set - -# -# shared options -# -# end of CPU Frequency scaling - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -CONFIG_CPU_IDLE_GOV_TEO=y -CONFIG_CPU_IDLE_GOV_HALTPOLL=y -CONFIG_HALTPOLL_CPUIDLE=y -# end of CPU Idle - -CONFIG_INTEL_IDLE=y -# end of Power management and ACPI options - -# -# Bus options (PCI etc.) -# -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_MMCONF_FAM10H=y -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -# end of Bus options (PCI etc.) - -# -# Binary Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_IA32_EMULATION_DEFAULT_DISABLED is not set -# CONFIG_X86_X32_ABI is not set -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -# end of Binary Emulations - -CONFIG_KVM_COMMON=y -CONFIG_HAVE_KVM_PFNCACHE=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_DIRTY_RING=y -CONFIG_HAVE_KVM_DIRTY_RING_TSO=y -CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_READONLY_MEM=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY=y -CONFIG_KVM_COMPAT=y -CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_HAVE_KVM_NO_POLL=y -CONFIG_KVM_XFER_TO_GUEST_WORK=y -CONFIG_HAVE_KVM_PM_NOTIFIER=y -CONFIG_KVM_GENERIC_HARDWARE_ENABLING=y -CONFIG_KVM_GENERIC_MMU_NOTIFIER=y -CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES=y -CONFIG_KVM_PRIVATE_MEM=y -CONFIG_KVM_GENERIC_PRIVATE_MEM=y -CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE=y -CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM_X86=m -CONFIG_KVM=m -CONFIG_KVM_WERROR=y -CONFIG_KVM_INTEL=m -CONFIG_KVM_AMD=m -CONFIG_KVM_AMD_SEV=y -CONFIG_KVM_SMM=y -# CONFIG_KVM_HYPERV is not set -# CONFIG_KVM_XEN is not set -CONFIG_KVM_EXTERNAL_WRITE_TRACKING=y -CONFIG_KVM_MAX_NR_VCPUS=1024 -CONFIG_AS_AVX512=y -CONFIG_AS_SHA1_NI=y -CONFIG_AS_SHA256_NI=y -CONFIG_AS_TPAUSE=y -CONFIG_AS_GFNI=y -CONFIG_AS_VAES=y -CONFIG_AS_VPCLMULQDQ=y -CONFIG_AS_WRUSS=y -CONFIG_ARCH_CONFIGURES_CPU_MITIGATIONS=y - -# -# General architecture-dependent options -# -CONFIG_HOTPLUG_SMT=y -CONFIG_HOTPLUG_CORE_SYNC=y -CONFIG_HOTPLUG_CORE_SYNC_DEAD=y -CONFIG_HOTPLUG_CORE_SYNC_FULL=y -CONFIG_HOTPLUG_SPLIT_STARTUP=y -CONFIG_HOTPLUG_PARALLEL=y -CONFIG_GENERIC_ENTRY=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -# CONFIG_STATIC_CALL_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_KPROBES_ON_FTRACE=y -CONFIG_UPROBES=y -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_KRETPROBE_ON_RETHOOK=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE=y -CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y -CONFIG_HAVE_NMI=y -CONFIG_TRACE_IRQFLAGS_SUPPORT=y -CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_HAS_SET_DIRECT_MAP=y -CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y -CONFIG_ARCH_HAS_CPU_PASID=y -CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_ARCH_WANTS_NO_INSTR=y -CONFIG_HAVE_ASM_MODVERSIONS=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_RSEQ=y -CONFIG_HAVE_RUST=y -CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y -CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y -CONFIG_MMU_GATHER_TABLE_FREE=y -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_MERGE_VMAS=y -CONFIG_MMU_LAZY_TLB_REFCOUNT=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES=y -CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y -CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP=y -CONFIG_SECCOMP_FILTER=y -# CONFIG_SECCOMP_CACHE_DEBUG is not set -CONFIG_HAVE_ARCH_STACKLEAK=y -CONFIG_HAVE_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_ARCH_SUPPORTS_LTO_CLANG=y -CONFIG_ARCH_SUPPORTS_LTO_CLANG_THIN=y -CONFIG_LTO_NONE=y -CONFIG_ARCH_SUPPORTS_CFI_CLANG=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING_USER=y -CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_MOVE_PUD=y -CONFIG_HAVE_MOVE_PMD=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_HAVE_ARCH_HUGE_VMALLOC=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_ARCH_WANT_PMD_MKWRITE=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y -CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK=y -CONFIG_SOFTIRQ_ON_OWN_STACK=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y -CONFIG_ARCH_MMAP_RND_BITS=32 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_PAGE_SIZE_4KB=y -CONFIG_PAGE_SIZE_4KB=y -CONFIG_PAGE_SIZE_LESS_THAN_64KB=y -CONFIG_PAGE_SIZE_LESS_THAN_256KB=y -CONFIG_PAGE_SHIFT=12 -CONFIG_HAVE_OBJTOOL=y -CONFIG_HAVE_JUMP_LABEL_HACK=y -CONFIG_HAVE_NOINSTR_HACK=y -CONFIG_HAVE_NOINSTR_VALIDATION=y -CONFIG_HAVE_UACCESS_VALIDATION=y -CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -CONFIG_COMPAT_32BIT_TIME=y -CONFIG_ARCH_SUPPORTS_RT=y -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET=y -CONFIG_RANDOMIZE_KSTACK_OFFSET=y -CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y -CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_LOCK_EVENT_COUNTS=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y -CONFIG_ARCH_HAS_CC_PLATFORM=y -CONFIG_HAVE_STATIC_CALL=y -CONFIG_HAVE_STATIC_CALL_INLINE=y -CONFIG_HAVE_PREEMPT_DYNAMIC=y -CONFIG_HAVE_PREEMPT_DYNAMIC_CALL=y -CONFIG_ARCH_WANT_LD_ORPHAN_WARN=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_ARCH_SUPPORTS_PAGE_TABLE_CHECK=y -CONFIG_ARCH_HAS_ELFCORE_COMPAT=y -CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH=y -CONFIG_DYNAMIC_SIGFRAME=y -CONFIG_ARCH_HAS_HW_PTE_YOUNG=y -CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y -CONFIG_ARCH_HAS_KERNEL_FPU_SUPPORT=y - -# -# GCOV-based kernel profiling -# -# CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# end of GCOV-based kernel profiling - -CONFIG_HAVE_GCC_PLUGINS=y -CONFIG_GCC_PLUGINS=y -CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y -CONFIG_FUNCTION_ALIGNMENT_4B=y -CONFIG_FUNCTION_ALIGNMENT_16B=y -CONFIG_FUNCTION_ALIGNMENT=16 -CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT=y -CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT=y -# end of General architecture-dependent options - -CONFIG_RT_MUTEXES=y -CONFIG_MODULE_SIG_FORMAT=y -CONFIG_MODULES=y -CONFIG_MODULE_DEBUGFS=y -# CONFIG_MODULE_DEBUG is not set -# CONFIG_MODULE_FORCE_LOAD is not set -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y -CONFIG_MODVERSIONS=y -CONFIG_ASM_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -CONFIG_MODULE_SIG_SHA1=y -# CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -# CONFIG_MODULE_SIG_SHA512 is not set -# CONFIG_MODULE_SIG_SHA3_256 is not set -# CONFIG_MODULE_SIG_SHA3_384 is not set -# CONFIG_MODULE_SIG_SHA3_512 is not set -CONFIG_MODULE_SIG_HASH="sha1" -CONFIG_MODULE_COMPRESS=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -# CONFIG_MODULE_COMPRESS_XZ is not set -CONFIG_MODULE_COMPRESS_ZSTD=y -CONFIG_MODULE_COMPRESS_ALL=y -CONFIG_MODULE_DECOMPRESS=y -# CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set -CONFIG_MODPROBE_PATH="/sbin/modprobe" -CONFIG_TRIM_UNUSED_KSYMS=y -CONFIG_UNUSED_KSYMS_WHITELIST="" -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLOCK_LEGACY_AUTOLOAD=y -CONFIG_BLK_RQ_ALLOC_TIME=y -CONFIG_BLK_CGROUP_RWSTAT=y -CONFIG_BLK_CGROUP_PUNT_BIO=y -CONFIG_BLK_DEV_BSG_COMMON=y -CONFIG_BLK_ICQ=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_WRITE_MOUNTED=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_WBT=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_CGROUP_IOPRIO=y -CONFIG_BLK_DEBUG_FS=y -# CONFIG_BLK_SED_OPAL is not set -CONFIG_BLK_INLINE_ENCRYPTION=y -CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -# CONFIG_AIX_PARTITION is not set -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -# CONFIG_MAC_PARTITION is not set -CONFIG_MSDOS_PARTITION=y -# CONFIG_BSD_DISKLABEL is not set -# CONFIG_MINIX_SUBPARTITION is not set -# CONFIG_SOLARIS_X86_PARTITION is not set -# CONFIG_UNIXWARE_DISKLABEL is not set -# CONFIG_LDM_PARTITION is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -# CONFIG_KARMA_PARTITION is not set -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -# CONFIG_CMDLINE_PARTITION is not set -# end of Partition Types - -CONFIG_BLK_MQ_PCI=y -CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_PM=y -CONFIG_BLOCK_HOLDER_DEPRECATED=y -CONFIG_BLK_MQ_STACKING=y - -# -# IO Schedulers -# -CONFIG_MQ_IOSCHED_DEADLINE=y -CONFIG_MQ_IOSCHED_KYBER=y -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_BFQ_CGROUP_DEBUG is not set -# end of IO Schedulers - -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y -CONFIG_ASN1=y -CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y -CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y -CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y -CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y -CONFIG_FREEZER=y - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_SCRIPT=y -# CONFIG_BINFMT_MISC is not set -CONFIG_COREDUMP=y -# end of Executable file formats - -# -# Memory Management options -# -CONFIG_ZPOOL=y -CONFIG_SWAP=y -CONFIG_ZSWAP=y -CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZSWAP_SHRINKER_DEFAULT_ON=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4 is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y -CONFIG_ZSWAP_COMPRESSOR_DEFAULT="zstd" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -# CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC=y -CONFIG_ZSWAP_ZPOOL_DEFAULT="zsmalloc" -CONFIG_ZBUD=y -# CONFIG_Z3FOLD_DEPRECATED is not set -CONFIG_ZSMALLOC=y -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_ZSMALLOC_CHAIN_SIZE=8 - -# -# Slab allocator options -# -CONFIG_SLUB=y -# CONFIG_SLAB_MERGE_DEFAULT is not set -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SLAB_BUCKETS=y -CONFIG_SLAB_CANARY=y -# CONFIG_SLUB_STATS is not set -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_RANDOM_KMALLOC_CACHES=y -# end of Slab allocator options - -CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -# CONFIG_COMPAT_BRK is not set -CONFIG_SPARSEMEM=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=y -CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=y -CONFIG_HAVE_GUP_FAST=y -CONFIG_NUMA_KEEP_MEMINFO=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_EXCLUSIVE_SYSTEM_RAM=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_MHP_MEMMAP_ON_MEMORY=y -CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y -CONFIG_SPLIT_PTE_PTLOCKS=y -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_SPLIT_PMD_PTLOCKS=y -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y -CONFIG_COMPACT_UNEVICTABLE_DEFAULT=1 -CONFIG_PAGE_REPORTING=y -CONFIG_MIGRATION=y -CONFIG_DEVICE_MIGRATION=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y -CONFIG_CONTIG_ALLOC=y -CONFIG_PCP_BATCH_SCALE_MAX=5 -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -# CONFIG_HWPOISON_INJECT is not set -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_TRANSPARENT_HUGEPAGE=y -# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set -CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y -# CONFIG_TRANSPARENT_HUGEPAGE_NEVER is not set -CONFIG_THP_SWAP=y -CONFIG_READ_ONLY_THP_FOR_FS=y -CONFIG_PGTABLE_HAS_HUGE_LEAVES=y -CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y -CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y -CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -# CONFIG_CMA is not set -# CONFIG_MEM_SOFT_DIRTY is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -CONFIG_PAGE_IDLE_FLAG=y -CONFIG_IDLE_PAGE_TRACKING=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_ARCH_HAS_CURRENT_STACK_POINTER=y -CONFIG_ARCH_HAS_PTE_DEVMAP=y -CONFIG_ZONE_DMA=y -CONFIG_ZONE_DMA32=y -CONFIG_ZONE_DEVICE=y -CONFIG_HMM_MIRROR=y -CONFIG_GET_FREE_REGION=y -CONFIG_DEVICE_PRIVATE=y -CONFIG_VMAP_PFN=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -CONFIG_ARCH_USES_PG_ARCH_2=y -CONFIG_VM_EVENT_COUNTERS=y -# CONFIG_PERCPU_STATS is not set -# CONFIG_GUP_TEST is not set -# CONFIG_DMAPOOL_TEST is not set -CONFIG_ARCH_HAS_PTE_SPECIAL=y -CONFIG_MEMFD_CREATE=y -CONFIG_SECRETMEM=y -CONFIG_ANON_VMA_NAME=y -# CONFIG_USERFAULTFD is not set -CONFIG_LRU_GEN=y -CONFIG_LRU_GEN_ENABLED=y -# CONFIG_LRU_GEN_STATS is not set -CONFIG_LRU_GEN_WALKS_MMU=y -CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y -CONFIG_PER_VMA_LOCK=y -CONFIG_LOCK_MM_AND_FIND_VMA=y -CONFIG_IOMMU_MM_DATA=y -CONFIG_EXECMEM=y -CONFIG_NUMA_MEMBLKS=y -# CONFIG_NUMA_EMU is not set - -# -# Data Access Monitoring -# -# CONFIG_DAMON is not set -# end of Data Access Monitoring -# end of Memory Management options - -CONFIG_NET=y -CONFIG_NET_INGRESS=y -CONFIG_NET_EGRESS=y -CONFIG_NET_XGRESS=y -CONFIG_SKB_DECRYPTED=y -CONFIG_SKB_EXTENSIONS=y -CONFIG_NET_DEVMEM=y - -# -# Networking options -# -CONFIG_PACKET=m -CONFIG_PACKET_DIAG=m -CONFIG_UNIX=y -CONFIG_AF_UNIX_OOB=y -CONFIG_UNIX_DIAG=y -CONFIG_TLS=m -CONFIG_TLS_DEVICE=y -# CONFIG_TLS_TOE is not set -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -# CONFIG_XFRM_USER_COMPAT is not set -CONFIG_XFRM_INTERFACE=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_AH=m -CONFIG_XFRM_ESP=m -CONFIG_XFRM_IPCOMP=m -CONFIG_NET_KEY=m -# CONFIG_NET_KEY_MIGRATE is not set -CONFIG_XDP_SOCKETS=y -CONFIG_XDP_SOCKETS_DIAG=m -CONFIG_NET_HANDSHAKE=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -# CONFIG_IP_FIB_TRIE_STATS is not set -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -# CONFIG_IP_ROUTE_VERBOSE is not set -CONFIG_IP_ROUTE_CLASSID=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IP_PNP_BOOTP is not set -# CONFIG_IP_PNP_RARP is not set -CONFIG_NET_IPIP=m -# CONFIG_NET_IPGRE_DEMUX is not set -CONFIG_NET_IP_TUNNEL=m -CONFIG_IP_MROUTE_COMMON=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -# CONFIG_NET_IPVTI is not set -CONFIG_NET_UDP_TUNNEL=m -# CONFIG_NET_FOU is not set -# CONFIG_NET_FOU_IP_TUNNELS is not set -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -# CONFIG_INET_ESPINTCP is not set -CONFIG_INET_IPCOMP=m -CONFIG_INET_TABLE_PERTURB_ORDER=16 -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -# CONFIG_INET_DIAG is not set -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=y -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=y -# CONFIG_TCP_CONG_NV is not set -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=y -CONFIG_TCP_CONG_ILLINOIS=m -# CONFIG_TCP_CONG_DCTCP is not set -# CONFIG_TCP_CONG_CDG is not set -CONFIG_TCP_CONG_BBR=y -# CONFIG_DEFAULT_CUBIC is not set -# CONFIG_DEFAULT_VEGAS is not set -CONFIG_DEFAULT_BBR=y -# CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="bbr" -CONFIG_TCP_SIGPOOL=y -# CONFIG_TCP_AO is not set -CONFIG_TCP_MD5SIG=y -# CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON is not set -CONFIG_IPV6=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -# CONFIG_INET6_ESPINTCP is not set -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -# CONFIG_IPV6_VTI is not set -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -# CONFIG_IPV6_PIMSM_V2 is not set -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -CONFIG_IPV6_SEG6_BPF=y -# CONFIG_IPV6_RPL_LWTUNNEL is not set -CONFIG_IPV6_IOAM6_LWTUNNEL=y -# CONFIG_NETLABEL is not set -# CONFIG_MPTCP is not set -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -# CONFIG_NETWORK_PHY_TIMESTAMPING is not set -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_EGRESS=y -CONFIG_NETFILTER_SKIP_EGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_FAMILY_BRIDGE=y -CONFIG_NETFILTER_FAMILY_ARP=y -CONFIG_NETFILTER_BPF_LINK=y -CONFIG_NETFILTER_NETLINK_HOOK=m -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NETFILTER_NETLINK_OSF=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_SYSLOG=m -CONFIG_NETFILTER_CONNCOUNT=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CONNTRACK_PROCFS is not set -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CONNTRACK_OVS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=y -CONFIG_NF_NAT_MASQUERADE=y -CONFIG_NF_NAT_OVS=y -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=y -CONFIG_NF_TABLES_NETDEV=y -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NFT_CONNLIMIT=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_NAT=m -CONFIG_NFT_TUNNEL=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NFT_XFRM=m -CONFIG_NFT_SOCKET=m -CONFIG_NFT_OSF=m -CONFIG_NFT_TPROXY=m -CONFIG_NFT_SYNPROXY=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NFT_REJECT_NETDEV=m -CONFIG_NF_FLOW_TABLE_INET=m -CONFIG_NF_FLOW_TABLE=m -CONFIG_NF_FLOW_TABLE_PROCFS=y -CONFIG_NETFILTER_XTABLES=m -CONFIG_NETFILTER_XTABLES_COMPAT=y - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -# CONFIG_NETFILTER_XT_TARGET_NOTRACK is not set -CONFIG_NETFILTER_XT_TARGET_RATEEST=m -CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -# end of Core Netfilter Configuration - -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -# CONFIG_IP_VS is not set - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_IP_NF_IPTABLES_LEGACY=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TPROXY_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_NFT_COMPAT_ARP=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# end of IP: Netfilter Configuration - -# -# IPv6: Netfilter Configuration -# -CONFIG_IP6_NF_IPTABLES_LEGACY=m -CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TPROXY_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_SRH=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m -# end of IPv6: Netfilter Configuration - -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_CONNTRACK_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_NFLOG=m -# CONFIG_IP_DCCP is not set -# CONFIG_IP_SCTP is not set -# CONFIG_RDS is not set -# CONFIG_TIPC is not set -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -# CONFIG_BRIDGE_VLAN_FILTERING is not set -# CONFIG_BRIDGE_MRP is not set -CONFIG_BRIDGE_CFM=y -# CONFIG_NET_DSA is not set -CONFIG_VLAN_8021Q=m -# CONFIG_VLAN_8021Q_GVRP is not set -# CONFIG_VLAN_8021Q_MVRP is not set -CONFIG_LLC=m -# CONFIG_LLC2 is not set -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_PHONET is not set -# CONFIG_6LOWPAN is not set -# CONFIG_IEEE802154 is not set -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_PRIO=m -# CONFIG_NET_SCH_MULTIQ is not set -CONFIG_NET_SCH_RED=m -# CONFIG_NET_SCH_SFB is not set -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -# CONFIG_NET_SCH_CBS is not set -CONFIG_NET_SCH_ETF=m -CONFIG_NET_SCH_MQPRIO_LIB=m -CONFIG_NET_SCH_TAPRIO=m -CONFIG_NET_SCH_GRED=m -# CONFIG_NET_SCH_NETEM is not set -# CONFIG_NET_SCH_DRR is not set -# CONFIG_NET_SCH_MQPRIO is not set -CONFIG_NET_SCH_SKBPRIO=m -# CONFIG_NET_SCH_CHOKE is not set -CONFIG_NET_SCH_QFQ=m -# CONFIG_NET_SCH_CODEL is not set -CONFIG_NET_SCH_FQ_CODEL=y -CONFIG_NET_SCH_CAKE=m -CONFIG_NET_SCH_FQ=m -# CONFIG_NET_SCH_HHF is not set -# CONFIG_NET_SCH_PIE is not set -CONFIG_NET_SCH_INGRESS=m -# CONFIG_NET_SCH_PLUG is not set -# CONFIG_NET_SCH_ETS is not set -CONFIG_NET_SCH_DEFAULT=y -# CONFIG_DEFAULT_FQ is not set -# CONFIG_DEFAULT_FQ_CODEL is not set -# CONFIG_DEFAULT_SFQ is not set -CONFIG_DEFAULT_PFIFO_FAST=y -CONFIG_DEFAULT_NET_SCH="pfifo_fast" - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=y -CONFIG_NET_CLS_BPF=m -# CONFIG_NET_CLS_FLOWER is not set -# CONFIG_NET_CLS_MATCHALL is not set -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m -CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -# CONFIG_NET_EMATCH_IPSET is not set -CONFIG_NET_EMATCH_IPT=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -# CONFIG_NET_ACT_SAMPLE is not set -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -# CONFIG_NET_ACT_SIMP is not set -# CONFIG_NET_ACT_SKBEDIT is not set -# CONFIG_NET_ACT_CSUM is not set -# CONFIG_NET_ACT_MPLS is not set -# CONFIG_NET_ACT_VLAN is not set -CONFIG_NET_ACT_BPF=m -# CONFIG_NET_ACT_CONNMARK is not set -CONFIG_NET_ACT_CTINFO=m -# CONFIG_NET_ACT_SKBMOD is not set -# CONFIG_NET_ACT_IFE is not set -# CONFIG_NET_ACT_TUNNEL_KEY is not set -CONFIG_NET_ACT_CT=m -# CONFIG_NET_ACT_GATE is not set -# CONFIG_NET_TC_SKB_EXT is not set -CONFIG_NET_SCH_FIFO=y -# CONFIG_DCB is not set -CONFIG_DNS_RESOLVER=m -CONFIG_BATMAN_ADV=m -# CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -# CONFIG_BATMAN_ADV_DEBUG is not set -# CONFIG_BATMAN_ADV_TRACING is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_VXLAN=m -# CONFIG_VSOCKETS is not set -CONFIG_NETLINK_DIAG=y -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -# CONFIG_MPLS_ROUTING is not set -CONFIG_NET_NSH=m -# CONFIG_HSR is not set -# CONFIG_NET_SWITCHDEV is not set -# CONFIG_NET_L3_MASTER_DEV is not set -# CONFIG_QRTR is not set -# CONFIG_NET_NCSI is not set -CONFIG_PCPU_DEV_REFCNT=y -CONFIG_MAX_SKB_FRAGS=17 -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_SOCK_RX_QUEUE_MAPPING=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_STREAM_PARSER=y -CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set -# end of Network testing -# end of Networking options - -# CONFIG_HAMRADIO is not set -# CONFIG_CAN is not set -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -# CONFIG_BT_BNEP is not set -# CONFIG_BT_HIDP is not set -CONFIG_BT_LE=y -CONFIG_BT_LE_L2CAP_ECRED=y -# CONFIG_BT_LEDS is not set -CONFIG_BT_MSFTEXT=y -CONFIG_BT_AOSPEXT=y -CONFIG_BT_DEBUGFS=y -# CONFIG_BT_SELFTEST is not set -# CONFIG_BT_FEATURE_DEBUG is not set - -# -# Bluetooth device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_MTK=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y -CONFIG_BT_HCIBTUSB_POLL_SYNC=y -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_MTK=y -CONFIG_BT_HCIBTUSB_RTL=y -# CONFIG_BT_HCIBTSDIO is not set -# CONFIG_BT_HCIUART is not set -# CONFIG_BT_HCIBCM203X is not set -# CONFIG_BT_HCIBCM4377 is not set -# CONFIG_BT_HCIBPA10X is not set -# CONFIG_BT_HCIBFUSB is not set -# CONFIG_BT_HCIVHCI is not set -# CONFIG_BT_MRVL is not set -CONFIG_BT_ATH3K=m -# CONFIG_BT_MTKSDIO is not set -CONFIG_BT_VIRTIO=m -# CONFIG_BT_INTEL_PCIE is not set -# end of Bluetooth device drivers - -CONFIG_AF_RXRPC=m -# CONFIG_AF_RXRPC_IPV6 is not set -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -# CONFIG_AF_RXRPC_INJECT_RX_DELAY is not set -# CONFIG_AF_RXRPC_DEBUG is not set -# CONFIG_RXKAD is not set -# CONFIG_RXPERF is not set -# CONFIG_AF_KCM is not set -CONFIG_STREAM_PARSER=y -CONFIG_MCTP=y -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_CFG80211=m -# CONFIG_NL80211_TESTMODE is not set -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y -CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y -CONFIG_CFG80211_DEFAULT_PS=y -# CONFIG_CFG80211_DEBUGFS is not set -CONFIG_CFG80211_CRDA_SUPPORT=y -# CONFIG_CFG80211_WEXT is not set -CONFIG_MAC80211=m -CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -# CONFIG_NET_9P is not set -# CONFIG_CAIF is not set -CONFIG_CEPH_LIB=m -# CONFIG_CEPH_LIB_PRETTYDEBUG is not set -# CONFIG_CEPH_LIB_USE_DNS_RESOLVER is not set -# CONFIG_NFC is not set -# CONFIG_PSAMPLE is not set -# CONFIG_NET_IFE is not set -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SELFTESTS=m -CONFIG_NET_SOCK_MSG=y -CONFIG_PAGE_POOL=y -# CONFIG_PAGE_POOL_STATS is not set -CONFIG_FAILOVER=m -CONFIG_ETHTOOL_NETLINK=y - -# -# Device Drivers -# -CONFIG_HAVE_EISA=y -# CONFIG_EISA is not set -CONFIG_HAVE_PCI=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_PCI=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_PCIEAER=y -# CONFIG_PCIEAER_INJECT is not set -CONFIG_PCIE_ECRC=y -CONFIG_PCIEASPM=y -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -# CONFIG_PCIE_EDR is not set -CONFIG_PCI_MSI=y -CONFIG_PCI_QUIRKS=y -# CONFIG_PCI_DEBUG is not set -CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=m -# CONFIG_PCI_PF_STUB is not set -CONFIG_PCI_ATS=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -# CONFIG_PCI_P2PDMA is not set -CONFIG_PCI_LABEL=y -CONFIG_VGA_ARB=y -CONFIG_VGA_ARB_MAX_GPUS=16 -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -# CONFIG_HOTPLUG_PCI_CPCI is not set -# CONFIG_HOTPLUG_PCI_SHPC is not set - -# -# PCI controller drivers -# -# CONFIG_VMD is not set - -# -# Cadence-based PCIe controllers -# -# end of Cadence-based PCIe controllers - -# -# DesignWare-based PCIe controllers -# -# CONFIG_PCI_MESON is not set -# CONFIG_PCIE_DW_PLAT_HOST is not set -# end of DesignWare-based PCIe controllers - -# -# Mobiveil-based PCIe controllers -# -# end of Mobiveil-based PCIe controllers - -# -# PLDA-based PCIe controllers -# -# end of PLDA-based PCIe controllers -# end of PCI controller drivers - -# -# PCI Endpoint -# -# CONFIG_PCI_ENDPOINT is not set -# end of PCI Endpoint - -# -# PCI switch controller drivers -# -# CONFIG_PCI_SW_SWITCHTEC is not set -# end of PCI switch controller drivers - -# CONFIG_CXL_BUS is not set -# CONFIG_PCCARD is not set -# CONFIG_RAPIDIO is not set - -# -# Generic Driver Options -# -CONFIG_AUXILIARY_BUS=y -CONFIG_UEVENT_HELPER=y -CONFIG_UEVENT_HELPER_PATH="" -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_DEVTMPFS_SAFE=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y - -# -# Firmware loader -# -CONFIG_FW_LOADER=y -CONFIG_FW_LOADER_PAGED_BUF=y -CONFIG_FW_LOADER_SYSFS=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_FW_LOADER_USER_HELPER is not set -CONFIG_FW_LOADER_COMPRESS=y -CONFIG_FW_LOADER_COMPRESS_XZ=y -CONFIG_FW_LOADER_COMPRESS_ZSTD=y -CONFIG_FW_CACHE=y -CONFIG_FW_UPLOAD=y -# end of Firmware loader - -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set -CONFIG_GENERIC_CPU_DEVICES=y -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=y -CONFIG_REGMAP_SPI=y -CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# CONFIG_FW_DEVLINK_SYNC_STATE_TIMEOUT is not set -# end of Generic Driver Options - -# -# Bus devices -# -# CONFIG_MHI_BUS is not set -# CONFIG_MHI_BUS_EP is not set -# end of Bus devices - -# -# Cache Drivers -# -# end of Cache Drivers - -CONFIG_CONNECTOR=y -CONFIG_PROC_EVENTS=y - -# -# Firmware Drivers -# - -# -# ARM System Control and Management Interface Protocol -# -# end of ARM System Control and Management Interface Protocol - -# CONFIG_EDD is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -# CONFIG_ISCSI_IBFT is not set -# CONFIG_FW_CFG_SYSFS is not set -CONFIG_SYSFB=y -CONFIG_SYSFB_SIMPLEFB=y -# CONFIG_GOOGLE_FIRMWARE is not set - -# -# EFI (Extensible Firmware Interface) Support -# -CONFIG_EFI_ESRT=y -CONFIG_EFI_VARS_PSTORE=y -# CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE is not set -CONFIG_EFI_DXE_MEM_ATTRIBUTES=y -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_BOOTLOADER_CONTROL=m -CONFIG_EFI_CAPSULE_LOADER=m -# CONFIG_EFI_TEST is not set -CONFIG_EFI_DEV_PATH_PARSER=y -CONFIG_APPLE_PROPERTIES=y -CONFIG_RESET_ATTACK_MITIGATION=y -CONFIG_EFI_RCI2_TABLE=y -# CONFIG_EFI_DISABLE_PCI_DMA is not set -CONFIG_EFI_EARLYCON=y -# CONFIG_EFI_CUSTOM_SSDT_OVERLAYS is not set -# CONFIG_EFI_DISABLE_RUNTIME is not set -CONFIG_EFI_COCO_SECRET=y -# end of EFI (Extensible Firmware Interface) Support - -CONFIG_UEFI_CPER=y -CONFIG_UEFI_CPER_X86=y - -# -# Qualcomm firmware drivers -# -# end of Qualcomm firmware drivers - -# -# Tegra firmware driver -# -# end of Tegra firmware driver -# end of Firmware Drivers - -# CONFIG_GNSS is not set -# CONFIG_MTD is not set -# CONFIG_OF is not set -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y -# CONFIG_PARPORT is not set -CONFIG_PNP=y -CONFIG_PNP_DEBUG_MESSAGES=y - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -# CONFIG_BLK_DEV_FD is not set -CONFIG_CDROM=m -# CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set -CONFIG_ZRAM=m -CONFIG_ZRAM_BACKEND_LZ4=y -CONFIG_ZRAM_BACKEND_LZ4HC=y -CONFIG_ZRAM_BACKEND_ZSTD=y -# CONFIG_ZRAM_BACKEND_DEFLATE is not set -# CONFIG_ZRAM_BACKEND_842 is not set -CONFIG_ZRAM_BACKEND_LZO=y -# CONFIG_ZRAM_DEF_COMP_LZORLE is not set -# CONFIG_ZRAM_DEF_COMP_LZO is not set -# CONFIG_ZRAM_DEF_COMP_LZ4 is not set -# CONFIG_ZRAM_DEF_COMP_LZ4HC is not set -CONFIG_ZRAM_DEF_COMP_ZSTD=y -CONFIG_ZRAM_DEF_COMP="zstd" -CONFIG_ZRAM_WRITEBACK=y -CONFIG_ZRAM_TRACK_ENTRY_ACTIME=y -CONFIG_ZRAM_MEMORY_TRACKING=y -CONFIG_ZRAM_MULTI_COMP=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=8192 -# CONFIG_CDROM_PKTCDVD is not set -# CONFIG_ATA_OVER_ETH is not set -CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_UBLK=m -# CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is not set - -# -# NVME Support -# -CONFIG_NVME_CORE=y -CONFIG_BLK_DEV_NVME=y -# CONFIG_NVME_MULTIPATH is not set -# CONFIG_NVME_VERBOSE_ERRORS is not set -CONFIG_NVME_HWMON=y -# CONFIG_NVME_FC is not set -# CONFIG_NVME_TCP is not set -# CONFIG_NVME_HOST_AUTH is not set -# CONFIG_NVME_TARGET is not set -# end of NVME Support - -# -# Misc devices -# -# CONFIG_AD525X_DPOT is not set -# CONFIG_DUMMY_IRQ is not set -# CONFIG_IBM_ASM is not set -# CONFIG_PHANTOM is not set -# CONFIG_RPMB is not set -# CONFIG_TIFM_CORE is not set -# CONFIG_ICS932S401 is not set -CONFIG_ENCLOSURE_SERVICES=m -# CONFIG_HP_ILO is not set -# CONFIG_APDS9802ALS is not set -# CONFIG_ISL29003 is not set -# CONFIG_ISL29020 is not set -# CONFIG_SENSORS_TSL2550 is not set -# CONFIG_SENSORS_BH1770 is not set -# CONFIG_SENSORS_APDS990X is not set -# CONFIG_HMC6352 is not set -# CONFIG_DS1682 is not set -# CONFIG_LATTICE_ECP3_CONFIG is not set -# CONFIG_SRAM is not set -# CONFIG_DW_XDATA_PCIE is not set -# CONFIG_PCI_ENDPOINT_TEST is not set -# CONFIG_XILINX_SDFEC is not set -# CONFIG_NSM is not set -# CONFIG_C2PORT is not set - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -# CONFIG_EEPROM_AT25 is not set -# CONFIG_EEPROM_MAX6875 is not set -CONFIG_EEPROM_93CX6=m -# CONFIG_EEPROM_93XX46 is not set -# CONFIG_EEPROM_IDT_89HPESX is not set -CONFIG_EEPROM_EE1004=m -# end of EEPROM support - -# CONFIG_CB710_CORE is not set - -# -# Texas Instruments shared transport line discipline -# -# end of Texas Instruments shared transport line discipline - -# CONFIG_SENSORS_LIS3_I2C is not set -# CONFIG_ALTERA_STAPL is not set -# CONFIG_INTEL_MEI is not set -# CONFIG_VMWARE_VMCI is not set -# CONFIG_GENWQE is not set -# CONFIG_ECHO is not set -# CONFIG_BCM_VK is not set -# CONFIG_MISC_ALCOR_PCI is not set -# CONFIG_MISC_RTSX_PCI is not set -# CONFIG_MISC_RTSX_USB is not set -# CONFIG_UACCE is not set -# CONFIG_PVPANIC is not set -# CONFIG_KEBA_CP500 is not set -# end of Misc devices - -# -# SCSI device support -# -CONFIG_SCSI_MOD=y -# CONFIG_RAID_ATTRS is not set -CONFIG_SCSI_COMMON=y -CONFIG_SCSI=y -CONFIG_SCSI_DMA=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -# CONFIG_CHR_DEV_ST is not set -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=y -CONFIG_BLK_DEV_BSG=y -# CONFIG_CHR_DEV_SCH is not set -CONFIG_SCSI_ENCLOSURE=m -# CONFIG_SCSI_CONSTANTS is not set -# CONFIG_SCSI_LOGGING is not set -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -# CONFIG_SCSI_SPI_ATTRS is not set -# CONFIG_SCSI_FC_ATTRS is not set -# CONFIG_SCSI_ISCSI_ATTRS is not set -# CONFIG_SCSI_SAS_ATTRS is not set -# CONFIG_SCSI_SAS_LIBSAS is not set -# CONFIG_SCSI_SRP_ATTRS is not set -# end of SCSI Transports - -CONFIG_SCSI_LOWLEVEL=y -# CONFIG_ISCSI_TCP is not set -# CONFIG_ISCSI_BOOT_SYSFS is not set -# CONFIG_SCSI_CXGB3_ISCSI is not set -# CONFIG_SCSI_CXGB4_ISCSI is not set -# CONFIG_SCSI_BNX2_ISCSI is not set -# CONFIG_BE2ISCSI is not set -# CONFIG_BLK_DEV_3W_XXXX_RAID is not set -# CONFIG_SCSI_HPSA is not set -# CONFIG_SCSI_3W_9XXX is not set -# CONFIG_SCSI_3W_SAS is not set -# CONFIG_SCSI_ACARD is not set -# CONFIG_SCSI_AACRAID is not set -# CONFIG_SCSI_AIC7XXX is not set -# CONFIG_SCSI_AIC79XX is not set -# CONFIG_SCSI_AIC94XX is not set -# CONFIG_SCSI_MVSAS is not set -# CONFIG_SCSI_MVUMI is not set -# CONFIG_SCSI_ADVANSYS is not set -# CONFIG_SCSI_ARCMSR is not set -# CONFIG_SCSI_ESAS2R is not set -# CONFIG_MEGARAID_NEWGEN is not set -# CONFIG_MEGARAID_LEGACY is not set -# CONFIG_MEGARAID_SAS is not set -# CONFIG_SCSI_MPT3SAS is not set -# CONFIG_SCSI_MPT2SAS is not set -# CONFIG_SCSI_MPI3MR is not set -# CONFIG_SCSI_SMARTPQI is not set -# CONFIG_SCSI_HPTIOP is not set -# CONFIG_SCSI_BUSLOGIC is not set -# CONFIG_SCSI_MYRB is not set -# CONFIG_SCSI_MYRS is not set -# CONFIG_VMWARE_PVSCSI is not set -# CONFIG_SCSI_SNIC is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_FDOMAIN_PCI is not set -# CONFIG_SCSI_ISCI is not set -# CONFIG_SCSI_IPS is not set -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_STEX is not set -# CONFIG_SCSI_SYM53C8XX_2 is not set -# CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_1280 is not set -# CONFIG_SCSI_QLA_ISCSI is not set -# CONFIG_SCSI_DC395x is not set -# CONFIG_SCSI_AM53C974 is not set -# CONFIG_SCSI_WD719X is not set -# CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_PMCRAID is not set -# CONFIG_SCSI_PM8001 is not set -CONFIG_SCSI_VIRTIO=m -# CONFIG_SCSI_DH is not set -# end of SCSI device support - -CONFIG_ATA=y -CONFIG_SATA_HOST=y -CONFIG_PATA_TIMINGS=y -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_FORCE=y -CONFIG_ATA_ACPI=y -# CONFIG_SATA_ZPODD is not set -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=y -CONFIG_SATA_MOBILE_LPM_POLICY=0 -CONFIG_SATA_AHCI_PLATFORM=m -# CONFIG_AHCI_DWC is not set -# CONFIG_SATA_INIC162X is not set -# CONFIG_SATA_ACARD_AHCI is not set -# CONFIG_SATA_SIL24 is not set -# CONFIG_ATA_SFF is not set -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -# CONFIG_MD_BITMAP_FILE is not set -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -CONFIG_BCACHE_ASYNC_REGISTRATION=y -CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -# CONFIG_DM_DEBUG is not set -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_UNSTRIPED=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_WRITECACHE=m -# CONFIG_DM_EBS is not set -# CONFIG_DM_ERA is not set -# CONFIG_DM_CLONE is not set -CONFIG_DM_MIRROR=m -# CONFIG_DM_LOG_USERSPACE is not set -CONFIG_DM_RAID=m -# CONFIG_DM_ZERO is not set -# CONFIG_DM_MULTIPATH is not set -# CONFIG_DM_DELAY is not set -CONFIG_DM_DUST=m -CONFIG_DM_UEVENT=y -# CONFIG_DM_FLAKEY is not set -CONFIG_DM_VERITY=m -CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DM_SWITCH=m -# CONFIG_DM_LOG_WRITES is not set -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_DM_AUDIT=y -CONFIG_DM_VDO=m -# CONFIG_TARGET_CORE is not set -# CONFIG_FUSION is not set - -# -# IEEE 1394 (FireWire) support -# -# CONFIG_FIREWIRE is not set -# CONFIG_FIREWIRE_NOSY is not set -# end of IEEE 1394 (FireWire) support - -# CONFIG_MACINTOSH_DRIVERS is not set -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -# CONFIG_BONDING is not set -CONFIG_DUMMY=m -CONFIG_WIREGUARD=m -# CONFIG_WIREGUARD_DEBUG is not set -# CONFIG_EQUALIZER is not set -# CONFIG_NET_FC is not set -# CONFIG_IFB is not set -# CONFIG_NET_TEAM is not set -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -# CONFIG_IPVLAN is not set -CONFIG_VXLAN=m -# CONFIG_GENEVE is not set -# CONFIG_BAREUDP is not set -# CONFIG_GTP is not set -CONFIG_PFCP=m -# CONFIG_AMT is not set -# CONFIG_MACSEC is not set -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -# CONFIG_NETCONSOLE_EXTENDED_LOG is not set -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -# CONFIG_NLMON is not set -CONFIG_NETKIT=y -# CONFIG_ARCNET is not set -# CONFIG_ATM_DRIVERS is not set -CONFIG_ETHERNET=y -# CONFIG_NET_VENDOR_3COM is not set -# CONFIG_NET_VENDOR_ADAPTEC is not set -# CONFIG_NET_VENDOR_AGERE is not set -# CONFIG_NET_VENDOR_ALACRITECH is not set -# CONFIG_NET_VENDOR_ALTEON is not set -# CONFIG_ALTERA_TSE is not set -CONFIG_NET_VENDOR_AMAZON=y -# CONFIG_ENA_ETHERNET is not set -# CONFIG_NET_VENDOR_AMD is not set -# CONFIG_NET_VENDOR_AQUANTIA is not set -# CONFIG_NET_VENDOR_ARC is not set -CONFIG_NET_VENDOR_ASIX=y -# CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_CX_ECAT is not set -CONFIG_NET_VENDOR_BROADCOM=y -# CONFIG_B44 is not set -# CONFIG_BCMGENET is not set -# CONFIG_BNX2 is not set -# CONFIG_CNIC is not set -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -# CONFIG_BNX2X is not set -# CONFIG_SYSTEMPORT is not set -# CONFIG_BNXT is not set -# CONFIG_NET_VENDOR_CADENCE is not set -# CONFIG_NET_VENDOR_CAVIUM is not set -# CONFIG_NET_VENDOR_CHELSIO is not set -# CONFIG_NET_VENDOR_CISCO is not set -# CONFIG_NET_VENDOR_CORTINA is not set -# CONFIG_NET_VENDOR_DAVICOM is not set -# CONFIG_DNET is not set -# CONFIG_NET_VENDOR_DEC is not set -# CONFIG_NET_VENDOR_DLINK is not set -# CONFIG_NET_VENDOR_EMULEX is not set -# CONFIG_NET_VENDOR_ENGLEDER is not set -# CONFIG_NET_VENDOR_EZCHIP is not set -# CONFIG_NET_VENDOR_FUNGIBLE is not set -# CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HUAWEI is not set -# CONFIG_NET_VENDOR_I825XX is not set -CONFIG_NET_VENDOR_INTEL=y -# CONFIG_E100 is not set -# CONFIG_E1000 is not set -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -# CONFIG_IXGBE is not set -# CONFIG_IXGBEVF is not set -# CONFIG_I40E is not set -# CONFIG_I40EVF is not set -# CONFIG_ICE is not set -# CONFIG_FM10K is not set -# CONFIG_IGC is not set -# CONFIG_IDPF is not set -# CONFIG_JME is not set -# CONFIG_NET_VENDOR_ADI is not set -# CONFIG_NET_VENDOR_LITEX is not set -# CONFIG_NET_VENDOR_MARVELL is not set -# CONFIG_NET_VENDOR_MELLANOX is not set -# CONFIG_NET_VENDOR_META is not set -# CONFIG_NET_VENDOR_MICREL is not set -# CONFIG_NET_VENDOR_MICROCHIP is not set -# CONFIG_NET_VENDOR_MICROSEMI is not set -# CONFIG_NET_VENDOR_MICROSOFT is not set -# CONFIG_NET_VENDOR_MYRI is not set -# CONFIG_FEALNX is not set -# CONFIG_NET_VENDOR_NI is not set -# CONFIG_NET_VENDOR_NATSEMI is not set -# CONFIG_NET_VENDOR_NETERION is not set -# CONFIG_NET_VENDOR_NETRONOME is not set -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -# CONFIG_NET_VENDOR_OKI is not set -# CONFIG_ETHOC is not set -# CONFIG_OA_TC6 is not set -# CONFIG_NET_VENDOR_PACKET_ENGINES is not set -# CONFIG_NET_VENDOR_PENSANDO is not set -# CONFIG_NET_VENDOR_QLOGIC is not set -# CONFIG_NET_VENDOR_BROCADE is not set -# CONFIG_NET_VENDOR_QUALCOMM is not set -# CONFIG_NET_VENDOR_RDC is not set -# CONFIG_NET_VENDOR_REALTEK is not set -# CONFIG_NET_VENDOR_RENESAS is not set -# CONFIG_NET_VENDOR_ROCKER is not set -# CONFIG_NET_VENDOR_SAMSUNG is not set -# CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SILAN is not set -# CONFIG_NET_VENDOR_SIS is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set -# CONFIG_NET_VENDOR_SMSC is not set -# CONFIG_NET_VENDOR_SOCIONEXT is not set -# CONFIG_NET_VENDOR_STMICRO is not set -# CONFIG_NET_VENDOR_SUN is not set -# CONFIG_NET_VENDOR_SYNOPSYS is not set -# CONFIG_NET_VENDOR_TEHUTI is not set -# CONFIG_NET_VENDOR_TI is not set -# CONFIG_NET_VENDOR_VERTEXCOM is not set -# CONFIG_NET_VENDOR_VIA is not set -# CONFIG_NET_VENDOR_WANGXUN is not set -# CONFIG_NET_VENDOR_WIZNET is not set -# CONFIG_NET_VENDOR_XILINX is not set -# CONFIG_FDDI is not set -# CONFIG_HIPPI is not set -CONFIG_PHYLINK=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y -# CONFIG_LED_TRIGGER_PHY is not set -CONFIG_FIXED_PHY=m -# CONFIG_SFP is not set - -# -# MII PHY device drivers -# -# CONFIG_AIR_EN8811H_PHY is not set -# CONFIG_AMD_PHY is not set -# CONFIG_ADIN_PHY is not set -# CONFIG_ADIN1100_PHY is not set -# CONFIG_AQUANTIA_PHY is not set -CONFIG_AX88796B_PHY=m -CONFIG_BROADCOM_PHY=m -# CONFIG_BCM54140_PHY is not set -# CONFIG_BCM7XXX_PHY is not set -# CONFIG_BCM84881_PHY is not set -# CONFIG_BCM87XX_PHY is not set -CONFIG_BCM_NET_PHYLIB=m -# CONFIG_CICADA_PHY is not set -# CONFIG_CORTINA_PHY is not set -# CONFIG_DAVICOM_PHY is not set -# CONFIG_ICPLUS_PHY is not set -# CONFIG_LXT_PHY is not set -# CONFIG_INTEL_XWAY_PHY is not set -# CONFIG_LSI_ET1011C_PHY is not set -# CONFIG_MARVELL_PHY is not set -# CONFIG_MARVELL_10G_PHY is not set -# CONFIG_MARVELL_88Q2XXX_PHY is not set -# CONFIG_MARVELL_88X2222_PHY is not set -# CONFIG_MAXLINEAR_GPHY is not set -# CONFIG_MEDIATEK_GE_PHY is not set -# CONFIG_MICREL_PHY is not set -# CONFIG_MICROCHIP_T1S_PHY is not set -# CONFIG_MICROCHIP_PHY is not set -# CONFIG_MICROCHIP_T1_PHY is not set -# CONFIG_MICROSEMI_PHY is not set -# CONFIG_MOTORCOMM_PHY is not set -# CONFIG_NATIONAL_PHY is not set -# CONFIG_NXP_CBTX_PHY is not set -# CONFIG_NXP_C45_TJA11XX_PHY is not set -# CONFIG_NXP_TJA11XX_PHY is not set -# CONFIG_NCN26000_PHY is not set -# CONFIG_QCA83XX_PHY is not set -# CONFIG_QCA808X_PHY is not set -# CONFIG_QSEMI_PHY is not set -# CONFIG_REALTEK_PHY is not set -# CONFIG_RENESAS_PHY is not set -# CONFIG_ROCKCHIP_PHY is not set -# CONFIG_SMSC_PHY is not set -# CONFIG_STE10XP is not set -# CONFIG_TERANETICS_PHY is not set -# CONFIG_DP83822_PHY is not set -# CONFIG_DP83TC811_PHY is not set -# CONFIG_DP83848_PHY is not set -# CONFIG_DP83867_PHY is not set -# CONFIG_DP83869_PHY is not set -# CONFIG_DP83TD510_PHY is not set -# CONFIG_DP83TG720_PHY is not set -# CONFIG_VITESSE_PHY is not set -# CONFIG_XILINX_GMII2RGMII is not set -# CONFIG_MICREL_KS8995MA is not set - -# -# MCTP Device Drivers -# -# CONFIG_MCTP_SERIAL is not set -# CONFIG_MCTP_TRANSPORT_I3C is not set -# end of MCTP Device Drivers - -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_FWNODE_MDIO=m -CONFIG_ACPI_MDIO=m -CONFIG_MDIO_DEVRES=m -# CONFIG_MDIO_BITBANG is not set -# CONFIG_MDIO_BCM_UNIMAC is not set -# CONFIG_MDIO_MVUSB is not set -# CONFIG_MDIO_THUNDER is not set - -# -# MDIO Multiplexers -# - -# -# PCS device drivers -# -# CONFIG_PCS_XPCS is not set -# end of PCS device drivers - -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_FILTER=y -CONFIG_PPP_MPPE=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPPOATM=m -CONFIG_PPPOE=m -# CONFIG_PPPOE_HASH_BITS_1 is not set -# CONFIG_PPPOE_HASH_BITS_2 is not set -CONFIG_PPPOE_HASH_BITS_4=y -# CONFIG_PPPOE_HASH_BITS_8 is not set -CONFIG_PPPOE_HASH_BITS=4 -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -# CONFIG_SLIP is not set -CONFIG_SLHC=m - -# -# Host-side USB support is needed for USB Network Adapter support -# -CONFIG_USB_NET_DRIVERS=m -# CONFIG_USB_CATC is not set -# CONFIG_USB_KAWETH is not set -# CONFIG_USB_PEGASUS is not set -# CONFIG_USB_RTL8150 is not set -# CONFIG_USB_RTL8152 is not set -# CONFIG_USB_LAN78XX is not set -CONFIG_USB_USBNET=m -CONFIG_USB_NET_AX8817X=m -CONFIG_USB_NET_AX88179_178A=m -CONFIG_USB_NET_CDCETHER=m -CONFIG_USB_NET_CDC_EEM=m -CONFIG_USB_NET_CDC_NCM=m -# CONFIG_USB_NET_HUAWEI_CDC_NCM is not set -# CONFIG_USB_NET_CDC_MBIM is not set -# CONFIG_USB_NET_DM9601 is not set -# CONFIG_USB_NET_SR9700 is not set -# CONFIG_USB_NET_SR9800 is not set -# CONFIG_USB_NET_SMSC75XX is not set -# CONFIG_USB_NET_SMSC95XX is not set -# CONFIG_USB_NET_GL620A is not set -# CONFIG_USB_NET_NET1080 is not set -# CONFIG_USB_NET_PLUSB is not set -# CONFIG_USB_NET_MCS7830 is not set -CONFIG_USB_NET_RNDIS_HOST=m -# CONFIG_USB_NET_CDC_SUBSET is not set -# CONFIG_USB_NET_ZAURUS is not set -# CONFIG_USB_NET_CX82310_ETH is not set -# CONFIG_USB_NET_KALMIA is not set -# CONFIG_USB_NET_QMI_WWAN is not set -# CONFIG_USB_HSO is not set -# CONFIG_USB_NET_INT51X1 is not set -# CONFIG_USB_IPHETH is not set -# CONFIG_USB_SIERRA_NET is not set -# CONFIG_USB_VL600 is not set -# CONFIG_USB_NET_CH9200 is not set -# CONFIG_USB_NET_AQC111 is not set -CONFIG_USB_RTL8153_ECM=m -CONFIG_WLAN=y -# CONFIG_WLAN_VENDOR_ADMTEK is not set -CONFIG_ATH_COMMON=m -CONFIG_WLAN_VENDOR_ATH=y -# CONFIG_ATH_DEBUG is not set -# CONFIG_ATH5K is not set -# CONFIG_ATH5K_PCI is not set -# CONFIG_ATH9K is not set -# CONFIG_ATH9K_HTC is not set -# CONFIG_CARL9170 is not set -# CONFIG_ATH6KL is not set -# CONFIG_AR5523 is not set -# CONFIG_WIL6210 is not set -CONFIG_ATH10K=m -CONFIG_ATH10K_CE=y -CONFIG_ATH10K_PCI=m -# CONFIG_ATH10K_SDIO is not set -# CONFIG_ATH10K_USB is not set -# CONFIG_ATH10K_DEBUG is not set -# CONFIG_ATH10K_DEBUGFS is not set -CONFIG_ATH10K_LEDS=y -# CONFIG_ATH10K_TRACING is not set -# CONFIG_WCN36XX is not set -# CONFIG_ATH11K is not set -# CONFIG_ATH12K is not set -# CONFIG_WLAN_VENDOR_ATMEL is not set -# CONFIG_WLAN_VENDOR_BROADCOM is not set -CONFIG_WLAN_VENDOR_INTEL=y -# CONFIG_IPW2100 is not set -# CONFIG_IPW2200 is not set -# CONFIG_IWL4965 is not set -# CONFIG_IWL3945 is not set -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLDVM=m -CONFIG_IWLMVM=m -CONFIG_IWLWIFI_OPMODE_MODULAR=y - -# -# Debugging Options -# -# CONFIG_IWLWIFI_DEBUG is not set -# CONFIG_IWLWIFI_DEVICE_TRACING is not set -# end of Debugging Options - -# CONFIG_WLAN_VENDOR_INTERSIL is not set -# CONFIG_WLAN_VENDOR_MARVELL is not set -CONFIG_WLAN_VENDOR_MEDIATEK=y -# CONFIG_MT7601U is not set -CONFIG_MT76_CORE=m -CONFIG_MT76_LEDS=y -CONFIG_MT76_USB=m -CONFIG_MT76x02_LIB=m -CONFIG_MT76x02_USB=m -CONFIG_MT76_CONNAC_LIB=m -CONFIG_MT792x_LIB=m -# CONFIG_MT76x0U is not set -# CONFIG_MT76x0E is not set -CONFIG_MT76x2_COMMON=m -# CONFIG_MT76x2E is not set -CONFIG_MT76x2U=m -# CONFIG_MT7603E is not set -# CONFIG_MT7615E is not set -# CONFIG_MT7663U is not set -# CONFIG_MT7663S is not set -# CONFIG_MT7915E is not set -CONFIG_MT7921_COMMON=m -CONFIG_MT7921E=m -# CONFIG_MT7921S is not set -# CONFIG_MT7921U is not set -# CONFIG_MT7996E is not set -# CONFIG_MT7925E is not set -# CONFIG_MT7925U is not set -# CONFIG_WLAN_VENDOR_MICROCHIP is not set -# CONFIG_WLAN_VENDOR_PURELIFI is not set -# CONFIG_WLAN_VENDOR_RALINK is not set -CONFIG_WLAN_VENDOR_REALTEK=y -# CONFIG_RTL8180 is not set -# CONFIG_RTL8187 is not set -CONFIG_RTL_CARDS=m -# CONFIG_RTL8192CE is not set -CONFIG_RTL8192SE=m -# CONFIG_RTL8192DE is not set -# CONFIG_RTL8723AE is not set -# CONFIG_RTL8723BE is not set -# CONFIG_RTL8188EE is not set -# CONFIG_RTL8192EE is not set -# CONFIG_RTL8821AE is not set -# CONFIG_RTL8192CU is not set -# CONFIG_RTL8192DU is not set -CONFIG_RTLWIFI=m -CONFIG_RTLWIFI_PCI=m -# CONFIG_RTLWIFI_DEBUG is not set -# CONFIG_RTL8XXXU is not set -# CONFIG_RTW88 is not set -# CONFIG_RTW89 is not set -# CONFIG_WLAN_VENDOR_RSI is not set -# CONFIG_WLAN_VENDOR_SILABS is not set -# CONFIG_WLAN_VENDOR_ST is not set -# CONFIG_WLAN_VENDOR_TI is not set -# CONFIG_WLAN_VENDOR_ZYDAS is not set -# CONFIG_WLAN_VENDOR_QUANTENNA is not set -# CONFIG_MAC80211_HWSIM is not set -# CONFIG_VIRT_WIFI is not set -# CONFIG_WAN is not set - -# -# Wireless WAN -# -# CONFIG_WWAN is not set -# end of Wireless WAN - -# CONFIG_VMXNET3 is not set -# CONFIG_FUJITSU_ES is not set -CONFIG_USB4_NET=m -# CONFIG_NETDEVSIM is not set -CONFIG_NET_FAILOVER=m -# CONFIG_ISDN is not set - -# -# Input device support -# -CONFIG_INPUT=y -CONFIG_INPUT_LEDS=m -CONFIG_INPUT_FF_MEMLESS=y -CONFIG_INPUT_SPARSEKMAP=m -# CONFIG_INPUT_MATRIXKMAP is not set -CONFIG_INPUT_VIVALDIFMAP=y - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -# CONFIG_INPUT_JOYDEV is not set -CONFIG_INPUT_EVDEV=y -# CONFIG_INPUT_EVBUG is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -# CONFIG_KEYBOARD_ADP5588 is not set -# CONFIG_KEYBOARD_ADP5589 is not set -# CONFIG_KEYBOARD_APPLESPI is not set -CONFIG_KEYBOARD_ATKBD=y -# CONFIG_KEYBOARD_QT1050 is not set -# CONFIG_KEYBOARD_QT1070 is not set -# CONFIG_KEYBOARD_QT2160 is not set -# CONFIG_KEYBOARD_DLINK_DIR685 is not set -# CONFIG_KEYBOARD_LKKBD is not set -# CONFIG_KEYBOARD_TCA6416 is not set -# CONFIG_KEYBOARD_TCA8418 is not set -# CONFIG_KEYBOARD_LM8323 is not set -# CONFIG_KEYBOARD_LM8333 is not set -# CONFIG_KEYBOARD_MAX7359 is not set -# CONFIG_KEYBOARD_MPR121 is not set -# CONFIG_KEYBOARD_NEWTON is not set -# CONFIG_KEYBOARD_OPENCORES is not set -# CONFIG_KEYBOARD_SAMSUNG is not set -# CONFIG_KEYBOARD_STOWAWAY is not set -# CONFIG_KEYBOARD_SUNKBD is not set -# CONFIG_KEYBOARD_TM2_TOUCHKEY is not set -CONFIG_KEYBOARD_XTKBD=m -# CONFIG_KEYBOARD_CYPRESS_SF is not set -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=y -CONFIG_MOUSE_PS2_ALPS=y -CONFIG_MOUSE_PS2_BYD=y -CONFIG_MOUSE_PS2_LOGIPS2PP=y -CONFIG_MOUSE_PS2_SYNAPTICS=y -CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y -CONFIG_MOUSE_PS2_CYPRESS=y -CONFIG_MOUSE_PS2_LIFEBOOK=y -CONFIG_MOUSE_PS2_TRACKPOINT=y -# CONFIG_MOUSE_PS2_ELANTECH is not set -# CONFIG_MOUSE_PS2_SENTELIC is not set -# CONFIG_MOUSE_PS2_TOUCHKIT is not set -CONFIG_MOUSE_PS2_FOCALTECH=y -# CONFIG_MOUSE_PS2_VMMOUSE is not set -CONFIG_MOUSE_PS2_SMBUS=y -# CONFIG_MOUSE_SERIAL is not set -# CONFIG_MOUSE_APPLETOUCH is not set -# CONFIG_MOUSE_BCM5974 is not set -# CONFIG_MOUSE_CYAPA is not set -# CONFIG_MOUSE_ELAN_I2C is not set -# CONFIG_MOUSE_VSXXXAA is not set -CONFIG_MOUSE_SYNAPTICS_I2C=m -CONFIG_MOUSE_SYNAPTICS_USB=m -# CONFIG_INPUT_JOYSTICK is not set -# CONFIG_INPUT_TABLET is not set -# CONFIG_INPUT_TOUCHSCREEN is not set -CONFIG_INPUT_MISC=y -# CONFIG_INPUT_AD714X is not set -# CONFIG_INPUT_BMA150 is not set -# CONFIG_INPUT_E3X0_BUTTON is not set -# CONFIG_INPUT_PCSPKR is not set -# CONFIG_INPUT_MMA8450 is not set -# CONFIG_INPUT_APANEL is not set -# CONFIG_INPUT_ATLAS_BTNS is not set -# CONFIG_INPUT_ATI_REMOTE2 is not set -# CONFIG_INPUT_KEYSPAN_REMOTE is not set -# CONFIG_INPUT_KXTJ9 is not set -# CONFIG_INPUT_POWERMATE is not set -# CONFIG_INPUT_YEALINK is not set -# CONFIG_INPUT_CM109 is not set -CONFIG_INPUT_UINPUT=m -# CONFIG_INPUT_PCF8574 is not set -# CONFIG_INPUT_PWM_BEEPER is not set -# CONFIG_INPUT_PWM_VIBRA is not set -# CONFIG_INPUT_DA7280_HAPTICS is not set -# CONFIG_INPUT_ADXL34X is not set -# CONFIG_INPUT_IMS_PCU is not set -# CONFIG_INPUT_IQS269A is not set -# CONFIG_INPUT_IQS626A is not set -# CONFIG_INPUT_IQS7222 is not set -# CONFIG_INPUT_CMA3000 is not set -# CONFIG_INPUT_IDEAPAD_SLIDEBAR is not set -# CONFIG_INPUT_DRV2665_HAPTICS is not set -# CONFIG_INPUT_DRV2667_HAPTICS is not set -CONFIG_RMI4_CORE=m -# CONFIG_RMI4_I2C is not set -# CONFIG_RMI4_SPI is not set -# CONFIG_RMI4_SMB is not set -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -# CONFIG_RMI4_F34 is not set -# CONFIG_RMI4_F3A is not set -# CONFIG_RMI4_F54 is not set -# CONFIG_RMI4_F55 is not set - -# -# Hardware I/O ports -# -CONFIG_SERIO=y -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=y -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=y -# CONFIG_SERIO_RAW is not set -# CONFIG_SERIO_ALTERA_PS2 is not set -# CONFIG_SERIO_PS2MULT is not set -# CONFIG_SERIO_ARC_PS2 is not set -# CONFIG_USERIO is not set -# CONFIG_GAMEPORT is not set -# end of Hardware I/O ports -# end of Input device support - -# -# Character devices -# -CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set -# CONFIG_LEGACY_TIOCSTI is not set -# CONFIG_LDISC_AUTOLOAD is not set - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_PNP=y -# CONFIG_SERIAL_8250_16550A_VARIANTS is not set -# CONFIG_SERIAL_8250_FINTEK is not set -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCILIB=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=y -CONFIG_SERIAL_8250_NR_UARTS=4 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -# CONFIG_SERIAL_8250_EXTENDED is not set -# CONFIG_SERIAL_8250_PCI1XXXX is not set -CONFIG_SERIAL_8250_DWLIB=y -# CONFIG_SERIAL_8250_DW is not set -# CONFIG_SERIAL_8250_RT288X is not set -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_8250_PERICOM=y - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=y -# CONFIG_SERIAL_UARTLITE is not set -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -# CONFIG_SERIAL_JSM is not set -# CONFIG_SERIAL_LANTIQ is not set -# CONFIG_SERIAL_SCCNXP is not set -# CONFIG_SERIAL_SC16IS7XX is not set -# CONFIG_SERIAL_ALTERA_JTAGUART is not set -# CONFIG_SERIAL_ALTERA_UART is not set -# CONFIG_SERIAL_ARC is not set -# CONFIG_SERIAL_RP2 is not set -# CONFIG_SERIAL_FSL_LPUART is not set -# CONFIG_SERIAL_FSL_LINFLEXUART is not set -# CONFIG_SERIAL_SPRD is not set -# end of Serial drivers - -# CONFIG_SERIAL_NONSTANDARD is not set -# CONFIG_N_GSM is not set -# CONFIG_NOZOMI is not set -# CONFIG_NULL_TTY is not set -CONFIG_HVC_DRIVER=y -# CONFIG_SERIAL_DEV_BUS is not set -CONFIG_VIRTIO_CONSOLE=m -# CONFIG_IPMI_HANDLER is not set -CONFIG_HW_RANDOM=y -# CONFIG_HW_RANDOM_TIMERIOMEM is not set -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -# CONFIG_HW_RANDOM_BA431 is not set -# CONFIG_HW_RANDOM_VIA is not set -CONFIG_HW_RANDOM_VIRTIO=m -# CONFIG_HW_RANDOM_XIPHERA is not set -# CONFIG_APPLICOM is not set -CONFIG_MWAVE=m -# CONFIG_DEVMEM is not set -CONFIG_NVRAM=m -# CONFIG_DEVPORT is not set -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -# CONFIG_HANGCHECK_TIMER is not set -CONFIG_TCG_TPM=m -CONFIG_TCG_TPM2_HMAC=y -CONFIG_HW_RANDOM_TPM=y -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_SPI_CR50=y -CONFIG_TCG_TIS_I2C=m -CONFIG_TCG_TIS_I2C_CR50=m -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -# CONFIG_TELCLOCK is not set -# CONFIG_XILLYBUS is not set -# CONFIG_XILLYUSB is not set -# end of Character devices - -# -# I2C support -# -CONFIG_I2C=y -CONFIG_ACPI_I2C_OPREGION=y -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# -# CONFIG_I2C_MUX_LTC4306 is not set -# CONFIG_I2C_MUX_PCA9541 is not set -# CONFIG_I2C_MUX_REG is not set -# CONFIG_I2C_MUX_MLXCPLD is not set -# end of Multiplexer I2C Chip support - -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_CCGX_UCSI=m -# CONFIG_I2C_ALI1535 is not set -# CONFIG_I2C_ALI1563 is not set -# CONFIG_I2C_ALI15X3 is not set -# CONFIG_I2C_AMD756 is not set -# CONFIG_I2C_AMD8111 is not set -# CONFIG_I2C_AMD_MP2 is not set -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -# CONFIG_I2C_NFORCE2 is not set -CONFIG_I2C_NVIDIA_GPU=m -# CONFIG_I2C_SIS5595 is not set -# CONFIG_I2C_SIS630 is not set -# CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_VIA is not set -# CONFIG_I2C_VIAPRO is not set -# CONFIG_I2C_ZHAOXIN is not set - -# -# ACPI drivers -# -# CONFIG_I2C_SCMI is not set - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -# CONFIG_I2C_DESIGNWARE_CORE is not set -# CONFIG_I2C_EMEV2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PCA_PLATFORM is not set -# CONFIG_I2C_SIMTEC is not set -# CONFIG_I2C_XILINX is not set - -# -# External I2C/SMBus adapter drivers -# -# CONFIG_I2C_DIOLAN_U2C is not set -# CONFIG_I2C_CP2615 is not set -# CONFIG_I2C_PCI1XXXX is not set -# CONFIG_I2C_ROBOTFUZZ_OSIF is not set -# CONFIG_I2C_TAOS_EVM is not set -# CONFIG_I2C_TINY_USB is not set - -# -# Other I2C/SMBus bus drivers -# -# CONFIG_I2C_MLXCPLD is not set -CONFIG_I2C_VIRTIO=m -# end of I2C Hardware Bus support - -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_SLAVE is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# end of I2C support - -CONFIG_I3C=m -CONFIG_CDNS_I3C_MASTER=m -CONFIG_DW_I3C_MASTER=m -# CONFIG_SVC_I3C_MASTER is not set -# CONFIG_MIPI_I3C_HCI is not set -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y -# CONFIG_SPI_MEM is not set - -# -# SPI Master Controller Drivers -# -# CONFIG_SPI_ALTERA is not set -# CONFIG_SPI_AXI_SPI_ENGINE is not set -# CONFIG_SPI_BITBANG is not set -# CONFIG_SPI_CADENCE is not set -# CONFIG_SPI_CH341 is not set -# CONFIG_SPI_DESIGNWARE is not set -# CONFIG_SPI_MICROCHIP_CORE is not set -# CONFIG_SPI_MICROCHIP_CORE_QSPI is not set -# CONFIG_SPI_LANTIQ_SSC is not set -# CONFIG_SPI_PCI1XXXX is not set -# CONFIG_SPI_PXA2XX is not set -# CONFIG_SPI_SC18IS602 is not set -# CONFIG_SPI_SIFIVE is not set -# CONFIG_SPI_MXIC is not set -# CONFIG_SPI_XCOMM is not set -# CONFIG_SPI_XILINX is not set -# CONFIG_SPI_AMD is not set - -# -# SPI Multiplexer support -# -# CONFIG_SPI_MUX is not set - -# -# SPI Protocol Masters -# -# CONFIG_SPI_SPIDEV is not set -# CONFIG_SPI_LOOPBACK_TEST is not set -# CONFIG_SPI_TLE62X0 is not set -# CONFIG_SPI_SLAVE is not set -CONFIG_SPI_DYNAMIC=y -CONFIG_SPMI=m -# CONFIG_SPMI_HISI3670 is not set -# CONFIG_HSI is not set -CONFIG_PPS=m -# CONFIG_PPS_DEBUG is not set - -# -# PPS clients support -# -# CONFIG_PPS_CLIENT_KTIMER is not set -# CONFIG_PPS_CLIENT_LDISC is not set -# CONFIG_PPS_CLIENT_GPIO is not set - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=m -CONFIG_PTP_1588_CLOCK_OPTIONAL=m - -# -# Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. -# -CONFIG_PTP_1588_CLOCK_KVM=m -# CONFIG_PTP_1588_CLOCK_IDT82P33 is not set -# CONFIG_PTP_1588_CLOCK_IDTCM is not set -# CONFIG_PTP_1588_CLOCK_FC3W is not set -# CONFIG_PTP_1588_CLOCK_MOCK is not set -# CONFIG_PTP_1588_CLOCK_VMW is not set -# end of PTP clock support - -CONFIG_PINCTRL=y -# CONFIG_DEBUG_PINCTRL is not set -# CONFIG_PINCTRL_AMD is not set -# CONFIG_PINCTRL_CY8C95X0 is not set -# CONFIG_PINCTRL_MCP23S08 is not set -# CONFIG_PINCTRL_SX150X is not set - -# -# Intel pinctrl drivers -# -# CONFIG_PINCTRL_BAYTRAIL is not set -# CONFIG_PINCTRL_CHERRYVIEW is not set -# CONFIG_PINCTRL_LYNXPOINT is not set -# CONFIG_PINCTRL_INTEL_PLATFORM is not set -# CONFIG_PINCTRL_ALDERLAKE is not set -# CONFIG_PINCTRL_BROXTON is not set -# CONFIG_PINCTRL_CANNONLAKE is not set -# CONFIG_PINCTRL_CEDARFORK is not set -# CONFIG_PINCTRL_DENVERTON is not set -# CONFIG_PINCTRL_ELKHARTLAKE is not set -# CONFIG_PINCTRL_EMMITSBURG is not set -# CONFIG_PINCTRL_GEMINILAKE is not set -# CONFIG_PINCTRL_ICELAKE is not set -# CONFIG_PINCTRL_JASPERLAKE is not set -# CONFIG_PINCTRL_LAKEFIELD is not set -# CONFIG_PINCTRL_LEWISBURG is not set -# CONFIG_PINCTRL_METEORLAKE is not set -# CONFIG_PINCTRL_METEORPOINT is not set -# CONFIG_PINCTRL_SUNRISEPOINT is not set -# CONFIG_PINCTRL_TIGERLAKE is not set -# end of Intel pinctrl drivers - -# -# Renesas pinctrl drivers -# -# end of Renesas pinctrl drivers - -# CONFIG_GPIOLIB is not set -# CONFIG_W1 is not set -# CONFIG_POWER_RESET is not set -# CONFIG_POWER_SEQUENCING is not set -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_POWER_SUPPLY_HWMON=y -# CONFIG_IP5XXX_POWER is not set -# CONFIG_TEST_POWER is not set -# CONFIG_CHARGER_ADP5061 is not set -# CONFIG_BATTERY_CW2015 is not set -# CONFIG_BATTERY_DS2780 is not set -# CONFIG_BATTERY_DS2781 is not set -# CONFIG_BATTERY_DS2782 is not set -# CONFIG_BATTERY_SAMSUNG_SDI is not set -# CONFIG_BATTERY_SBS is not set -# CONFIG_CHARGER_SBS is not set -# CONFIG_BATTERY_BQ27XXX is not set -# CONFIG_BATTERY_MAX17042 is not set -# CONFIG_BATTERY_MAX1720X is not set -# CONFIG_CHARGER_ISP1704 is not set -# CONFIG_CHARGER_MAX8903 is not set -# CONFIG_CHARGER_LP8727 is not set -# CONFIG_CHARGER_LTC4162L is not set -# CONFIG_CHARGER_MAX77976 is not set -# CONFIG_CHARGER_BQ2415X is not set -# CONFIG_BATTERY_GAUGE_LTC2941 is not set -# CONFIG_BATTERY_GOLDFISH is not set -# CONFIG_BATTERY_RT5033 is not set -# CONFIG_CHARGER_BD99954 is not set -# CONFIG_BATTERY_UG3105 is not set -# CONFIG_FUEL_GAUGE_MM8013 is not set -CONFIG_HWMON=y -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -# CONFIG_SENSORS_ABITUGURU is not set -# CONFIG_SENSORS_ABITUGURU3 is not set -# CONFIG_SENSORS_AD7314 is not set -# CONFIG_SENSORS_AD7414 is not set -# CONFIG_SENSORS_AD7418 is not set -# CONFIG_SENSORS_ADM1025 is not set -# CONFIG_SENSORS_ADM1026 is not set -# CONFIG_SENSORS_ADM1029 is not set -# CONFIG_SENSORS_ADM1031 is not set -# CONFIG_SENSORS_ADM1177 is not set -# CONFIG_SENSORS_ADM9240 is not set -# CONFIG_SENSORS_ADT7310 is not set -# CONFIG_SENSORS_ADT7410 is not set -# CONFIG_SENSORS_ADT7411 is not set -# CONFIG_SENSORS_ADT7462 is not set -# CONFIG_SENSORS_ADT7470 is not set -# CONFIG_SENSORS_ADT7475 is not set -# CONFIG_SENSORS_AHT10 is not set -# CONFIG_SENSORS_AQUACOMPUTER_D5NEXT is not set -# CONFIG_SENSORS_AS370 is not set -# CONFIG_SENSORS_ASC7621 is not set -# CONFIG_SENSORS_ASUS_ROG_RYUJIN is not set -# CONFIG_SENSORS_AXI_FAN_CONTROL is not set -# CONFIG_SENSORS_K8TEMP is not set -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -# CONFIG_SENSORS_APPLESMC is not set -# CONFIG_SENSORS_ASB100 is not set -# CONFIG_SENSORS_ATXP1 is not set -# CONFIG_SENSORS_CHIPCAP2 is not set -# CONFIG_SENSORS_CORSAIR_CPRO is not set -# CONFIG_SENSORS_CORSAIR_PSU is not set -CONFIG_SENSORS_DRIVETEMP=m -# CONFIG_SENSORS_DS620 is not set -# CONFIG_SENSORS_DS1621 is not set -# CONFIG_SENSORS_DELL_SMM is not set -# CONFIG_SENSORS_I5K_AMB is not set -# CONFIG_SENSORS_F71805F is not set -# CONFIG_SENSORS_F71882FG is not set -# CONFIG_SENSORS_F75375S is not set -# CONFIG_SENSORS_FSCHMD is not set -# CONFIG_SENSORS_FTSTEUTATES is not set -# CONFIG_SENSORS_GIGABYTE_WATERFORCE is not set -# CONFIG_SENSORS_GL518SM is not set -# CONFIG_SENSORS_GL520SM is not set -# CONFIG_SENSORS_G760A is not set -# CONFIG_SENSORS_G762 is not set -# CONFIG_SENSORS_HIH6130 is not set -# CONFIG_SENSORS_HS3001 is not set -# CONFIG_SENSORS_I5500 is not set -CONFIG_SENSORS_CORETEMP=m -# CONFIG_SENSORS_IT87 is not set -# CONFIG_SENSORS_JC42 is not set -# CONFIG_SENSORS_POWERZ is not set -# CONFIG_SENSORS_POWR1220 is not set -CONFIG_SENSORS_LENOVO_EC=m -# CONFIG_SENSORS_LINEAGE is not set -# CONFIG_SENSORS_LTC2945 is not set -# CONFIG_SENSORS_LTC2947_I2C is not set -# CONFIG_SENSORS_LTC2947_SPI is not set -# CONFIG_SENSORS_LTC2990 is not set -# CONFIG_SENSORS_LTC2991 is not set -# CONFIG_SENSORS_LTC4151 is not set -# CONFIG_SENSORS_LTC4215 is not set -# CONFIG_SENSORS_LTC4222 is not set -# CONFIG_SENSORS_LTC4245 is not set -# CONFIG_SENSORS_LTC4260 is not set -# CONFIG_SENSORS_LTC4261 is not set -# CONFIG_SENSORS_LTC4282 is not set -# CONFIG_SENSORS_MAX1111 is not set -# CONFIG_SENSORS_MAX127 is not set -# CONFIG_SENSORS_MAX16065 is not set -# CONFIG_SENSORS_MAX1619 is not set -# CONFIG_SENSORS_MAX1668 is not set -# CONFIG_SENSORS_MAX197 is not set -# CONFIG_SENSORS_MAX31722 is not set -# CONFIG_SENSORS_MAX31730 is not set -# CONFIG_SENSORS_MAX31760 is not set -# CONFIG_MAX31827 is not set -# CONFIG_SENSORS_MAX6620 is not set -# CONFIG_SENSORS_MAX6621 is not set -# CONFIG_SENSORS_MAX6639 is not set -# CONFIG_SENSORS_MAX6650 is not set -# CONFIG_SENSORS_MAX6697 is not set -# CONFIG_SENSORS_MAX31790 is not set -# CONFIG_SENSORS_MC34VR500 is not set -# CONFIG_SENSORS_MCP3021 is not set -# CONFIG_SENSORS_TC654 is not set -# CONFIG_SENSORS_TPS23861 is not set -# CONFIG_SENSORS_MR75203 is not set -# CONFIG_SENSORS_ADCXX is not set -# CONFIG_SENSORS_LM63 is not set -# CONFIG_SENSORS_LM70 is not set -# CONFIG_SENSORS_LM73 is not set -# CONFIG_SENSORS_LM75 is not set -# CONFIG_SENSORS_LM77 is not set -# CONFIG_SENSORS_LM78 is not set -# CONFIG_SENSORS_LM80 is not set -# CONFIG_SENSORS_LM83 is not set -# CONFIG_SENSORS_LM85 is not set -# CONFIG_SENSORS_LM87 is not set -# CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set -# CONFIG_SENSORS_LM93 is not set -# CONFIG_SENSORS_LM95234 is not set -# CONFIG_SENSORS_LM95241 is not set -# CONFIG_SENSORS_LM95245 is not set -# CONFIG_SENSORS_PC87360 is not set -# CONFIG_SENSORS_PC87427 is not set -# CONFIG_SENSORS_NCT6683 is not set -CONFIG_SENSORS_NCT6775_CORE=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT6775_I2C=m -# CONFIG_SENSORS_NCT7802 is not set -# CONFIG_SENSORS_NCT7904 is not set -# CONFIG_SENSORS_NPCM7XX is not set -# CONFIG_SENSORS_NZXT_KRAKEN2 is not set -# CONFIG_SENSORS_NZXT_KRAKEN3 is not set -# CONFIG_SENSORS_NZXT_SMART2 is not set -# CONFIG_SENSORS_OCC_P8_I2C is not set -# CONFIG_SENSORS_OXP is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_PMBUS is not set -# CONFIG_SENSORS_PT5161L is not set -CONFIG_SENSORS_PWM_FAN=m -# CONFIG_SENSORS_SBTSI is not set -# CONFIG_SENSORS_SBRMI is not set -# CONFIG_SENSORS_SHT21 is not set -# CONFIG_SENSORS_SHT3x is not set -# CONFIG_SENSORS_SHT4x is not set -# CONFIG_SENSORS_SHTC1 is not set -# CONFIG_SENSORS_SIS5595 is not set -# CONFIG_SENSORS_DME1737 is not set -# CONFIG_SENSORS_EMC1403 is not set -# CONFIG_SENSORS_EMC2103 is not set -# CONFIG_SENSORS_EMC2305 is not set -# CONFIG_SENSORS_EMC6W201 is not set -# CONFIG_SENSORS_SMSC47M1 is not set -# CONFIG_SENSORS_SMSC47M192 is not set -# CONFIG_SENSORS_SMSC47B397 is not set -# CONFIG_SENSORS_SCH5627 is not set -# CONFIG_SENSORS_SCH5636 is not set -# CONFIG_SENSORS_STTS751 is not set -# CONFIG_SENSORS_ADC128D818 is not set -# CONFIG_SENSORS_ADS7828 is not set -# CONFIG_SENSORS_ADS7871 is not set -# CONFIG_SENSORS_AMC6821 is not set -# CONFIG_SENSORS_INA209 is not set -# CONFIG_SENSORS_INA2XX is not set -# CONFIG_SENSORS_INA238 is not set -# CONFIG_SENSORS_INA3221 is not set -# CONFIG_SENSORS_SPD5118 is not set -# CONFIG_SENSORS_TC74 is not set -# CONFIG_SENSORS_THMC50 is not set -# CONFIG_SENSORS_TMP102 is not set -# CONFIG_SENSORS_TMP103 is not set -# CONFIG_SENSORS_TMP108 is not set -# CONFIG_SENSORS_TMP401 is not set -# CONFIG_SENSORS_TMP421 is not set -# CONFIG_SENSORS_TMP464 is not set -# CONFIG_SENSORS_TMP513 is not set -# CONFIG_SENSORS_VIA_CPUTEMP is not set -# CONFIG_SENSORS_VIA686A is not set -# CONFIG_SENSORS_VT1211 is not set -# CONFIG_SENSORS_VT8231 is not set -# CONFIG_SENSORS_W83773G is not set -# CONFIG_SENSORS_W83781D is not set -# CONFIG_SENSORS_W83791D is not set -# CONFIG_SENSORS_W83792D is not set -# CONFIG_SENSORS_W83793 is not set -# CONFIG_SENSORS_W83795 is not set -# CONFIG_SENSORS_W83L785TS is not set -# CONFIG_SENSORS_W83L786NG is not set -# CONFIG_SENSORS_W83627HF is not set -# CONFIG_SENSORS_W83627EHF is not set -# CONFIG_SENSORS_XGENE is not set - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -# CONFIG_SENSORS_ATK0110 is not set -# CONFIG_SENSORS_ASUS_WMI is not set -# CONFIG_SENSORS_ASUS_EC is not set -# CONFIG_SENSORS_HP_WMI is not set -CONFIG_THERMAL=y -CONFIG_THERMAL_NETLINK=y -# CONFIG_THERMAL_STATISTICS is not set -# CONFIG_THERMAL_DEBUGFS is not set -# CONFIG_THERMAL_CORE_TESTING is not set -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 -CONFIG_THERMAL_HWMON=y -# CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE is not set -CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE=y -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -# CONFIG_THERMAL_GOV_BANG_BANG is not set -CONFIG_THERMAL_GOV_USER_SPACE=y -CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y -# CONFIG_DEVFREQ_THERMAL is not set -# CONFIG_THERMAL_EMULATION is not set - -# -# Intel thermal drivers -# -# CONFIG_INTEL_POWERCLAMP is not set -CONFIG_X86_THERMAL_VECTOR=y -CONFIG_INTEL_TCC=y -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -# CONFIG_INTEL_SOC_DTS_THERMAL is not set - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m -CONFIG_INT3406_THERMAL=m -CONFIG_PROC_THERMAL_MMIO_RAPL=m -# end of ACPI INT340X thermal drivers - -CONFIG_INTEL_PCH_THERMAL=m -CONFIG_INTEL_TCC_COOLING=m -CONFIG_INTEL_HFI_THERMAL=y -# end of Intel thermal drivers - -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_CORE=y -# CONFIG_WATCHDOG_NOWAYOUT is not set -CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y -CONFIG_WATCHDOG_OPEN_TIMEOUT=0 -CONFIG_WATCHDOG_SYSFS=y -# CONFIG_WATCHDOG_HRTIMER_PRETIMEOUT is not set - -# -# Watchdog Pretimeout Governors -# -# CONFIG_WATCHDOG_PRETIMEOUT_GOV is not set - -# -# Watchdog Device Drivers -# -# CONFIG_SOFT_WATCHDOG is not set -# CONFIG_LENOVO_SE10_WDT is not set -# CONFIG_WDAT_WDT is not set -# CONFIG_XILINX_WATCHDOG is not set -# CONFIG_ZIIRAVE_WATCHDOG is not set -# CONFIG_CADENCE_WATCHDOG is not set -# CONFIG_DW_WATCHDOG is not set -# CONFIG_MAX63XX_WATCHDOG is not set -# CONFIG_ACQUIRE_WDT is not set -# CONFIG_ADVANTECH_WDT is not set -# CONFIG_ADVANTECH_EC_WDT is not set -# CONFIG_ALIM1535_WDT is not set -# CONFIG_ALIM7101_WDT is not set -# CONFIG_EBC_C384_WDT is not set -# CONFIG_EXAR_WDT is not set -# CONFIG_F71808E_WDT is not set -# CONFIG_SP5100_TCO is not set -# CONFIG_SBC_FITPC2_WATCHDOG is not set -# CONFIG_EUROTECH_WDT is not set -# CONFIG_IB700_WDT is not set -# CONFIG_IBMASR is not set -# CONFIG_WAFER_WDT is not set -# CONFIG_I6300ESB_WDT is not set -# CONFIG_IE6XX_WDT is not set -CONFIG_ITCO_WDT=m -CONFIG_ITCO_VENDOR_SUPPORT=y -CONFIG_IT8712F_WDT=m -CONFIG_IT87_WDT=m -# CONFIG_HP_WATCHDOG is not set -# CONFIG_SC1200_WDT is not set -# CONFIG_PC87413_WDT is not set -# CONFIG_NV_TCO is not set -# CONFIG_60XX_WDT is not set -# CONFIG_CPU5_WDT is not set -# CONFIG_SMSC_SCH311X_WDT is not set -# CONFIG_SMSC37B787_WDT is not set -# CONFIG_TQMX86_WDT is not set -# CONFIG_VIA_WDT is not set -# CONFIG_W83627HF_WDT is not set -# CONFIG_W83877F_WDT is not set -# CONFIG_W83977F_WDT is not set -# CONFIG_MACHZ_WDT is not set -# CONFIG_SBC_EPX_C3_WATCHDOG is not set -# CONFIG_NI903X_WDT is not set -# CONFIG_NIC7018_WDT is not set - -# -# PCI-based Watchdog Cards -# -# CONFIG_PCIPCWATCHDOG is not set -# CONFIG_WDTPCI is not set - -# -# USB-based Watchdog Cards -# -# CONFIG_USBPCWATCHDOG is not set -CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=m -CONFIG_SSB_SPROM=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -CONFIG_SSB_SDIOHOST_POSSIBLE=y -# CONFIG_SSB_SDIOHOST is not set -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y -CONFIG_BCMA_POSSIBLE=y -# CONFIG_BCMA is not set - -# -# Multifunction device drivers -# -CONFIG_MFD_CORE=m -# CONFIG_MFD_AS3711 is not set -# CONFIG_MFD_SMPRO is not set -# CONFIG_PMIC_ADP5520 is not set -# CONFIG_MFD_BCM590XX is not set -# CONFIG_MFD_BD9571MWV is not set -# CONFIG_MFD_AXP20X_I2C is not set -# CONFIG_MFD_CS42L43_I2C is not set -# CONFIG_MFD_MADERA is not set -# CONFIG_PMIC_DA903X is not set -# CONFIG_MFD_DA9052_SPI is not set -# CONFIG_MFD_DA9052_I2C is not set -# CONFIG_MFD_DA9055 is not set -# CONFIG_MFD_DA9062 is not set -# CONFIG_MFD_DA9063 is not set -# CONFIG_MFD_DA9150 is not set -# CONFIG_MFD_DLN2 is not set -# CONFIG_MFD_MC13XXX_SPI is not set -# CONFIG_MFD_MC13XXX_I2C is not set -# CONFIG_MFD_MP2629 is not set -# CONFIG_MFD_INTEL_QUARK_I2C_GPIO is not set -CONFIG_LPC_ICH=m -CONFIG_LPC_SCH=m -CONFIG_MFD_INTEL_LPSS=m -CONFIG_MFD_INTEL_LPSS_ACPI=m -CONFIG_MFD_INTEL_LPSS_PCI=m -# CONFIG_MFD_INTEL_PMC_BXT is not set -# CONFIG_MFD_IQS62X is not set -# CONFIG_MFD_JANZ_CMODIO is not set -# CONFIG_MFD_KEMPLD is not set -# CONFIG_MFD_88PM800 is not set -# CONFIG_MFD_88PM805 is not set -# CONFIG_MFD_88PM860X is not set -# CONFIG_MFD_MAX14577 is not set -# CONFIG_MFD_MAX77541 is not set -# CONFIG_MFD_MAX77693 is not set -# CONFIG_MFD_MAX77843 is not set -# CONFIG_MFD_MAX8907 is not set -# CONFIG_MFD_MAX8925 is not set -# CONFIG_MFD_MAX8997 is not set -# CONFIG_MFD_MAX8998 is not set -# CONFIG_MFD_MT6360 is not set -# CONFIG_MFD_MT6370 is not set -# CONFIG_MFD_MT6397 is not set -# CONFIG_MFD_MENF21BMC is not set -# CONFIG_MFD_OCELOT is not set -# CONFIG_EZX_PCAP is not set -# CONFIG_MFD_VIPERBOARD is not set -# CONFIG_MFD_RETU is not set -# CONFIG_MFD_PCF50633 is not set -# CONFIG_MFD_SY7636A is not set -# CONFIG_MFD_RDC321X is not set -# CONFIG_MFD_RT4831 is not set -# CONFIG_MFD_RT5033 is not set -# CONFIG_MFD_RT5120 is not set -# CONFIG_MFD_RC5T583 is not set -# CONFIG_MFD_SI476X_CORE is not set -# CONFIG_MFD_SM501 is not set -# CONFIG_MFD_SKY81452 is not set -# CONFIG_MFD_SYSCON is not set -# CONFIG_MFD_LP3943 is not set -# CONFIG_MFD_LP8788 is not set -# CONFIG_MFD_TI_LMU is not set -# CONFIG_MFD_PALMAS is not set -# CONFIG_TPS6105X is not set -# CONFIG_TPS6507X is not set -# CONFIG_MFD_TPS65086 is not set -# CONFIG_MFD_TPS65090 is not set -# CONFIG_MFD_TI_LP873X is not set -# CONFIG_MFD_TPS6586X is not set -# CONFIG_MFD_TPS65912_I2C is not set -# CONFIG_MFD_TPS65912_SPI is not set -# CONFIG_MFD_TPS6594_I2C is not set -# CONFIG_MFD_TPS6594_SPI is not set -# CONFIG_TWL4030_CORE is not set -# CONFIG_TWL6040_CORE is not set -# CONFIG_MFD_WL1273_CORE is not set -# CONFIG_MFD_LM3533 is not set -# CONFIG_MFD_TQMX86 is not set -# CONFIG_MFD_VX855 is not set -# CONFIG_MFD_ARIZONA_I2C is not set -# CONFIG_MFD_ARIZONA_SPI is not set -# CONFIG_MFD_WM8400 is not set -# CONFIG_MFD_WM831X_I2C is not set -# CONFIG_MFD_WM831X_SPI is not set -# CONFIG_MFD_WM8350_I2C is not set -# CONFIG_MFD_WM8994 is not set -# CONFIG_MFD_ATC260X_I2C is not set -# CONFIG_MFD_CS40L50_I2C is not set -# CONFIG_MFD_CS40L50_SPI is not set -# CONFIG_MFD_INTEL_M10_BMC_SPI is not set -# end of Multifunction device drivers - -# CONFIG_REGULATOR is not set -# CONFIG_RC_CORE is not set -CONFIG_CEC_CORE=m - -# -# CEC support -# -# CONFIG_MEDIA_CEC_SUPPORT is not set -# end of CEC support - -CONFIG_MEDIA_SUPPORT=m -CONFIG_MEDIA_SUPPORT_FILTER=y -CONFIG_MEDIA_SUBDRV_AUTOSELECT=y - -# -# Media device types -# -CONFIG_MEDIA_CAMERA_SUPPORT=y -# CONFIG_MEDIA_ANALOG_TV_SUPPORT is not set -# CONFIG_MEDIA_DIGITAL_TV_SUPPORT is not set -# CONFIG_MEDIA_RADIO_SUPPORT is not set -# CONFIG_MEDIA_SDR_SUPPORT is not set -# CONFIG_MEDIA_PLATFORM_SUPPORT is not set -# CONFIG_MEDIA_TEST_SUPPORT is not set -# end of Media device types - -CONFIG_VIDEO_DEV=m -CONFIG_MEDIA_CONTROLLER=y - -# -# Video4Linux options -# -CONFIG_VIDEO_V4L2_I2C=y -CONFIG_VIDEO_V4L2_SUBDEV_API=y -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set -CONFIG_V4L2_FWNODE=m -CONFIG_V4L2_ASYNC=m -# end of Video4Linux options - -# -# Media controller options -# -# end of Media controller options - -# -# Media drivers -# - -# -# Drivers filtered as selected at 'Filter media drivers' -# - -# -# Media drivers -# -CONFIG_MEDIA_USB_SUPPORT=y - -# -# Webcam devices -# -# CONFIG_USB_GSPCA is not set -# CONFIG_USB_PWC is not set -# CONFIG_USB_S2255 is not set -# CONFIG_VIDEO_USBTV is not set -CONFIG_USB_VIDEO_CLASS=m -CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y - -# -# Webcam, TV (analog/digital) USB devices -# -# CONFIG_VIDEO_EM28XX is not set -# CONFIG_MEDIA_PCI_SUPPORT is not set -CONFIG_UVC_COMMON=m -CONFIG_VIDEOBUF2_CORE=m -CONFIG_VIDEOBUF2_V4L2=m -CONFIG_VIDEOBUF2_MEMOPS=m -CONFIG_VIDEOBUF2_VMALLOC=m -# end of Media drivers - -CONFIG_MEDIA_HIDE_ANCILLARY_SUBDRV=y - -# -# Media ancillary drivers -# -CONFIG_VIDEO_CAMERA_SENSOR=y -# CONFIG_VIDEO_ALVIUM_CSI2 is not set -# CONFIG_VIDEO_AR0521 is not set -# CONFIG_VIDEO_GC0308 is not set -# CONFIG_VIDEO_GC05A2 is not set -# CONFIG_VIDEO_GC08A3 is not set -# CONFIG_VIDEO_GC2145 is not set -# CONFIG_VIDEO_HI556 is not set -# CONFIG_VIDEO_HI846 is not set -# CONFIG_VIDEO_HI847 is not set -# CONFIG_VIDEO_IMX208 is not set -# CONFIG_VIDEO_IMX219 is not set -# CONFIG_VIDEO_IMX258 is not set -# CONFIG_VIDEO_IMX274 is not set -# CONFIG_VIDEO_IMX283 is not set -# CONFIG_VIDEO_IMX290 is not set -# CONFIG_VIDEO_IMX296 is not set -# CONFIG_VIDEO_IMX319 is not set -# CONFIG_VIDEO_IMX355 is not set -# CONFIG_VIDEO_MT9M001 is not set -# CONFIG_VIDEO_MT9M111 is not set -# CONFIG_VIDEO_MT9M114 is not set -# CONFIG_VIDEO_MT9P031 is not set -# CONFIG_VIDEO_MT9T112 is not set -# CONFIG_VIDEO_MT9V011 is not set -# CONFIG_VIDEO_MT9V032 is not set -# CONFIG_VIDEO_MT9V111 is not set -# CONFIG_VIDEO_OG01A1B is not set -# CONFIG_VIDEO_OV01A10 is not set -# CONFIG_VIDEO_OV02A10 is not set -# CONFIG_VIDEO_OV08D10 is not set -# CONFIG_VIDEO_OV08X40 is not set -# CONFIG_VIDEO_OV13858 is not set -# CONFIG_VIDEO_OV13B10 is not set -# CONFIG_VIDEO_OV2640 is not set -# CONFIG_VIDEO_OV2680 is not set -# CONFIG_VIDEO_OV2685 is not set -# CONFIG_VIDEO_OV2740 is not set -# CONFIG_VIDEO_OV5647 is not set -# CONFIG_VIDEO_OV5648 is not set -# CONFIG_VIDEO_OV5670 is not set -# CONFIG_VIDEO_OV5675 is not set -# CONFIG_VIDEO_OV5693 is not set -# CONFIG_VIDEO_OV5695 is not set -# CONFIG_VIDEO_OV64A40 is not set -# CONFIG_VIDEO_OV6650 is not set -# CONFIG_VIDEO_OV7251 is not set -# CONFIG_VIDEO_OV7640 is not set -# CONFIG_VIDEO_OV7670 is not set -# CONFIG_VIDEO_OV772X is not set -# CONFIG_VIDEO_OV7740 is not set -# CONFIG_VIDEO_OV8856 is not set -# CONFIG_VIDEO_OV8858 is not set -# CONFIG_VIDEO_OV8865 is not set -# CONFIG_VIDEO_OV9640 is not set -# CONFIG_VIDEO_OV9650 is not set -# CONFIG_VIDEO_OV9734 is not set -# CONFIG_VIDEO_RDACM20 is not set -# CONFIG_VIDEO_RDACM21 is not set -# CONFIG_VIDEO_RJ54N1 is not set -# CONFIG_VIDEO_S5C73M3 is not set -# CONFIG_VIDEO_S5K5BAF is not set -# CONFIG_VIDEO_S5K6A3 is not set -# CONFIG_VIDEO_CCS is not set -# CONFIG_VIDEO_ET8EK8 is not set - -# -# Camera ISPs -# -# CONFIG_VIDEO_THP7312 is not set -# end of Camera ISPs - -# -# Lens drivers -# -# CONFIG_VIDEO_AK7375 is not set -# CONFIG_VIDEO_DW9714 is not set -# CONFIG_VIDEO_DW9719 is not set -# CONFIG_VIDEO_DW9768 is not set -# CONFIG_VIDEO_DW9807_VCM is not set -# end of Lens drivers - -# -# Flash devices -# -# CONFIG_VIDEO_ADP1653 is not set -# CONFIG_VIDEO_LM3560 is not set -# CONFIG_VIDEO_LM3646 is not set -# end of Flash devices - -# -# audio, video and radio I2C drivers auto-selected by 'Autoselect ancillary drivers' -# - -# -# Video and audio decoders -# - -# -# Video serializers and deserializers -# -# end of Video serializers and deserializers - -# -# SPI I2C drivers auto-selected by 'Autoselect ancillary drivers' -# - -# -# Media SPI Adapters -# -# CONFIG_VIDEO_GS1662 is not set -# end of Media SPI Adapters -# end of Media ancillary drivers - -# -# Graphics support -# -CONFIG_APERTURE_HELPERS=y -CONFIG_SCREEN_INFO=y -CONFIG_VIDEO=y -# CONFIG_AUXDISPLAY is not set -CONFIG_AGP=y -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -# CONFIG_AGP_SIS is not set -# CONFIG_AGP_VIA is not set -CONFIG_INTEL_GTT=m -CONFIG_VGA_SWITCHEROO=y -CONFIG_DRM=y -CONFIG_DRM_MIPI_DSI=y -# CONFIG_DRM_DEBUG_MM is not set -CONFIG_DRM_KMS_HELPER=y -CONFIG_DRM_PANIC=y -CONFIG_DRM_PANIC_FOREGROUND_COLOR=0xffffff -CONFIG_DRM_PANIC_BACKGROUND_COLOR=0x000000 -# CONFIG_DRM_PANIC_DEBUG is not set -CONFIG_DRM_PANIC_SCREEN="user" -CONFIG_DRM_FBDEV_EMULATION=y -CONFIG_DRM_FBDEV_OVERALLOC=100 -# CONFIG_DRM_LOAD_EDID_FIRMWARE is not set -CONFIG_DRM_DISPLAY_HELPER=m -CONFIG_DRM_DISPLAY_DP_AUX_CEC=y -CONFIG_DRM_DISPLAY_DP_AUX_CHARDEV=y -CONFIG_DRM_DISPLAY_DP_HELPER=y -CONFIG_DRM_DISPLAY_DP_TUNNEL=y -CONFIG_DRM_DISPLAY_HDCP_HELPER=y -CONFIG_DRM_DISPLAY_HDMI_HELPER=y -CONFIG_DRM_TTM=m -CONFIG_DRM_EXEC=m -CONFIG_DRM_GPUVM=m -CONFIG_DRM_BUDDY=m -CONFIG_DRM_VRAM_HELPER=m -CONFIG_DRM_TTM_HELPER=m -CONFIG_DRM_GEM_SHMEM_HELPER=y -CONFIG_DRM_SUBALLOC_HELPER=m -CONFIG_DRM_SCHED=m - -# -# I2C encoder or helper chips -# -# CONFIG_DRM_I2C_CH7006 is not set -# CONFIG_DRM_I2C_SIL164 is not set -# CONFIG_DRM_I2C_NXP_TDA998X is not set -# CONFIG_DRM_I2C_NXP_TDA9950 is not set -# end of I2C encoder or helper chips - -# -# ARM devices -# -# end of ARM devices - -# CONFIG_DRM_RADEON is not set -CONFIG_DRM_AMDGPU=m -CONFIG_DRM_AMDGPU_SI=y -CONFIG_DRM_AMDGPU_CIK=y -CONFIG_DRM_AMDGPU_USERPTR=y -# CONFIG_DRM_AMD_ISP is not set - -# -# ACP (Audio CoProcessor) Configuration -# -# CONFIG_DRM_AMD_ACP is not set -# end of ACP (Audio CoProcessor) Configuration - -# -# Display Engine Configuration -# -CONFIG_DRM_AMD_DC=y -CONFIG_DRM_AMD_DC_FP=y -CONFIG_DRM_AMD_DC_SI=y -CONFIG_DRM_AMD_SECURE_DISPLAY=y -# end of Display Engine Configuration - -CONFIG_HSA_AMD=y -CONFIG_HSA_AMD_SVM=y -CONFIG_DRM_NOUVEAU=m -CONFIG_NOUVEAU_DEBUG=5 -CONFIG_NOUVEAU_DEBUG_DEFAULT=3 -# CONFIG_NOUVEAU_DEBUG_MMU is not set -# CONFIG_NOUVEAU_DEBUG_PUSH is not set -CONFIG_DRM_NOUVEAU_BACKLIGHT=y -# CONFIG_DRM_NOUVEAU_GSP_DEFAULT is not set -CONFIG_DRM_I915=m -CONFIG_DRM_I915_FORCE_PROBE="" -CONFIG_DRM_I915_CAPTURE_ERROR=y -CONFIG_DRM_I915_COMPRESS_ERROR=y -CONFIG_DRM_I915_USERPTR=y -CONFIG_DRM_I915_GVT_KVMGT=m -CONFIG_DRM_I915_DP_TUNNEL=y -CONFIG_DRM_I915_REQUEST_TIMEOUT=20000 -CONFIG_DRM_I915_FENCE_TIMEOUT=10000 -CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 -CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 -CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 -CONFIG_DRM_I915_PREEMPT_TIMEOUT_COMPUTE=7500 -CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 -CONFIG_DRM_I915_STOP_TIMEOUT=100 -CONFIG_DRM_I915_TIMESLICE_DURATION=1 -CONFIG_DRM_I915_GVT=y -# CONFIG_DRM_XE is not set -CONFIG_DRM_VGEM=m -CONFIG_DRM_VKMS=m -# CONFIG_DRM_VMWGFX is not set -# CONFIG_DRM_GMA500 is not set -CONFIG_DRM_UDL=m -# CONFIG_DRM_AST is not set -# CONFIG_DRM_MGAG200 is not set -CONFIG_DRM_QXL=m -CONFIG_DRM_VIRTIO_GPU=m -CONFIG_DRM_VIRTIO_GPU_KMS=y -CONFIG_DRM_PANEL=y - -# -# Display Panels -# -# CONFIG_DRM_PANEL_AUO_A030JTN01 is not set -# CONFIG_DRM_PANEL_ILITEK_ILI9341 is not set -# CONFIG_DRM_PANEL_ORISETECH_OTA5601A is not set -# CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN is not set -# end of Display Panels - -CONFIG_DRM_BRIDGE=y -CONFIG_DRM_PANEL_BRIDGE=y - -# -# Display Interface Bridges -# -# CONFIG_DRM_ANALOGIX_ANX78XX is not set -# end of Display Interface Bridges - -# CONFIG_DRM_ETNAVIV is not set -CONFIG_DRM_BOCHS=m -# CONFIG_DRM_CIRRUS_QEMU is not set -# CONFIG_DRM_GM12U320 is not set -# CONFIG_DRM_PANEL_MIPI_DBI is not set -CONFIG_DRM_SIMPLEDRM=y -# CONFIG_TINYDRM_HX8357D is not set -# CONFIG_TINYDRM_ILI9163 is not set -# CONFIG_TINYDRM_ILI9225 is not set -# CONFIG_TINYDRM_ILI9341 is not set -# CONFIG_TINYDRM_ILI9486 is not set -# CONFIG_TINYDRM_MI0283QT is not set -# CONFIG_TINYDRM_REPAPER is not set -# CONFIG_TINYDRM_ST7586 is not set -# CONFIG_TINYDRM_ST7735R is not set -# CONFIG_DRM_VBOXVIDEO is not set -CONFIG_DRM_GUD=m -# CONFIG_DRM_SSD130X is not set -CONFIG_DRM_PRIVACY_SCREEN=y -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y - -# -# Frame buffer Devices -# -CONFIG_FB=y -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_UVESA is not set -# CONFIG_FB_VESA is not set -CONFIG_FB_EFI=y -# CONFIG_FB_N411 is not set -# CONFIG_FB_HGA is not set -# CONFIG_FB_OPENCORES is not set -# CONFIG_FB_S1D13XXX is not set -# CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I740 is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_VT8623 is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_ARK is not set -# CONFIG_FB_PM3 is not set -# CONFIG_FB_CARMINE is not set -# CONFIG_FB_SMSCUFX is not set -# CONFIG_FB_IBM_GXT4500 is not set -# CONFIG_FB_VIRTUAL is not set -# CONFIG_FB_METRONOME is not set -# CONFIG_FB_MB862XX is not set -# CONFIG_FB_SM712 is not set -CONFIG_FB_CORE=y -CONFIG_FB_NOTIFY=y -CONFIG_FIRMWARE_EDID=y -# CONFIG_FB_DEVICE is not set -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SYS_FILLRECT=y -CONFIG_FB_SYS_COPYAREA=y -CONFIG_FB_SYS_IMAGEBLIT=y -# CONFIG_FB_FOREIGN_ENDIAN is not set -CONFIG_FB_SYSMEM_FOPS=y -CONFIG_FB_DEFERRED_IO=y -CONFIG_FB_IOMEM_FOPS=y -CONFIG_FB_IOMEM_HELPERS=y -CONFIG_FB_SYSMEM_HELPERS=y -CONFIG_FB_SYSMEM_HELPERS_DEFERRED=y -# CONFIG_FB_MODE_HELPERS is not set -# CONFIG_FB_TILEBLITTING is not set -# end of Frame buffer Devices - -# -# Backlight & LCD device support -# -CONFIG_LCD_CLASS_DEVICE=m -# CONFIG_LCD_LTV350QV is not set -# CONFIG_LCD_ILI922X is not set -# CONFIG_LCD_ILI9320 is not set -# CONFIG_LCD_TDO24M is not set -# CONFIG_LCD_VGG2432A4 is not set -CONFIG_LCD_PLATFORM=m -# CONFIG_LCD_AMS369FG06 is not set -# CONFIG_LCD_LMS501KF03 is not set -# CONFIG_LCD_HX8357 is not set -# CONFIG_LCD_OTM3225A is not set -CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_KTD2801 is not set -# CONFIG_BACKLIGHT_KTZ8866 is not set -CONFIG_BACKLIGHT_PWM=m -# CONFIG_BACKLIGHT_APPLE is not set -# CONFIG_BACKLIGHT_QCOM_WLED is not set -# CONFIG_BACKLIGHT_SAHARA is not set -# CONFIG_BACKLIGHT_ADP8860 is not set -# CONFIG_BACKLIGHT_ADP8870 is not set -# CONFIG_BACKLIGHT_LM3509 is not set -# CONFIG_BACKLIGHT_LM3630A is not set -CONFIG_BACKLIGHT_LM3639=m -# CONFIG_BACKLIGHT_LP855X is not set -# CONFIG_BACKLIGHT_MP3309C is not set -# CONFIG_BACKLIGHT_LV5207LP is not set -# CONFIG_BACKLIGHT_BD6107 is not set -# CONFIG_BACKLIGHT_ARCXCNN is not set -# end of Backlight & LCD device support - -CONFIG_HDMI=y - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_DUMMY_CONSOLE=y -CONFIG_DUMMY_CONSOLE_COLUMNS=80 -CONFIG_DUMMY_CONSOLE_ROWS=25 -CONFIG_FRAMEBUFFER_CONSOLE=y -# CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION is not set -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set -CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y -# end of Console display driver support - -# CONFIG_LOGO is not set -# end of Graphics support - -# CONFIG_DRM_ACCEL is not set -CONFIG_SOUND=y -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_SEQ_DEVICE=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_UMP=m -# CONFIG_SND_UMP_LEGACY_RAWMIDI is not set -CONFIG_SND_COMPRESS_OFFLOAD=m -CONFIG_SND_JACK=y -CONFIG_SND_JACK_INPUT_DEV=y -# CONFIG_SND_OSSEMUL is not set -CONFIG_SND_PCM_TIMER=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_MAX_CARDS=6 -# CONFIG_SND_SUPPORT_OLD_API is not set -CONFIG_SND_PROC_FS=y -# CONFIG_SND_VERBOSE_PROCFS is not set -CONFIG_SND_CTL_FAST_LOOKUP=y -# CONFIG_SND_DEBUG is not set -CONFIG_SND_CTL_INPUT_VALIDATION=y -CONFIG_SND_UTIMER=y -CONFIG_SND_VMASTER=y -CONFIG_SND_DMA_SGBUF=y -CONFIG_SND_CTL_LED=m -CONFIG_SND_SEQUENCER=m -# CONFIG_SND_SEQ_DUMMY is not set -CONFIG_SND_SEQ_HRTIMER_DEFAULT=y -CONFIG_SND_SEQ_MIDI_EVENT=m -CONFIG_SND_SEQ_MIDI=m -# CONFIG_SND_SEQ_UMP is not set -CONFIG_SND_SEQ_UMP_CLIENT=m -CONFIG_SND_MPU401_UART=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_DRIVERS=y -# CONFIG_SND_PCSP is not set -# CONFIG_SND_DUMMY is not set -# CONFIG_SND_ALOOP is not set -# CONFIG_SND_PCMTEST is not set -# CONFIG_SND_VIRMIDI is not set -# CONFIG_SND_MTPAV is not set -# CONFIG_SND_SERIAL_U16550 is not set -CONFIG_SND_MPU401=m -# CONFIG_SND_AC97_POWER_SAVE is not set -CONFIG_SND_PCI=y -# CONFIG_SND_AD1889 is not set -# CONFIG_SND_ALS300 is not set -# CONFIG_SND_ALS4000 is not set -# CONFIG_SND_ALI5451 is not set -# CONFIG_SND_ASIHPI is not set -# CONFIG_SND_ATIIXP is not set -# CONFIG_SND_ATIIXP_MODEM is not set -# CONFIG_SND_AU8810 is not set -# CONFIG_SND_AU8820 is not set -# CONFIG_SND_AU8830 is not set -# CONFIG_SND_AW2 is not set -# CONFIG_SND_AZT3328 is not set -# CONFIG_SND_BT87X is not set -# CONFIG_SND_CA0106 is not set -# CONFIG_SND_CMIPCI is not set -# CONFIG_SND_OXYGEN is not set -# CONFIG_SND_CS4281 is not set -# CONFIG_SND_CS46XX is not set -# CONFIG_SND_CTXFI is not set -# CONFIG_SND_DARLA20 is not set -# CONFIG_SND_GINA20 is not set -# CONFIG_SND_LAYLA20 is not set -# CONFIG_SND_DARLA24 is not set -# CONFIG_SND_GINA24 is not set -# CONFIG_SND_LAYLA24 is not set -# CONFIG_SND_MONA is not set -# CONFIG_SND_MIA is not set -# CONFIG_SND_ECHO3G is not set -# CONFIG_SND_INDIGO is not set -# CONFIG_SND_INDIGOIO is not set -# CONFIG_SND_INDIGODJ is not set -# CONFIG_SND_INDIGOIOX is not set -# CONFIG_SND_INDIGODJX is not set -# CONFIG_SND_EMU10K1 is not set -# CONFIG_SND_EMU10K1X is not set -# CONFIG_SND_ENS1370 is not set -# CONFIG_SND_ENS1371 is not set -# CONFIG_SND_ES1938 is not set -# CONFIG_SND_ES1968 is not set -# CONFIG_SND_FM801 is not set -# CONFIG_SND_HDSP is not set -# CONFIG_SND_HDSPM is not set -# CONFIG_SND_ICE1712 is not set -# CONFIG_SND_ICE1724 is not set -CONFIG_SND_INTEL8X0=m -# CONFIG_SND_INTEL8X0M is not set -# CONFIG_SND_KORG1212 is not set -# CONFIG_SND_LOLA is not set -# CONFIG_SND_LX6464ES is not set -# CONFIG_SND_MAESTRO3 is not set -# CONFIG_SND_MIXART is not set -# CONFIG_SND_NM256 is not set -# CONFIG_SND_PCXHR is not set -# CONFIG_SND_RIPTIDE is not set -# CONFIG_SND_RME32 is not set -# CONFIG_SND_RME96 is not set -# CONFIG_SND_RME9652 is not set -# CONFIG_SND_SE6X is not set -# CONFIG_SND_SONICVIBES is not set -# CONFIG_SND_TRIDENT is not set -# CONFIG_SND_VIA82XX is not set -# CONFIG_SND_VIA82XX_MODEM is not set -# CONFIG_SND_VIRTUOSO is not set -# CONFIG_SND_VX222 is not set -# CONFIG_SND_YMFPCI is not set - -# -# HD-Audio -# -CONFIG_SND_HDA=m -CONFIG_SND_HDA_GENERIC_LEDS=y -CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_HWDEP=y -CONFIG_SND_HDA_RECONFIG=y -# CONFIG_SND_HDA_INPUT_BEEP is not set -CONFIG_SND_HDA_PATCH_LOADER=y -CONFIG_SND_HDA_SCODEC_COMPONENT=m -# CONFIG_SND_HDA_SCODEC_CS35L41_I2C is not set -# CONFIG_SND_HDA_SCODEC_CS35L41_SPI is not set -# CONFIG_SND_HDA_SCODEC_CS35L56_I2C is not set -# CONFIG_SND_HDA_SCODEC_CS35L56_SPI is not set -# CONFIG_SND_HDA_SCODEC_TAS2781_I2C is not set -CONFIG_SND_HDA_CODEC_REALTEK=m -CONFIG_SND_HDA_CODEC_ANALOG=m -CONFIG_SND_HDA_CODEC_SIGMATEL=m -CONFIG_SND_HDA_CODEC_VIA=m -CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_CODEC_CIRRUS=m -# CONFIG_SND_HDA_CODEC_CS8409 is not set -CONFIG_SND_HDA_CODEC_CONEXANT=m -# CONFIG_SND_HDA_CODEC_SENARYTECH is not set -CONFIG_SND_HDA_CODEC_CA0110=m -CONFIG_SND_HDA_CODEC_CA0132=m -# CONFIG_SND_HDA_CODEC_CA0132_DSP is not set -CONFIG_SND_HDA_CODEC_CMEDIA=m -CONFIG_SND_HDA_CODEC_SI3054=m -CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=60 -# CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM is not set -# CONFIG_SND_HDA_CTL_DEV_ID is not set -# end of HD-Audio - -CONFIG_SND_HDA_CORE=m -CONFIG_SND_HDA_COMPONENT=y -CONFIG_SND_HDA_I915=y -CONFIG_SND_HDA_EXT_CORE=m -CONFIG_SND_HDA_PREALLOC_SIZE=0 -CONFIG_SND_INTEL_NHLT=y -CONFIG_SND_INTEL_DSP_CONFIG=m -CONFIG_SND_INTEL_SOUNDWIRE_ACPI=m -CONFIG_SND_SPI=y -CONFIG_SND_USB=y -CONFIG_SND_USB_AUDIO=m -CONFIG_SND_USB_AUDIO_MIDI_V2=y -CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y -# CONFIG_SND_USB_UA101 is not set -# CONFIG_SND_USB_USX2Y is not set -# CONFIG_SND_USB_CAIAQ is not set -# CONFIG_SND_USB_US122L is not set -# CONFIG_SND_USB_6FIRE is not set -# CONFIG_SND_USB_HIFACE is not set -# CONFIG_SND_BCD2000 is not set -# CONFIG_SND_USB_POD is not set -# CONFIG_SND_USB_PODHD is not set -# CONFIG_SND_USB_TONEPORT is not set -# CONFIG_SND_USB_VARIAX is not set -CONFIG_SND_SOC=m -CONFIG_SND_SOC_COMPRESS=y -CONFIG_SND_SOC_TOPOLOGY=y -CONFIG_SND_SOC_ACPI=m -# CONFIG_SND_SOC_ADI is not set -# CONFIG_SND_SOC_AMD_ACP is not set -# CONFIG_SND_SOC_AMD_ACP3x is not set -# CONFIG_SND_SOC_AMD_RENOIR is not set -# CONFIG_SND_SOC_AMD_ACP5x is not set -# CONFIG_SND_SOC_AMD_ACP6x is not set -# CONFIG_SND_AMD_ACP_CONFIG is not set -# CONFIG_SND_SOC_AMD_ACP_COMMON is not set -# CONFIG_SND_SOC_AMD_RPL_ACP6x is not set -# CONFIG_SND_ATMEL_SOC is not set -# CONFIG_SND_BCM63XX_I2S_WHISTLER is not set -# CONFIG_SND_DESIGNWARE_I2S is not set - -# -# SoC Audio for Freescale CPUs -# - -# -# Common SoC Audio options for Freescale CPUs: -# -# CONFIG_SND_SOC_FSL_ASRC is not set -# CONFIG_SND_SOC_FSL_SAI is not set -# CONFIG_SND_SOC_FSL_AUDMIX is not set -# CONFIG_SND_SOC_FSL_SSI is not set -# CONFIG_SND_SOC_FSL_SPDIF is not set -# CONFIG_SND_SOC_FSL_ESAI is not set -# CONFIG_SND_SOC_FSL_MICFIL is not set -# CONFIG_SND_SOC_FSL_XCVR is not set -# CONFIG_SND_SOC_IMX_AUDMUX is not set -# end of SoC Audio for Freescale CPUs - -# CONFIG_SND_SOC_CHV3_I2S is not set -# CONFIG_SND_I2S_HI6210_I2S is not set -# CONFIG_SND_SOC_IMG is not set -# CONFIG_SND_SOC_INTEL_SST_TOPLEVEL is not set -CONFIG_SND_SOC_ACPI_INTEL_MATCH=m -# CONFIG_SND_SOC_INTEL_AVS is not set -CONFIG_SND_SOC_INTEL_MACH=y -# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set -CONFIG_SND_SOC_INTEL_HDA_DSP_COMMON=m -CONFIG_SND_SOC_INTEL_SOF_BOARD_HELPERS=m -# CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH is not set -# CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH is not set -# CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH is not set -# CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH is not set -# CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH is not set -CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m -# CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH is not set -# CONFIG_SND_SOC_INTEL_SOF_CS42L42_MACH is not set -# CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH is not set -# CONFIG_SND_SOC_INTEL_SOF_NAU8825_MACH is not set -# CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH is not set -# CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH is not set -# CONFIG_SND_SOC_INTEL_SOF_DA7219_MACH is not set -# CONFIG_SND_SOC_INTEL_SOF_SSP_AMP_MACH is not set -# CONFIG_SND_SOC_INTEL_EHL_RT5660_MACH is not set -# CONFIG_SND_SOC_MTK_BTCVSD is not set -CONFIG_SND_SOC_SOF_TOPLEVEL=y -CONFIG_SND_SOC_SOF_PCI_DEV=m -CONFIG_SND_SOC_SOF_PCI=m -CONFIG_SND_SOC_SOF_ACPI=m -CONFIG_SND_SOC_SOF_ACPI_DEV=m -CONFIG_SND_SOC_SOF_DEBUG_PROBES=m -CONFIG_SND_SOC_SOF_CLIENT=m -CONFIG_SND_SOC_SOF=m -CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y -CONFIG_SND_SOC_SOF_IPC3=y -CONFIG_SND_SOC_SOF_IPC4=y -CONFIG_SND_SOC_SOF_AMD_TOPLEVEL=m -# CONFIG_SND_SOC_SOF_AMD_RENOIR is not set -# CONFIG_SND_SOC_SOF_AMD_VANGOGH is not set -# CONFIG_SND_SOC_SOF_AMD_REMBRANDT is not set -# CONFIG_SND_SOC_SOF_AMD_ACP63 is not set -# CONFIG_SND_SOC_SOF_AMD_ACP70 is not set -CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y -CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m -CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m -CONFIG_SND_SOC_SOF_INTEL_COMMON=m -CONFIG_SND_SOC_SOF_BAYTRAIL=m -CONFIG_SND_SOC_SOF_BROADWELL=m -CONFIG_SND_SOC_SOF_MERRIFIELD=m -CONFIG_SND_SOC_SOF_INTEL_SKL=m -CONFIG_SND_SOC_SOF_SKYLAKE=m -CONFIG_SND_SOC_SOF_KABYLAKE=m -CONFIG_SND_SOC_SOF_INTEL_APL=m -CONFIG_SND_SOC_SOF_APOLLOLAKE=m -CONFIG_SND_SOC_SOF_GEMINILAKE=m -CONFIG_SND_SOC_SOF_INTEL_CNL=m -CONFIG_SND_SOC_SOF_CANNONLAKE=m -CONFIG_SND_SOC_SOF_COFFEELAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE=m -CONFIG_SND_SOC_SOF_INTEL_ICL=m -CONFIG_SND_SOC_SOF_ICELAKE=m -CONFIG_SND_SOC_SOF_JASPERLAKE=m -CONFIG_SND_SOC_SOF_INTEL_TGL=m -CONFIG_SND_SOC_SOF_TIGERLAKE=m -CONFIG_SND_SOC_SOF_ELKHARTLAKE=m -CONFIG_SND_SOC_SOF_ALDERLAKE=m -CONFIG_SND_SOC_SOF_INTEL_MTL=m -CONFIG_SND_SOC_SOF_METEORLAKE=m -CONFIG_SND_SOC_SOF_INTEL_LNL=m -CONFIG_SND_SOC_SOF_LUNARLAKE=m -# CONFIG_SND_SOC_SOF_PANTHERLAKE is not set -CONFIG_SND_SOC_SOF_HDA_COMMON=m -CONFIG_SND_SOC_SOF_HDA_GENERIC=m -CONFIG_SND_SOC_SOF_HDA_MLINK=m -CONFIG_SND_SOC_SOF_HDA_LINK=y -CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y -CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_HDA=m -CONFIG_SND_SOC_SOF_HDA_PROBES=m -CONFIG_SND_SOC_SOF_INTEL_SOUNDWIRE_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_XTENSA=m - -# -# STMicroelectronics STM32 SOC audio support -# -# end of STMicroelectronics STM32 SOC audio support - -# CONFIG_SND_SOC_XILINX_I2S is not set -# CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER is not set -# CONFIG_SND_SOC_XILINX_SPDIF is not set -# CONFIG_SND_SOC_XTFPGA_I2S is not set -CONFIG_SND_SOC_I2C_AND_SPI=m - -# -# CODEC drivers -# -# CONFIG_SND_SOC_AC97_CODEC is not set -# CONFIG_SND_SOC_ADAU1372_I2C is not set -# CONFIG_SND_SOC_ADAU1372_SPI is not set -# CONFIG_SND_SOC_ADAU1701 is not set -# CONFIG_SND_SOC_ADAU1761_I2C is not set -# CONFIG_SND_SOC_ADAU1761_SPI is not set -# CONFIG_SND_SOC_ADAU7002 is not set -# CONFIG_SND_SOC_ADAU7118_HW is not set -# CONFIG_SND_SOC_ADAU7118_I2C is not set -# CONFIG_SND_SOC_AK4104 is not set -# CONFIG_SND_SOC_AK4118 is not set -# CONFIG_SND_SOC_AK4375 is not set -# CONFIG_SND_SOC_AK4458 is not set -# CONFIG_SND_SOC_AK4554 is not set -# CONFIG_SND_SOC_AK4613 is not set -# CONFIG_SND_SOC_AK4619 is not set -# CONFIG_SND_SOC_AK4642 is not set -# CONFIG_SND_SOC_AK5386 is not set -# CONFIG_SND_SOC_AK5558 is not set -# CONFIG_SND_SOC_ALC5623 is not set -# CONFIG_SND_SOC_AW8738 is not set -# CONFIG_SND_SOC_AW88395 is not set -# CONFIG_SND_SOC_AW88261 is not set -# CONFIG_SND_SOC_AW87390 is not set -# CONFIG_SND_SOC_AW88399 is not set -# CONFIG_SND_SOC_BD28623 is not set -# CONFIG_SND_SOC_BT_SCO is not set -# CONFIG_SND_SOC_CHV3_CODEC is not set -# CONFIG_SND_SOC_CS35L32 is not set -# CONFIG_SND_SOC_CS35L33 is not set -# CONFIG_SND_SOC_CS35L34 is not set -# CONFIG_SND_SOC_CS35L35 is not set -# CONFIG_SND_SOC_CS35L36 is not set -# CONFIG_SND_SOC_CS35L41_SPI is not set -# CONFIG_SND_SOC_CS35L41_I2C is not set -# CONFIG_SND_SOC_CS35L45_SPI is not set -# CONFIG_SND_SOC_CS35L45_I2C is not set -# CONFIG_SND_SOC_CS35L56_I2C is not set -# CONFIG_SND_SOC_CS35L56_SPI is not set -# CONFIG_SND_SOC_CS42L42 is not set -# CONFIG_SND_SOC_CS42L51_I2C is not set -# CONFIG_SND_SOC_CS42L52 is not set -# CONFIG_SND_SOC_CS42L56 is not set -# CONFIG_SND_SOC_CS42L73 is not set -# CONFIG_SND_SOC_CS42L83 is not set -# CONFIG_SND_SOC_CS4234 is not set -# CONFIG_SND_SOC_CS4265 is not set -# CONFIG_SND_SOC_CS4270 is not set -# CONFIG_SND_SOC_CS4271_I2C is not set -# CONFIG_SND_SOC_CS4271_SPI is not set -# CONFIG_SND_SOC_CS42XX8_I2C is not set -# CONFIG_SND_SOC_CS43130 is not set -# CONFIG_SND_SOC_CS4341 is not set -# CONFIG_SND_SOC_CS4349 is not set -# CONFIG_SND_SOC_CS53L30 is not set -# CONFIG_SND_SOC_CS530X_I2C is not set -# CONFIG_SND_SOC_CX2072X is not set -# CONFIG_SND_SOC_DA7213 is not set -CONFIG_SND_SOC_DMIC=m -# CONFIG_SND_SOC_ES7134 is not set -# CONFIG_SND_SOC_ES7241 is not set -# CONFIG_SND_SOC_ES8311 is not set -# CONFIG_SND_SOC_ES8316 is not set -# CONFIG_SND_SOC_ES8326 is not set -# CONFIG_SND_SOC_ES8328_I2C is not set -# CONFIG_SND_SOC_ES8328_SPI is not set -# CONFIG_SND_SOC_GTM601 is not set -CONFIG_SND_SOC_HDAC_HDA=m -# CONFIG_SND_SOC_HDA is not set -# CONFIG_SND_SOC_ICS43432 is not set -# CONFIG_SND_SOC_IDT821034 is not set -# CONFIG_SND_SOC_MAX98088 is not set -# CONFIG_SND_SOC_MAX98090 is not set -# CONFIG_SND_SOC_MAX98357A is not set -# CONFIG_SND_SOC_MAX98504 is not set -# CONFIG_SND_SOC_MAX9867 is not set -# CONFIG_SND_SOC_MAX98927 is not set -# CONFIG_SND_SOC_MAX98520 is not set -# CONFIG_SND_SOC_MAX98373_I2C is not set -# CONFIG_SND_SOC_MAX98388 is not set -# CONFIG_SND_SOC_MAX98390 is not set -# CONFIG_SND_SOC_MAX98396 is not set -# CONFIG_SND_SOC_MAX9860 is not set -# CONFIG_SND_SOC_MSM8916_WCD_ANALOG is not set -# CONFIG_SND_SOC_MSM8916_WCD_DIGITAL is not set -# CONFIG_SND_SOC_PCM1681 is not set -# CONFIG_SND_SOC_PCM1789_I2C is not set -# CONFIG_SND_SOC_PCM179X_I2C is not set -# CONFIG_SND_SOC_PCM179X_SPI is not set -# CONFIG_SND_SOC_PCM186X_I2C is not set -# CONFIG_SND_SOC_PCM186X_SPI is not set -# CONFIG_SND_SOC_PCM3060_I2C is not set -# CONFIG_SND_SOC_PCM3060_SPI is not set -# CONFIG_SND_SOC_PCM3168A_I2C is not set -# CONFIG_SND_SOC_PCM3168A_SPI is not set -# CONFIG_SND_SOC_PCM5102A is not set -# CONFIG_SND_SOC_PCM512x_I2C is not set -# CONFIG_SND_SOC_PCM512x_SPI is not set -# CONFIG_SND_SOC_PCM6240 is not set -# CONFIG_SND_SOC_PEB2466 is not set -# CONFIG_SND_SOC_RT5616 is not set -# CONFIG_SND_SOC_RT5631 is not set -# CONFIG_SND_SOC_RT5640 is not set -# CONFIG_SND_SOC_RT5659 is not set -# CONFIG_SND_SOC_RT9120 is not set -# CONFIG_SND_SOC_RTQ9128 is not set -# CONFIG_SND_SOC_SGTL5000 is not set -# CONFIG_SND_SOC_SIMPLE_AMPLIFIER is not set -# CONFIG_SND_SOC_SMA1303 is not set -# CONFIG_SND_SOC_SPDIF is not set -# CONFIG_SND_SOC_SRC4XXX_I2C is not set -# CONFIG_SND_SOC_SSM2305 is not set -# CONFIG_SND_SOC_SSM2518 is not set -# CONFIG_SND_SOC_SSM2602_SPI is not set -# CONFIG_SND_SOC_SSM2602_I2C is not set -# CONFIG_SND_SOC_SSM4567 is not set -# CONFIG_SND_SOC_STA32X is not set -# CONFIG_SND_SOC_STA350 is not set -# CONFIG_SND_SOC_STI_SAS is not set -# CONFIG_SND_SOC_TAS2552 is not set -# CONFIG_SND_SOC_TAS2562 is not set -# CONFIG_SND_SOC_TAS2764 is not set -# CONFIG_SND_SOC_TAS2770 is not set -# CONFIG_SND_SOC_TAS2780 is not set -# CONFIG_SND_SOC_TAS2781_I2C is not set -# CONFIG_SND_SOC_TAS5086 is not set -# CONFIG_SND_SOC_TAS571X is not set -# CONFIG_SND_SOC_TAS5720 is not set -# CONFIG_SND_SOC_TAS5805M is not set -# CONFIG_SND_SOC_TAS6424 is not set -# CONFIG_SND_SOC_TDA7419 is not set -# CONFIG_SND_SOC_TFA9879 is not set -# CONFIG_SND_SOC_TFA989X is not set -# CONFIG_SND_SOC_TLV320AIC23_I2C is not set -# CONFIG_SND_SOC_TLV320AIC23_SPI is not set -# CONFIG_SND_SOC_TLV320AIC31XX is not set -# CONFIG_SND_SOC_TLV320AIC32X4_I2C is not set -# CONFIG_SND_SOC_TLV320AIC32X4_SPI is not set -# CONFIG_SND_SOC_TLV320AIC3X_I2C is not set -# CONFIG_SND_SOC_TLV320AIC3X_SPI is not set -# CONFIG_SND_SOC_TLV320ADCX140 is not set -# CONFIG_SND_SOC_TS3A227E is not set -# CONFIG_SND_SOC_TSCS42XX is not set -# CONFIG_SND_SOC_TSCS454 is not set -# CONFIG_SND_SOC_WM8510 is not set -# CONFIG_SND_SOC_WM8523 is not set -# CONFIG_SND_SOC_WM8580 is not set -# CONFIG_SND_SOC_WM8711 is not set -# CONFIG_SND_SOC_WM8728 is not set -# CONFIG_SND_SOC_WM8731_I2C is not set -# CONFIG_SND_SOC_WM8731_SPI is not set -# CONFIG_SND_SOC_WM8737 is not set -# CONFIG_SND_SOC_WM8741 is not set -# CONFIG_SND_SOC_WM8750 is not set -# CONFIG_SND_SOC_WM8753 is not set -# CONFIG_SND_SOC_WM8770 is not set -# CONFIG_SND_SOC_WM8776 is not set -# CONFIG_SND_SOC_WM8782 is not set -# CONFIG_SND_SOC_WM8804_I2C is not set -# CONFIG_SND_SOC_WM8804_SPI is not set -# CONFIG_SND_SOC_WM8903 is not set -# CONFIG_SND_SOC_WM8904 is not set -# CONFIG_SND_SOC_WM8940 is not set -# CONFIG_SND_SOC_WM8960 is not set -# CONFIG_SND_SOC_WM8961 is not set -# CONFIG_SND_SOC_WM8962 is not set -# CONFIG_SND_SOC_WM8974 is not set -# CONFIG_SND_SOC_WM8978 is not set -# CONFIG_SND_SOC_WM8985 is not set -# CONFIG_SND_SOC_MT6351 is not set -# CONFIG_SND_SOC_MT6357 is not set -# CONFIG_SND_SOC_MT6358 is not set -# CONFIG_SND_SOC_MT6660 is not set -# CONFIG_SND_SOC_NAU8315 is not set -# CONFIG_SND_SOC_NAU8540 is not set -# CONFIG_SND_SOC_NAU8810 is not set -# CONFIG_SND_SOC_NAU8821 is not set -# CONFIG_SND_SOC_NAU8822 is not set -# CONFIG_SND_SOC_NAU8824 is not set -# CONFIG_SND_SOC_TPA6130A2 is not set -# CONFIG_SND_SOC_LPASS_WSA_MACRO is not set -# CONFIG_SND_SOC_LPASS_VA_MACRO is not set -# CONFIG_SND_SOC_LPASS_RX_MACRO is not set -# CONFIG_SND_SOC_LPASS_TX_MACRO is not set -# end of CODEC drivers - -# CONFIG_SND_SIMPLE_CARD is not set -CONFIG_SND_X86=y -# CONFIG_HDMI_LPE_AUDIO is not set -CONFIG_SND_VIRTIO=m -CONFIG_AC97_BUS=m -CONFIG_HID_SUPPORT=y -CONFIG_HID=m -CONFIG_HID_BATTERY_STRENGTH=y -CONFIG_HIDRAW=y -CONFIG_UHID=m -CONFIG_HID_GENERIC=m - -# -# Special HID drivers -# -CONFIG_HID_A4TECH=m -# CONFIG_HID_ACCUTOUCH is not set -# CONFIG_HID_ACRUX is not set -# CONFIG_HID_APPLE is not set -# CONFIG_HID_APPLEIR is not set -# CONFIG_HID_ASUS is not set -# CONFIG_HID_AUREAL is not set -# CONFIG_HID_BELKIN is not set -# CONFIG_HID_BETOP_FF is not set -# CONFIG_HID_BIGBEN_FF is not set -# CONFIG_HID_CHERRY is not set -CONFIG_HID_CHICONY=m -# CONFIG_HID_CORSAIR is not set -# CONFIG_HID_COUGAR is not set -# CONFIG_HID_MACALLY is not set -# CONFIG_HID_PRODIKEYS is not set -# CONFIG_HID_CMEDIA is not set -# CONFIG_HID_CREATIVE_SB0540 is not set -CONFIG_HID_CYPRESS=m -# CONFIG_HID_DRAGONRISE is not set -# CONFIG_HID_EMS_FF is not set -# CONFIG_HID_ELAN is not set -# CONFIG_HID_ELECOM is not set -# CONFIG_HID_ELO is not set -# CONFIG_HID_EVISION is not set -# CONFIG_HID_EZKEY is not set -# CONFIG_HID_FT260 is not set -# CONFIG_HID_GEMBIRD is not set -# CONFIG_HID_GFRM is not set -# CONFIG_HID_GLORIOUS is not set -# CONFIG_HID_HOLTEK is not set -# CONFIG_HID_GOODIX_SPI is not set -# CONFIG_HID_GOOGLE_STADIA_FF is not set -# CONFIG_HID_VIVALDI is not set -# CONFIG_HID_GT683R is not set -# CONFIG_HID_KEYTOUCH is not set -CONFIG_HID_KYE=m -# CONFIG_HID_UCLOGIC is not set -# CONFIG_HID_WALTOP is not set -# CONFIG_HID_VIEWSONIC is not set -# CONFIG_HID_VRC2 is not set -CONFIG_HID_XIAOMI=m -# CONFIG_HID_GYRATION is not set -# CONFIG_HID_ICADE is not set -# CONFIG_HID_ITE is not set -# CONFIG_HID_JABRA is not set -# CONFIG_HID_TWINHAN is not set -# CONFIG_HID_KENSINGTON is not set -# CONFIG_HID_LCPOWER is not set -# CONFIG_HID_LED is not set -CONFIG_HID_LENOVO=m -# CONFIG_HID_LETSKETCH is not set -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -# CONFIG_LOGITECH_FF is not set -# CONFIG_LOGIRUMBLEPAD2_FF is not set -# CONFIG_LOGIG940_FF is not set -# CONFIG_LOGIWHEELS_FF is not set -# CONFIG_HID_MAGICMOUSE is not set -# CONFIG_HID_MALTRON is not set -# CONFIG_HID_MAYFLASH is not set -# CONFIG_HID_MEGAWORLD_FF is not set -# CONFIG_HID_REDRAGON is not set -CONFIG_HID_MICROSOFT=m -# CONFIG_HID_MONTEREY is not set -CONFIG_HID_MULTITOUCH=m -# CONFIG_HID_NINTENDO is not set -# CONFIG_HID_NTI is not set -# CONFIG_HID_NTRIG is not set -# CONFIG_HID_ORTEK is not set -# CONFIG_HID_PANTHERLORD is not set -# CONFIG_HID_PENMOUNT is not set -# CONFIG_HID_PETALYNX is not set -# CONFIG_HID_PICOLCD is not set -# CONFIG_HID_PLANTRONICS is not set -# CONFIG_HID_PXRC is not set -# CONFIG_HID_RAZER is not set -# CONFIG_HID_PRIMAX is not set -# CONFIG_HID_RETRODE is not set -# CONFIG_HID_ROCCAT is not set -# CONFIG_HID_SAITEK is not set -# CONFIG_HID_SAMSUNG is not set -# CONFIG_HID_SEMITEK is not set -# CONFIG_HID_SIGMAMICRO is not set -# CONFIG_HID_SONY is not set -# CONFIG_HID_SPEEDLINK is not set -# CONFIG_HID_STEAM is not set -# CONFIG_HID_STEELSERIES is not set -# CONFIG_HID_SUNPLUS is not set -CONFIG_HID_RMI=m -# CONFIG_HID_GREENASIA is not set -# CONFIG_HID_SMARTJOYPLUS is not set -# CONFIG_HID_TIVO is not set -# CONFIG_HID_TOPSEED is not set -# CONFIG_HID_TOPRE is not set -# CONFIG_HID_THINGM is not set -# CONFIG_HID_THRUSTMASTER is not set -# CONFIG_HID_UDRAW_PS3 is not set -CONFIG_HID_U2FZERO=m -# CONFIG_HID_WACOM is not set -# CONFIG_HID_WIIMOTE is not set -# CONFIG_HID_WINWING is not set -# CONFIG_HID_XINMO is not set -# CONFIG_HID_ZEROPLUS is not set -# CONFIG_HID_ZYDACRON is not set -CONFIG_HID_SENSOR_HUB=m -CONFIG_HID_SENSOR_CUSTOM_SENSOR=m -# CONFIG_HID_ALPS is not set -# CONFIG_HID_MCP2221 is not set -# end of Special HID drivers - -# -# HID-BPF support -# -CONFIG_HID_BPF=y -# end of HID-BPF support - -# -# USB HID support -# -CONFIG_USB_HID=m -# CONFIG_HID_PID is not set -CONFIG_USB_HIDDEV=y -# end of USB HID support - -CONFIG_I2C_HID=m -CONFIG_I2C_HID_ACPI=m -CONFIG_I2C_HID_OF=m -CONFIG_I2C_HID_CORE=m - -# -# Intel ISH HID support -# -CONFIG_INTEL_ISH_HID=m -CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m -# end of Intel ISH HID support - -# -# AMD SFH HID Support -# -CONFIG_AMD_SFH_HID=m -# end of AMD SFH HID Support - -CONFIG_USB_OHCI_LITTLE_ENDIAN=y -CONFIG_USB_SUPPORT=y -CONFIG_USB_COMMON=m -# CONFIG_USB_LED_TRIG is not set -# CONFIG_USB_ULPI_BUS is not set -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB=m -CONFIG_USB_PCI=y -CONFIG_USB_PCI_AMD=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y - -# -# Miscellaneous USB options -# -CONFIG_USB_DEFAULT_PERSIST=y -# CONFIG_USB_FEW_INIT_RETRIES is not set -# CONFIG_USB_DYNAMIC_MINORS is not set -CONFIG_USB_OTG=y -# CONFIG_USB_OTG_PRODUCTLIST is not set -# CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB is not set -CONFIG_USB_OTG_FSM=m -# CONFIG_USB_LEDS_TRIGGER_USBPORT is not set -CONFIG_USB_AUTOSUSPEND_DELAY=2 -CONFIG_USB_DEFAULT_AUTHORIZATION_MODE=1 -# CONFIG_USB_MON is not set - -# -# USB Host Controller Drivers -# -# CONFIG_USB_C67X00_HCD is not set -CONFIG_USB_XHCI_HCD=m -# CONFIG_USB_XHCI_DBGCAP is not set -CONFIG_USB_XHCI_PCI=m -# CONFIG_USB_XHCI_PCI_RENESAS is not set -CONFIG_USB_XHCI_PLATFORM=m -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y -CONFIG_USB_EHCI_PCI=m -CONFIG_USB_EHCI_FSL=m -CONFIG_USB_EHCI_HCD_PLATFORM=m -# CONFIG_USB_OXU210HP_HCD is not set -# CONFIG_USB_ISP116X_HCD is not set -# CONFIG_USB_MAX3421_HCD is not set -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_OHCI_HCD_PCI=m -# CONFIG_USB_OHCI_HCD_SSB is not set -# CONFIG_USB_OHCI_HCD_PLATFORM is not set -CONFIG_USB_UHCI_HCD=m -# CONFIG_USB_SL811_HCD is not set -# CONFIG_USB_R8A66597_HCD is not set -# CONFIG_USB_HCD_SSB is not set -# CONFIG_USB_HCD_TEST_MODE is not set - -# -# USB Device Class drivers -# -CONFIG_USB_ACM=m -# CONFIG_USB_PRINTER is not set -CONFIG_USB_WDM=m -CONFIG_USB_TMC=m - -# -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may -# - -# -# also be needed; see USB_STORAGE Help for more info -# -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -# CONFIG_USB_STORAGE_REALTEK is not set -# CONFIG_USB_STORAGE_DATAFAB is not set -# CONFIG_USB_STORAGE_FREECOM is not set -# CONFIG_USB_STORAGE_ISD200 is not set -# CONFIG_USB_STORAGE_USBAT is not set -# CONFIG_USB_STORAGE_SDDR09 is not set -# CONFIG_USB_STORAGE_SDDR55 is not set -# CONFIG_USB_STORAGE_JUMPSHOT is not set -# CONFIG_USB_STORAGE_ALAUDA is not set -# CONFIG_USB_STORAGE_ONETOUCH is not set -# CONFIG_USB_STORAGE_KARMA is not set -# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set -# CONFIG_USB_STORAGE_ENE_UB6250 is not set -CONFIG_USB_UAS=m - -# -# USB Imaging devices -# -# CONFIG_USB_MDC800 is not set -# CONFIG_USB_MICROTEK is not set -# CONFIG_USBIP_CORE is not set - -# -# USB dual-mode controller drivers -# -# CONFIG_USB_CDNS_SUPPORT is not set -# CONFIG_USB_MUSB_HDRC is not set -# CONFIG_USB_DWC3 is not set -# CONFIG_USB_DWC2 is not set -# CONFIG_USB_CHIPIDEA is not set -# CONFIG_USB_ISP1760 is not set - -# -# USB port drivers -# -CONFIG_USB_SERIAL=m -CONFIG_USB_SERIAL_GENERIC=y -# CONFIG_USB_SERIAL_SIMPLE is not set -# CONFIG_USB_SERIAL_AIRCABLE is not set -CONFIG_USB_SERIAL_ARK3116=m -# CONFIG_USB_SERIAL_BELKIN is not set -CONFIG_USB_SERIAL_CH341=m -# CONFIG_USB_SERIAL_WHITEHEAT is not set -# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set -CONFIG_USB_SERIAL_CP210X=m -# CONFIG_USB_SERIAL_CYPRESS_M8 is not set -# CONFIG_USB_SERIAL_EMPEG is not set -# CONFIG_USB_SERIAL_FTDI_SIO is not set -# CONFIG_USB_SERIAL_VISOR is not set -# CONFIG_USB_SERIAL_IPAQ is not set -# CONFIG_USB_SERIAL_IR is not set -# CONFIG_USB_SERIAL_EDGEPORT is not set -# CONFIG_USB_SERIAL_EDGEPORT_TI is not set -# CONFIG_USB_SERIAL_F81232 is not set -# CONFIG_USB_SERIAL_F8153X is not set -# CONFIG_USB_SERIAL_GARMIN is not set -# CONFIG_USB_SERIAL_IPW is not set -# CONFIG_USB_SERIAL_IUU is not set -# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set -# CONFIG_USB_SERIAL_KEYSPAN is not set -# CONFIG_USB_SERIAL_KLSI is not set -# CONFIG_USB_SERIAL_KOBIL_SCT is not set -# CONFIG_USB_SERIAL_MCT_U232 is not set -# CONFIG_USB_SERIAL_METRO is not set -# CONFIG_USB_SERIAL_MOS7720 is not set -# CONFIG_USB_SERIAL_MOS7840 is not set -# CONFIG_USB_SERIAL_MXUPORT is not set -# CONFIG_USB_SERIAL_NAVMAN is not set -# CONFIG_USB_SERIAL_PL2303 is not set -# CONFIG_USB_SERIAL_OTI6858 is not set -# CONFIG_USB_SERIAL_QCAUX is not set -# CONFIG_USB_SERIAL_QUALCOMM is not set -# CONFIG_USB_SERIAL_SPCP8X5 is not set -# CONFIG_USB_SERIAL_SAFE is not set -# CONFIG_USB_SERIAL_SIERRAWIRELESS is not set -# CONFIG_USB_SERIAL_SYMBOL is not set -# CONFIG_USB_SERIAL_TI is not set -# CONFIG_USB_SERIAL_CYBERJACK is not set -CONFIG_USB_SERIAL_WWAN=m -CONFIG_USB_SERIAL_OPTION=m -# CONFIG_USB_SERIAL_OMNINET is not set -# CONFIG_USB_SERIAL_OPTICON is not set -# CONFIG_USB_SERIAL_XSENS_MT is not set -# CONFIG_USB_SERIAL_WISHBONE is not set -# CONFIG_USB_SERIAL_SSU100 is not set -# CONFIG_USB_SERIAL_QT2 is not set -# CONFIG_USB_SERIAL_UPD78F0730 is not set -# CONFIG_USB_SERIAL_XR is not set -# CONFIG_USB_SERIAL_DEBUG is not set - -# -# USB Miscellaneous drivers -# -# CONFIG_USB_EMI62 is not set -# CONFIG_USB_EMI26 is not set -# CONFIG_USB_ADUTUX is not set -# CONFIG_USB_SEVSEG is not set -# CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_LCD is not set -# CONFIG_USB_CYPRESS_CY7C63 is not set -# CONFIG_USB_CYTHERM is not set -# CONFIG_USB_IDMOUSE is not set -# CONFIG_USB_APPLEDISPLAY is not set -# CONFIG_APPLE_MFI_FASTCHARGE is not set -# CONFIG_USB_LJCA is not set -# CONFIG_USB_SISUSBVGA is not set -# CONFIG_USB_LD is not set -# CONFIG_USB_TRANCEVIBRATOR is not set -# CONFIG_USB_IOWARRIOR is not set -# CONFIG_USB_TEST is not set -# CONFIG_USB_EHSET_TEST_FIXTURE is not set -# CONFIG_USB_ISIGHTFW is not set -# CONFIG_USB_YUREX is not set -# CONFIG_USB_EZUSB_FX2 is not set -# CONFIG_USB_HUB_USB251XB is not set -# CONFIG_USB_HSIC_USB3503 is not set -# CONFIG_USB_HSIC_USB4604 is not set -# CONFIG_USB_LINK_LAYER_TEST is not set -# CONFIG_USB_CHAOSKEY is not set -# CONFIG_USB_ATM is not set - -# -# USB Physical Layer drivers -# -CONFIG_USB_PHY=y -# CONFIG_NOP_USB_XCEIV is not set -# CONFIG_USB_ISP1301 is not set -# end of USB Physical Layer drivers - -# CONFIG_USB_GADGET is not set -CONFIG_TYPEC=m -CONFIG_TYPEC_TCPM=m -CONFIG_TYPEC_TCPCI=m -CONFIG_TYPEC_RT1711H=m -CONFIG_TYPEC_TCPCI_MAXIM=m -# CONFIG_TYPEC_FUSB302 is not set -CONFIG_TYPEC_UCSI=m -CONFIG_UCSI_CCG=m -CONFIG_UCSI_ACPI=m -# CONFIG_UCSI_STM32G0 is not set -# CONFIG_TYPEC_TPS6598X is not set -# CONFIG_TYPEC_ANX7411 is not set -# CONFIG_TYPEC_RT1719 is not set -# CONFIG_TYPEC_HD3SS3220 is not set -# CONFIG_TYPEC_STUSB160X is not set -# CONFIG_TYPEC_WUSB3801 is not set - -# -# USB Type-C Multiplexer/DeMultiplexer Switch support -# -# CONFIG_TYPEC_MUX_FSA4480 is not set -# CONFIG_TYPEC_MUX_GPIO_SBU is not set -# CONFIG_TYPEC_MUX_PI3USB30532 is not set -# CONFIG_TYPEC_MUX_INTEL_PMC is not set -# CONFIG_TYPEC_MUX_IT5205 is not set -# CONFIG_TYPEC_MUX_NB7VPQ904M is not set -# CONFIG_TYPEC_MUX_PTN36502 is not set -# CONFIG_TYPEC_MUX_WCD939X_USBSS is not set -# end of USB Type-C Multiplexer/DeMultiplexer Switch support - -# -# USB Type-C Alternate Mode drivers -# -CONFIG_TYPEC_DP_ALTMODE=m -# CONFIG_TYPEC_NVIDIA_ALTMODE is not set -# end of USB Type-C Alternate Mode drivers - -CONFIG_USB_ROLE_SWITCH=m -# CONFIG_USB_ROLES_INTEL_XHCI is not set -CONFIG_MMC=y -CONFIG_MMC_BLOCK=m -CONFIG_MMC_BLOCK_MINORS=8 -# CONFIG_SDIO_UART is not set -CONFIG_MMC_TEST=m -# CONFIG_MMC_CRYPTO is not set - -# -# MMC/SD/SDIO Host Controller Drivers -# -# CONFIG_MMC_DEBUG is not set -CONFIG_MMC_SDHCI=m -CONFIG_MMC_SDHCI_IO_ACCESSORS=y -CONFIG_MMC_SDHCI_PCI=m -CONFIG_MMC_RICOH_MMC=y -CONFIG_MMC_SDHCI_ACPI=m -CONFIG_MMC_SDHCI_PLTFM=m -# CONFIG_MMC_SDHCI_F_SDH30 is not set -# CONFIG_MMC_WBSD is not set -# CONFIG_MMC_TIFM_SD is not set -# CONFIG_MMC_SPI is not set -# CONFIG_MMC_CB710 is not set -# CONFIG_MMC_VIA_SDMMC is not set -# CONFIG_MMC_VUB300 is not set -CONFIG_MMC_USHC=m -# CONFIG_MMC_USDHI6ROL0 is not set -CONFIG_MMC_CQHCI=m -CONFIG_MMC_HSQ=m -# CONFIG_MMC_TOSHIBA_PCI is not set -# CONFIG_MMC_MTK is not set -# CONFIG_MMC_SDHCI_XENON is not set -# CONFIG_SCSI_UFSHCD is not set -# CONFIG_MEMSTICK is not set -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m -# CONFIG_LEDS_CLASS_FLASH is not set -# CONFIG_LEDS_CLASS_MULTICOLOR is not set -# CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is not set - -# -# LED drivers -# -# CONFIG_LEDS_APU is not set -# CONFIG_LEDS_AW200XX is not set -# CONFIG_LEDS_LM3530 is not set -# CONFIG_LEDS_LM3532 is not set -# CONFIG_LEDS_LM3642 is not set -# CONFIG_LEDS_PCA9532 is not set -# CONFIG_LEDS_LP3944 is not set -# CONFIG_LEDS_PCA955X is not set -# CONFIG_LEDS_PCA963X is not set -# CONFIG_LEDS_PCA995X is not set -# CONFIG_LEDS_DAC124S085 is not set -# CONFIG_LEDS_PWM is not set -# CONFIG_LEDS_BD2606MVV is not set -# CONFIG_LEDS_BD2802 is not set -# CONFIG_LEDS_INTEL_SS4200 is not set -# CONFIG_LEDS_TCA6507 is not set -# CONFIG_LEDS_TLC591XX is not set -# CONFIG_LEDS_LM355x is not set -# CONFIG_LEDS_IS31FL319X is not set - -# -# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) -# -# CONFIG_LEDS_BLINKM is not set -# CONFIG_LEDS_MLXCPLD is not set -# CONFIG_LEDS_MLXREG is not set -# CONFIG_LEDS_USER is not set -# CONFIG_LEDS_NIC78BX is not set -# CONFIG_LEDS_SPI_BYTE is not set - -# -# Flash and Torch LED drivers -# - -# -# RGB LED drivers -# - -# -# LED Triggers -# -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -# CONFIG_LEDS_TRIGGER_ONESHOT is not set -# CONFIG_LEDS_TRIGGER_DISK is not set -CONFIG_LEDS_TRIGGER_HEARTBEAT=m -# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set -# CONFIG_LEDS_TRIGGER_CPU is not set -# CONFIG_LEDS_TRIGGER_ACTIVITY is not set -# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set - -# -# iptables trigger is under Netfilter config (LED target) -# -# CONFIG_LEDS_TRIGGER_TRANSIENT is not set -# CONFIG_LEDS_TRIGGER_CAMERA is not set -# CONFIG_LEDS_TRIGGER_PANIC is not set -# CONFIG_LEDS_TRIGGER_NETDEV is not set -# CONFIG_LEDS_TRIGGER_PATTERN is not set -# CONFIG_LEDS_TRIGGER_TTY is not set -# CONFIG_LEDS_TRIGGER_INPUT_EVENTS is not set - -# -# Simple LED drivers -# -# CONFIG_ACCESSIBILITY is not set -# CONFIG_INFINIBAND is not set -CONFIG_EDAC_ATOMIC_SCRUB=y -CONFIG_EDAC_SUPPORT=y -CONFIG_EDAC=y -# CONFIG_EDAC_LEGACY_SYSFS is not set -# CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m -# CONFIG_EDAC_GHES is not set -CONFIG_EDAC_AMD64=m -# CONFIG_EDAC_E752X is not set -# CONFIG_EDAC_I82975X is not set -# CONFIG_EDAC_I3000 is not set -# CONFIG_EDAC_I3200 is not set -CONFIG_EDAC_IE31200=m -# CONFIG_EDAC_X38 is not set -# CONFIG_EDAC_I5400 is not set -CONFIG_EDAC_I7CORE=m -# CONFIG_EDAC_I5100 is not set -# CONFIG_EDAC_I7300 is not set -CONFIG_EDAC_SBRIDGE=m -# CONFIG_EDAC_SKX is not set -# CONFIG_EDAC_I10NM is not set -# CONFIG_EDAC_PND2 is not set -# CONFIG_EDAC_IGEN6 is not set -CONFIG_RTC_LIB=y -CONFIG_RTC_MC146818_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_HCTOSYS=y -CONFIG_RTC_HCTOSYS_DEVICE="rtc0" -CONFIG_RTC_SYSTOHC=y -CONFIG_RTC_SYSTOHC_DEVICE="rtc0" -# CONFIG_RTC_DEBUG is not set -# CONFIG_RTC_NVMEM is not set - -# -# RTC interfaces -# -CONFIG_RTC_INTF_SYSFS=y -CONFIG_RTC_INTF_PROC=y -CONFIG_RTC_INTF_DEV=y -# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set -# CONFIG_RTC_DRV_TEST is not set - -# -# I2C RTC drivers -# -# CONFIG_RTC_DRV_ABB5ZES3 is not set -# CONFIG_RTC_DRV_ABEOZ9 is not set -# CONFIG_RTC_DRV_ABX80X is not set -# CONFIG_RTC_DRV_DS1307 is not set -# CONFIG_RTC_DRV_DS1374 is not set -# CONFIG_RTC_DRV_DS1672 is not set -# CONFIG_RTC_DRV_MAX6900 is not set -# CONFIG_RTC_DRV_MAX31335 is not set -# CONFIG_RTC_DRV_RS5C372 is not set -# CONFIG_RTC_DRV_ISL1208 is not set -# CONFIG_RTC_DRV_ISL12022 is not set -# CONFIG_RTC_DRV_X1205 is not set -# CONFIG_RTC_DRV_PCF8523 is not set -# CONFIG_RTC_DRV_PCF85063 is not set -# CONFIG_RTC_DRV_PCF85363 is not set -# CONFIG_RTC_DRV_PCF8563 is not set -# CONFIG_RTC_DRV_PCF8583 is not set -# CONFIG_RTC_DRV_M41T80 is not set -# CONFIG_RTC_DRV_BQ32K is not set -# CONFIG_RTC_DRV_S35390A is not set -# CONFIG_RTC_DRV_FM3130 is not set -# CONFIG_RTC_DRV_RX8010 is not set -# CONFIG_RTC_DRV_RX8111 is not set -# CONFIG_RTC_DRV_RX8581 is not set -# CONFIG_RTC_DRV_RX8025 is not set -# CONFIG_RTC_DRV_EM3027 is not set -# CONFIG_RTC_DRV_RV3028 is not set -# CONFIG_RTC_DRV_RV3032 is not set -# CONFIG_RTC_DRV_RV8803 is not set -# CONFIG_RTC_DRV_SD2405AL is not set -# CONFIG_RTC_DRV_SD3078 is not set - -# -# SPI RTC drivers -# -# CONFIG_RTC_DRV_M41T93 is not set -# CONFIG_RTC_DRV_M41T94 is not set -# CONFIG_RTC_DRV_DS1302 is not set -# CONFIG_RTC_DRV_DS1305 is not set -# CONFIG_RTC_DRV_DS1343 is not set -# CONFIG_RTC_DRV_DS1347 is not set -# CONFIG_RTC_DRV_DS1390 is not set -# CONFIG_RTC_DRV_MAX6916 is not set -# CONFIG_RTC_DRV_R9701 is not set -# CONFIG_RTC_DRV_RX4581 is not set -# CONFIG_RTC_DRV_RS5C348 is not set -# CONFIG_RTC_DRV_MAX6902 is not set -# CONFIG_RTC_DRV_PCF2123 is not set -# CONFIG_RTC_DRV_MCP795 is not set -CONFIG_RTC_I2C_AND_SPI=y - -# -# SPI and I2C RTC drivers -# -# CONFIG_RTC_DRV_DS3232 is not set -# CONFIG_RTC_DRV_PCF2127 is not set -# CONFIG_RTC_DRV_RV3029C2 is not set -# CONFIG_RTC_DRV_RX6110 is not set - -# -# Platform RTC drivers -# -CONFIG_RTC_DRV_CMOS=y -# CONFIG_RTC_DRV_DS1286 is not set -# CONFIG_RTC_DRV_DS1511 is not set -# CONFIG_RTC_DRV_DS1553 is not set -# CONFIG_RTC_DRV_DS1685_FAMILY is not set -# CONFIG_RTC_DRV_DS1742 is not set -# CONFIG_RTC_DRV_DS2404 is not set -# CONFIG_RTC_DRV_STK17TA8 is not set -# CONFIG_RTC_DRV_M48T86 is not set -# CONFIG_RTC_DRV_M48T35 is not set -# CONFIG_RTC_DRV_M48T59 is not set -# CONFIG_RTC_DRV_MSM6242 is not set -# CONFIG_RTC_DRV_RP5C01 is not set - -# -# on-CPU RTC drivers -# -# CONFIG_RTC_DRV_FTRTC010 is not set - -# -# HID Sensor RTC drivers -# -# CONFIG_RTC_DRV_GOLDFISH is not set -CONFIG_DMADEVICES=y -# CONFIG_DMADEVICES_DEBUG is not set - -# -# DMA Devices -# -CONFIG_DMA_ENGINE=y -CONFIG_DMA_VIRTUAL_CHANNELS=y -CONFIG_DMA_ACPI=y -# CONFIG_ALTERA_MSGDMA is not set -CONFIG_INTEL_IDMA64=m -# CONFIG_INTEL_IDXD is not set -# CONFIG_INTEL_IDXD_COMPAT is not set -CONFIG_INTEL_IOATDMA=m -# CONFIG_PLX_DMA is not set -# CONFIG_XILINX_DMA is not set -# CONFIG_XILINX_XDMA is not set -# CONFIG_AMD_QDMA is not set -CONFIG_AMD_PTDMA=m -# CONFIG_QCOM_HIDMA_MGMT is not set -# CONFIG_QCOM_HIDMA is not set -CONFIG_DW_DMAC_CORE=y -# CONFIG_DW_DMAC is not set -CONFIG_DW_DMAC_PCI=y -# CONFIG_DW_EDMA is not set -CONFIG_HSU_DMA=y -# CONFIG_SF_PDMA is not set -# CONFIG_INTEL_LDMA is not set - -# -# DMA Clients -# -CONFIG_ASYNC_TX_DMA=y -# CONFIG_DMATEST is not set -CONFIG_DMA_ENGINE_RAID=y - -# -# DMABUF options -# -CONFIG_SYNC_FILE=y -# CONFIG_SW_SYNC is not set -CONFIG_UDMABUF=y -CONFIG_DMABUF_MOVE_NOTIFY=y -# CONFIG_DMABUF_DEBUG is not set -# CONFIG_DMABUF_SELFTESTS is not set -# CONFIG_DMABUF_HEAPS is not set -# CONFIG_DMABUF_SYSFS_STATS is not set -# end of DMABUF options - -CONFIG_DCA=m -# CONFIG_UIO is not set -CONFIG_VFIO=m -CONFIG_VFIO_GROUP=y -CONFIG_VFIO_CONTAINER=y -CONFIG_VFIO_IOMMU_TYPE1=m -# CONFIG_VFIO_NOIOMMU is not set -CONFIG_VFIO_VIRQFD=y -# CONFIG_VFIO_DEBUGFS is not set - -# -# VFIO support for PCI devices -# -CONFIG_VFIO_PCI_CORE=m -CONFIG_VFIO_PCI_MMAP=y -CONFIG_VFIO_PCI_INTX=y -CONFIG_VFIO_PCI=m -CONFIG_VFIO_PCI_VGA=y -CONFIG_VFIO_PCI_IGD=y -CONFIG_VIRTIO_VFIO_PCI=m -CONFIG_QAT_VFIO_PCI=m -# end of VFIO support for PCI devices - -CONFIG_VFIO_MDEV=m -CONFIG_IRQ_BYPASS_MANAGER=y -CONFIG_VIRT_DRIVERS=y -# CONFIG_VMGENID is not set -# CONFIG_VBOXGUEST is not set -# CONFIG_NITRO_ENCLAVES is not set -CONFIG_EFI_SECRET=m -CONFIG_VIRTIO_ANCHOR=y -CONFIG_VIRTIO=y -CONFIG_VIRTIO_PCI_LIB=y -CONFIG_VIRTIO_PCI_LIB_LEGACY=y -CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_PCI_ADMIN_LEGACY=y -CONFIG_VIRTIO_PCI_LEGACY=y -CONFIG_VIRTIO_VDPA=m -CONFIG_VIRTIO_PMEM=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_MEM=m -CONFIG_VIRTIO_INPUT=m -CONFIG_VIRTIO_MMIO=m -# CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES is not set -CONFIG_VIRTIO_DMA_SHARED_BUFFER=m -# CONFIG_VIRTIO_DEBUG is not set -CONFIG_VDPA=m -# CONFIG_VDPA_SIM is not set -CONFIG_IFCVF=m -# CONFIG_MLX5_VDPA_STEERING_DEBUG is not set -CONFIG_VP_VDPA=m -# CONFIG_ALIBABA_ENI_VDPA is not set -CONFIG_SNET_VDPA=m -CONFIG_OCTEONEP_VDPA=m -CONFIG_VHOST_IOTLB=m -CONFIG_VHOST_TASK=y -CONFIG_VHOST=m -CONFIG_VHOST_MENU=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_VDPA=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set - -# -# Microsoft Hyper-V guest support -# -# CONFIG_HYPERV is not set -# end of Microsoft Hyper-V guest support - -# CONFIG_GREYBUS is not set -# CONFIG_COMEDI is not set -# CONFIG_STAGING is not set -# CONFIG_GOLDFISH is not set -# CONFIG_CHROME_PLATFORMS is not set -# CONFIG_CZNIC_PLATFORMS is not set -# CONFIG_MELLANOX_PLATFORM is not set -# CONFIG_SURFACE_PLATFORMS is not set -CONFIG_X86_PLATFORM_DEVICES=y -CONFIG_ACPI_WMI=m -CONFIG_WMI_BMOF=m -# CONFIG_HUAWEI_WMI is not set -CONFIG_MXM_WMI=m -# CONFIG_NVIDIA_WMI_EC_BACKLIGHT is not set -# CONFIG_XIAOMI_WMI is not set -# CONFIG_GIGABYTE_WMI is not set -CONFIG_YOGABOOK=m -# CONFIG_ACERHDF is not set -# CONFIG_ACER_WIRELESS is not set -# CONFIG_ACER_WMI is not set -# CONFIG_AMD_PMC is not set -# CONFIG_AMD_HSMP is not set -CONFIG_AMD_WBRF=y -# CONFIG_ADV_SWBUTTON is not set -# CONFIG_APPLE_GMUX is not set -# CONFIG_ASUS_LAPTOP is not set -# CONFIG_ASUS_WIRELESS is not set -# CONFIG_ASUS_WMI is not set -# CONFIG_EEEPC_LAPTOP is not set -# CONFIG_X86_PLATFORM_DRIVERS_DELL is not set -# CONFIG_AMILO_RFKILL is not set -# CONFIG_FUJITSU_LAPTOP is not set -# CONFIG_FUJITSU_TABLET is not set -# CONFIG_GPD_POCKET_FAN is not set -# CONFIG_X86_PLATFORM_DRIVERS_HP is not set -CONFIG_WIRELESS_HOTKEY=m -# CONFIG_IBM_RTL is not set -CONFIG_IDEAPAD_LAPTOP=m -CONFIG_LENOVO_YMC=m -CONFIG_SENSORS_HDAPS=m -CONFIG_THINKPAD_ACPI=m -CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y -# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set -# CONFIG_THINKPAD_ACPI_DEBUG is not set -# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set -CONFIG_THINKPAD_ACPI_VIDEO=y -CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y -CONFIG_THINKPAD_LMI=m -# CONFIG_INTEL_ATOMISP2_PM is not set -# CONFIG_INTEL_IFS is not set -# CONFIG_INTEL_SAR_INT1092 is not set -# CONFIG_INTEL_PMT_TELEMETRY is not set -# CONFIG_INTEL_PMT_CRASHLOG is not set - -# -# Intel Speed Select Technology interface support -# -CONFIG_INTEL_SPEED_SELECT_TPMI=m -CONFIG_INTEL_SPEED_SELECT_INTERFACE=m -# end of Intel Speed Select Technology interface support - -CONFIG_INTEL_WMI=y -# CONFIG_INTEL_WMI_SBL_FW_UPDATE is not set -CONFIG_INTEL_WMI_THUNDERBOLT=m - -# -# Intel Uncore Frequency Control -# -CONFIG_INTEL_UNCORE_FREQ_CONTROL_TPMI=m -CONFIG_INTEL_UNCORE_FREQ_CONTROL=m -# end of Intel Uncore Frequency Control - -CONFIG_INTEL_HID_EVENT=m -CONFIG_INTEL_VBTN=m -CONFIG_INTEL_OAKTRAIL=m -CONFIG_INTEL_ISHTP_ECLITE=m -# CONFIG_INTEL_PUNIT_IPC is not set -CONFIG_INTEL_RST=m -# CONFIG_INTEL_SDSI is not set -CONFIG_INTEL_SMARTCONNECT=m -CONFIG_INTEL_TPMI_POWER_DOMAINS=m -CONFIG_INTEL_TPMI=m -# CONFIG_INTEL_PLR_TPMI is not set -CONFIG_INTEL_TURBO_MAX_3=y -CONFIG_INTEL_VSEC=m -# CONFIG_ACPI_QUICKSTART is not set -# CONFIG_MSI_EC is not set -# CONFIG_MSI_LAPTOP is not set -# CONFIG_MSI_WMI is not set -# CONFIG_MSI_WMI_PLATFORM is not set -# CONFIG_SAMSUNG_LAPTOP is not set -# CONFIG_SAMSUNG_Q10 is not set -# CONFIG_TOSHIBA_BT_RFKILL is not set -# CONFIG_TOSHIBA_HAPS is not set -# CONFIG_TOSHIBA_WMI is not set -# CONFIG_ACPI_CMPC is not set -# CONFIG_COMPAL_LAPTOP is not set -# CONFIG_LG_LAPTOP is not set -# CONFIG_PANASONIC_LAPTOP is not set -# CONFIG_SONY_LAPTOP is not set -# CONFIG_SYSTEM76_ACPI is not set -# CONFIG_TOPSTAR_LAPTOP is not set -# CONFIG_SERIAL_MULTI_INSTANTIATE is not set -# CONFIG_MLX_PLATFORM is not set -# CONFIG_INSPUR_PLATFORM_PROFILE is not set -# CONFIG_LENOVO_WMI_CAMERA is not set -CONFIG_FW_ATTR_CLASS=m -CONFIG_INTEL_IPS=m -CONFIG_INTEL_SCU_IPC=y -CONFIG_INTEL_SCU=y -CONFIG_INTEL_SCU_PCI=y -CONFIG_INTEL_SCU_PLATFORM=m -CONFIG_INTEL_SCU_IPC_UTIL=m -# CONFIG_SIEMENS_SIMATIC_IPC is not set -# CONFIG_WINMATE_FM07_KEYS is not set -CONFIG_P2SB=y -CONFIG_HAVE_CLK=y -CONFIG_HAVE_CLK_PREPARE=y -CONFIG_COMMON_CLK=y -# CONFIG_LMK04832 is not set -# CONFIG_COMMON_CLK_MAX9485 is not set -# CONFIG_COMMON_CLK_SI5341 is not set -# CONFIG_COMMON_CLK_SI5351 is not set -# CONFIG_COMMON_CLK_SI544 is not set -# CONFIG_COMMON_CLK_CDCE706 is not set -# CONFIG_COMMON_CLK_CS2000_CP is not set -# CONFIG_COMMON_CLK_PWM is not set -# CONFIG_XILINX_VCU is not set -CONFIG_HWSPINLOCK=y - -# -# Clock Source drivers -# -CONFIG_CLKEVT_I8253=y -CONFIG_I8253_LOCK=y -CONFIG_CLKBLD_I8253=y -# end of Clock Source drivers - -CONFIG_MAILBOX=y -CONFIG_PCC=y -# CONFIG_ALTERA_MBOX is not set -CONFIG_IOMMU_IOVA=y -CONFIG_IOMMU_API=y -CONFIG_IOMMU_SUPPORT=y - -# -# Generic IOMMU Pagetable Support -# -CONFIG_IOMMU_IO_PGTABLE=y -# end of Generic IOMMU Pagetable Support - -# CONFIG_IOMMU_DEBUGFS is not set -CONFIG_IOMMU_DEFAULT_DMA_STRICT=y -# CONFIG_IOMMU_DEFAULT_DMA_LAZY is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set -CONFIG_IOMMU_DMA=y -CONFIG_IOMMU_SVA=y -CONFIG_IOMMU_IOPF=y -CONFIG_AMD_IOMMU=y -CONFIG_DMAR_TABLE=y -CONFIG_INTEL_IOMMU=y -CONFIG_INTEL_IOMMU_SVM=y -CONFIG_INTEL_IOMMU_DEFAULT_ON=y -CONFIG_INTEL_IOMMU_FLOPPY_WA=y -CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y -CONFIG_INTEL_IOMMU_PERF_EVENTS=y -# CONFIG_IOMMUFD is not set -CONFIG_IRQ_REMAP=y -CONFIG_VIRTIO_IOMMU=m - -# -# Remoteproc drivers -# -# CONFIG_REMOTEPROC is not set -# end of Remoteproc drivers - -# -# Rpmsg drivers -# -# CONFIG_RPMSG_QCOM_GLINK_RPM is not set -# CONFIG_RPMSG_VIRTIO is not set -# end of Rpmsg drivers - -# CONFIG_SOUNDWIRE is not set - -# -# SOC (System On Chip) specific Drivers -# - -# -# Amlogic SoC drivers -# -# end of Amlogic SoC drivers - -# -# Broadcom SoC drivers -# -# end of Broadcom SoC drivers - -# -# NXP/Freescale QorIQ SoC drivers -# -# end of NXP/Freescale QorIQ SoC drivers - -# -# fujitsu SoC drivers -# -# end of fujitsu SoC drivers - -# -# i.MX SoC drivers -# -# end of i.MX SoC drivers - -# -# Enable LiteX SoC Builder specific drivers -# -# end of Enable LiteX SoC Builder specific drivers - -# CONFIG_WPCM450_SOC is not set - -# -# Qualcomm SoC drivers -# -# CONFIG_QCOM_PBS is not set -# end of Qualcomm SoC drivers - -# CONFIG_SOC_TI is not set - -# -# Xilinx SoC drivers -# -# end of Xilinx SoC drivers -# end of SOC (System On Chip) specific Drivers - -# -# PM Domains -# - -# -# Amlogic PM Domains -# -# end of Amlogic PM Domains - -# -# Broadcom PM Domains -# -# end of Broadcom PM Domains - -# -# i.MX PM Domains -# -# end of i.MX PM Domains - -# -# Qualcomm PM Domains -# -# end of Qualcomm PM Domains -# end of PM Domains - -CONFIG_PM_DEVFREQ=y - -# -# DEVFREQ Governors -# -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y -CONFIG_DEVFREQ_GOV_PERFORMANCE=m -CONFIG_DEVFREQ_GOV_POWERSAVE=m -CONFIG_DEVFREQ_GOV_USERSPACE=m -CONFIG_DEVFREQ_GOV_PASSIVE=m - -# -# DEVFREQ Drivers -# -# CONFIG_PM_DEVFREQ_EVENT is not set -CONFIG_EXTCON=y - -# -# Extcon Device Drivers -# -# CONFIG_EXTCON_FSA9480 is not set -# CONFIG_EXTCON_LC824206XA is not set -# CONFIG_EXTCON_RT8973A is not set -# CONFIG_EXTCON_SM5502 is not set -# CONFIG_EXTCON_USBC_TUSB320 is not set -CONFIG_MEMORY=y -# CONFIG_IIO is not set -# CONFIG_NTB is not set -CONFIG_PWM=y -# CONFIG_PWM_DEBUG is not set -CONFIG_PWM_CLK=m -# CONFIG_PWM_DWC is not set -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -# CONFIG_PWM_PCA9685 is not set - -# -# IRQ chip support -# -# CONFIG_LAN966X_OIC is not set -# end of IRQ chip support - -# CONFIG_IPACK_BUS is not set -# CONFIG_RESET_CONTROLLER is not set - -# -# PHY Subsystem -# -# CONFIG_GENERIC_PHY is not set -# CONFIG_USB_LGM_PHY is not set -# CONFIG_PHY_CAN_TRANSCEIVER is not set - -# -# PHY drivers for Broadcom platforms -# -# CONFIG_BCM_KONA_USB2_PHY is not set -# end of PHY drivers for Broadcom platforms - -# CONFIG_PHY_PXA_28NM_HSIC is not set -# CONFIG_PHY_PXA_28NM_USB2 is not set -# CONFIG_PHY_INTEL_LGM_EMMC is not set -# end of PHY Subsystem - -CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL_CORE=m -CONFIG_INTEL_RAPL=m -CONFIG_INTEL_RAPL_TPMI=m -# CONFIG_IDLE_INJECT is not set -# CONFIG_MCB is not set - -# -# Performance monitor support -# -# CONFIG_DWC_PCIE_PMU is not set -# end of Performance monitor support - -CONFIG_RAS=y -# CONFIG_RAS_CEC is not set -CONFIG_AMD_ATL=m -CONFIG_AMD_ATL_PRM=y -CONFIG_RAS_FMPM=m -CONFIG_USB4=m -# CONFIG_USB4_DEBUGFS_WRITE is not set -# CONFIG_USB4_DMA_TEST is not set - -# -# Android -# -# CONFIG_ANDROID_BINDER_IPC is not set -# end of Android - -CONFIG_LIBNVDIMM=y -CONFIG_BLK_DEV_PMEM=y -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=y -CONFIG_BTT=y -CONFIG_ND_PFN=y -CONFIG_NVDIMM_PFN=y -CONFIG_NVDIMM_DAX=y -CONFIG_NVDIMM_KEYS=y -# CONFIG_NVDIMM_SECURITY_TEST is not set -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_DEV_DAX_PMEM=m -CONFIG_DEV_DAX_KMEM=m -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y -# CONFIG_NVMEM_LAYOUTS is not set -# CONFIG_NVMEM_RMEM is not set -CONFIG_NVMEM_SPMI_SDAM=m - -# -# HW tracing support -# -# CONFIG_STM is not set -# CONFIG_INTEL_TH is not set -# end of HW tracing support - -# CONFIG_FPGA is not set -# CONFIG_TEE is not set -CONFIG_PM_OPP=y -# CONFIG_SIOX is not set -# CONFIG_SLIMBUS is not set -# CONFIG_INTERCONNECT is not set -CONFIG_COUNTER=m -# CONFIG_INTEL_QEP is not set -# CONFIG_MOST is not set -# CONFIG_PECI is not set -CONFIG_HTE=y -# end of Device Drivers - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_VALIDATE_FS_PARSER=y -CONFIG_FS_IOMAP=y -CONFIG_FS_STACK=y -CONFIG_BUFFER_HEAD=y -CONFIG_LEGACY_DIRECT_IO=y -CONFIG_EXT2_FS=m -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=m -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=y -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=y -# CONFIG_REISERFS_FS is not set -# CONFIG_JFS_FS is not set -CONFIG_XFS_FS=m -# CONFIG_XFS_SUPPORT_V4 is not set -# CONFIG_XFS_SUPPORT_ASCII_CI is not set -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -# CONFIG_XFS_ONLINE_SCRUB is not set -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -# CONFIG_GFS2_FS is not set -# CONFIG_OCFS2_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is not set -# CONFIG_BTRFS_FS_REF_VERIFY is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_F2FS_FS_COMPRESSION=y -CONFIG_F2FS_FS_LZO=y -CONFIG_F2FS_FS_LZORLE=y -CONFIG_F2FS_FS_LZ4=y -CONFIG_F2FS_FS_LZ4HC=y -CONFIG_F2FS_FS_ZSTD=y -CONFIG_F2FS_IOSTAT=y -# CONFIG_F2FS_UNFAIR_RWSEM is not set -CONFIG_BCACHEFS_FS=m -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_ERASURE_CODING=y -CONFIG_BCACHEFS_POSIX_ACL=y -# CONFIG_BCACHEFS_DEBUG is not set -# CONFIG_BCACHEFS_TESTS is not set -CONFIG_BCACHEFS_LOCK_TIME_STATS=y -# CONFIG_BCACHEFS_NO_LATENCY_ACCT is not set -CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN=y -# CONFIG_BCACHEFS_PATH_TRACEPOINTS is not set -CONFIG_ZONEFS_FS=m -CONFIG_FS_DAX=y -CONFIG_FS_DAX_PMD=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -CONFIG_FS_ENCRYPTION=y -CONFIG_FS_ENCRYPTION_ALGS=y -CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y -CONFIG_FS_VERITY=y -CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QUOTA_DEBUG=y -CONFIG_QUOTA_TREE=m -# CONFIG_QFMT_V1 is not set -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m -# CONFIG_CUSE is not set -CONFIG_VIRTIO_FS=m -CONFIG_FUSE_DAX=y -CONFIG_FUSE_PASSTHROUGH=y -CONFIG_OVERLAY_FS=m -CONFIG_OVERLAY_FS_REDIRECT_DIR=y -# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set -CONFIG_OVERLAY_FS_INDEX=y -CONFIG_OVERLAY_FS_XINO_AUTO=y -CONFIG_OVERLAY_FS_METACOPY=y -# CONFIG_OVERLAY_FS_DEBUG is not set -CONFIG_OVERLAY_FS_UNPRIVILEGED=y - -# -# Caches -# -CONFIG_NETFS_SUPPORT=m -CONFIG_NETFS_STATS=y -# CONFIG_NETFS_DEBUG is not set -CONFIG_FSCACHE=y -CONFIG_FSCACHE_STATS=y -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_ERROR_INJECTION is not set -CONFIG_CACHEFILES_ONDEMAND=y -# end of Caches - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -# end of CD-ROM/DVD Filesystems - -# -# DOS/FAT/EXFAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_FAT_DEFAULT_UTF8=y -CONFIG_EXFAT_FS=m -CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -CONFIG_NTFS3_FS=m -# CONFIG_NTFS3_64BIT_CLUSTER is not set -CONFIG_NTFS3_LZX_XPRESS=y -CONFIG_NTFS3_FS_POSIX_ACL=y -# CONFIG_NTFS_FS is not set -# end of DOS/FAT/EXFAT/NT Filesystems - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -# CONFIG_PROC_KCORE is not set -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_PROC_CHILDREN=y -CONFIG_PROC_PID_ARCH_STATUS=y -CONFIG_PROC_CPU_RESCTRL=y -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -CONFIG_TMPFS_INODE64=y -CONFIG_TMPFS_QUOTA=y -CONFIG_HUGETLBFS=y -# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set -CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y -CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=y -CONFIG_EFIVAR_FS=m -# end of Pseudo filesystems - -CONFIG_MISC_FILESYSTEMS=y -# CONFIG_ORANGEFS_FS is not set -# CONFIG_ADFS_FS is not set -# CONFIG_AFFS_FS is not set -CONFIG_ECRYPT_FS=m -# CONFIG_ECRYPT_FS_MESSAGING is not set -# CONFIG_HFS_FS is not set -# CONFIG_HFSPLUS_FS is not set -# CONFIG_BEFS_FS is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -# CONFIG_CRAMFS is not set -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -CONFIG_SQUASHFS_DECOMP_SINGLE=y -CONFIG_SQUASHFS_DECOMP_MULTI=y -CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y -CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y -CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS=y -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -CONFIG_SQUASHFS_4K_DEVBLK_SIZE=y -CONFIG_SQUASHFS_EMBEDDED=y -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -# CONFIG_VXFS_FS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_OMFS_FS is not set -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX6FS_FS is not set -# CONFIG_ROMFS_FS is not set -CONFIG_PSTORE=y -CONFIG_PSTORE_DEFAULT_KMSG_BYTES=10240 -CONFIG_PSTORE_COMPRESS=y -# CONFIG_PSTORE_CONSOLE is not set -# CONFIG_PSTORE_PMSG is not set -# CONFIG_PSTORE_FTRACE is not set -CONFIG_PSTORE_RAM=m -CONFIG_PSTORE_ZONE=y -CONFIG_PSTORE_BLK=y -CONFIG_PSTORE_BLK_BLKDEV="" -CONFIG_PSTORE_BLK_KMSG_SIZE=64 -CONFIG_PSTORE_BLK_MAX_REASON=2 -# CONFIG_SYSV_FS is not set -# CONFIG_UFS_FS is not set -# CONFIG_EROFS_FS is not set -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -# CONFIG_NFS_V2 is not set -# CONFIG_NFS_V3 is not set -CONFIG_NFS_V4=m -# CONFIG_NFS_SWAP is not set -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="millerson.name" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -CONFIG_NFS_DISABLE_UDP_SUPPORT=y -CONFIG_NFS_V4_2_READ_PLUS=y -CONFIG_NFSD=m -# CONFIG_NFSD_V2 is not set -# CONFIG_NFSD_V3_ACL is not set -CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -# CONFIG_NFSD_SCSILAYOUT is not set -# CONFIG_NFSD_FLEXFILELAYOUT is not set -CONFIG_NFSD_V4_2_INTER_SSC=y -CONFIG_NFSD_V4_SECURITY_LABEL=y -# CONFIG_NFSD_LEGACY_CLIENT_TRACKING is not set -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_COMMON=y -CONFIG_NFS_COMMON_LOCALIO_SUPPORT=m -CONFIG_NFS_LOCALIO=y -CONFIG_NFS_V4_2_SSC_HELPER=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1=y -# CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA is not set -# CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2 is not set -# CONFIG_SUNRPC_DEBUG is not set -# CONFIG_CEPH_FS is not set -CONFIG_CIFS=m -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y -CONFIG_CIFS_UPCALL=y -# CONFIG_CIFS_XATTR is not set -# CONFIG_CIFS_DEBUG is not set -CONFIG_CIFS_DFS_UPCALL=y -CONFIG_CIFS_SWN_UPCALL=y -CONFIG_CIFS_FSCACHE=y -# CONFIG_CIFS_COMPRESSION is not set -# CONFIG_SMB_SERVER is not set -CONFIG_SMBFS=m -# CONFIG_CODA_FS is not set -# CONFIG_AFS_FS is not set -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -# CONFIG_NLS_CODEPAGE_737 is not set -# CONFIG_NLS_CODEPAGE_775 is not set -# CONFIG_NLS_CODEPAGE_850 is not set -# CONFIG_NLS_CODEPAGE_852 is not set -# CONFIG_NLS_CODEPAGE_855 is not set -# CONFIG_NLS_CODEPAGE_857 is not set -# CONFIG_NLS_CODEPAGE_860 is not set -# CONFIG_NLS_CODEPAGE_861 is not set -# CONFIG_NLS_CODEPAGE_862 is not set -# CONFIG_NLS_CODEPAGE_863 is not set -# CONFIG_NLS_CODEPAGE_864 is not set -# CONFIG_NLS_CODEPAGE_865 is not set -CONFIG_NLS_CODEPAGE_866=y -# CONFIG_NLS_CODEPAGE_869 is not set -# CONFIG_NLS_CODEPAGE_936 is not set -# CONFIG_NLS_CODEPAGE_950 is not set -# CONFIG_NLS_CODEPAGE_932 is not set -# CONFIG_NLS_CODEPAGE_949 is not set -# CONFIG_NLS_CODEPAGE_874 is not set -# CONFIG_NLS_ISO8859_8 is not set -# CONFIG_NLS_CODEPAGE_1250 is not set -# CONFIG_NLS_CODEPAGE_1251 is not set -# CONFIG_NLS_ASCII is not set -CONFIG_NLS_ISO8859_1=y -# CONFIG_NLS_ISO8859_2 is not set -# CONFIG_NLS_ISO8859_3 is not set -# CONFIG_NLS_ISO8859_4 is not set -# CONFIG_NLS_ISO8859_5 is not set -# CONFIG_NLS_ISO8859_6 is not set -# CONFIG_NLS_ISO8859_7 is not set -# CONFIG_NLS_ISO8859_9 is not set -# CONFIG_NLS_ISO8859_13 is not set -# CONFIG_NLS_ISO8859_14 is not set -# CONFIG_NLS_ISO8859_15 is not set -CONFIG_NLS_KOI8_R=y -# CONFIG_NLS_KOI8_U is not set -# CONFIG_NLS_MAC_ROMAN is not set -# CONFIG_NLS_MAC_CELTIC is not set -# CONFIG_NLS_MAC_CENTEURO is not set -# CONFIG_NLS_MAC_CROATIAN is not set -# CONFIG_NLS_MAC_CYRILLIC is not set -# CONFIG_NLS_MAC_GAELIC is not set -# CONFIG_NLS_MAC_GREEK is not set -# CONFIG_NLS_MAC_ICELAND is not set -# CONFIG_NLS_MAC_INUIT is not set -# CONFIG_NLS_MAC_ROMANIAN is not set -# CONFIG_NLS_MAC_TURKISH is not set -CONFIG_NLS_UTF8=y -CONFIG_NLS_UCS2_UTILS=m -# CONFIG_DLM is not set -CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set -CONFIG_IO_WQ=y -# end of File systems - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_REQUEST_CACHE=y -CONFIG_PERSISTENT_KEYRINGS=y -# CONFIG_TRUSTED_KEYS is not set -CONFIG_ENCRYPTED_KEYS=y -# CONFIG_USER_DECRYPTED_DATA is not set -# CONFIG_KEY_DH_OPERATIONS is not set -CONFIG_KEY_NOTIFICATIONS=y -CONFIG_SECURITY_DMESG_RESTRICT=y -# CONFIG_PROC_MEM_ALWAYS_FORCE is not set -CONFIG_PROC_MEM_FORCE_PTRACE=y -# CONFIG_PROC_MEM_NO_FORCE is not set -CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y -CONFIG_SECURITY_TIOCSTI_RESTRICT=y -CONFIG_SECURITY=y -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y -# CONFIG_SECURITY_NETWORK_XFRM is not set -CONFIG_SECURITY_PATH=y -# CONFIG_INTEL_TXT is not set -CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y -# CONFIG_STATIC_USERMODEHELPER is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -# CONFIG_SECURITY_SELINUX_DEVELOP is not set -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 -CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 -# CONFIG_SECURITY_SELINUX_DEBUG is not set -# CONFIG_SECURITY_SMACK is not set -# CONFIG_SECURITY_TOMOYO is not set -CONFIG_SECURITY_APPARMOR=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -CONFIG_SECURITY_APPARMOR_INTROSPECT_POLICY=y -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -CONFIG_SECURITY_APPARMOR_EXPORT_BINARY=y -CONFIG_SECURITY_APPARMOR_PARANOID_LOAD=y -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_SAFESETID=y -CONFIG_SECURITY_LOCKDOWN_LSM=y -CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y -CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y -# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set -# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set -CONFIG_SECURITY_LANDLOCK=y -# CONFIG_SECURITY_IPE is not set -CONFIG_INTEGRITY=y -CONFIG_INTEGRITY_SIGNATURE=y -CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y -CONFIG_INTEGRITY_TRUSTED_KEYRING=y -CONFIG_INTEGRITY_AUDIT=y -# CONFIG_IMA is not set -# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set -# CONFIG_EVM is not set -# CONFIG_DEFAULT_SECURITY_SELINUX is not set -# CONFIG_DEFAULT_SECURITY_APPARMOR is not set -CONFIG_DEFAULT_SECURITY_DAC=y -CONFIG_LSM="lockdown,yama,integrity,selinux,bpf,landlock" - -# -# Kernel hardening options -# - -# -# Memory initialization -# -CONFIG_CC_HAS_AUTO_VAR_INIT_PATTERN=y -CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO_BARE=y -CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO=y -# CONFIG_INIT_STACK_NONE is not set -# CONFIG_INIT_STACK_ALL_PATTERN is not set -CONFIG_INIT_STACK_ALL_ZERO=y -CONFIG_GCC_PLUGIN_STACKLEAK=y -CONFIG_STACKLEAK_TRACK_MIN_SIZE=100 -# CONFIG_STACKLEAK_METRICS is not set -# CONFIG_STACKLEAK_RUNTIME_DISABLE is not set -CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y -CONFIG_INIT_ON_FREE_DEFAULT_ON=y -CONFIG_CC_HAS_ZERO_CALL_USED_REGS=y -CONFIG_ZERO_CALL_USED_REGS=y -CONFIG_PAGE_SANITIZE_VERIFY=y -CONFIG_SLAB_SANITIZE_VERIFY=y -# end of Memory initialization - -# -# Hardening of kernel data structures -# -CONFIG_LIST_HARDENED=y -CONFIG_BUG_ON_DATA_CORRUPTION=y -# end of Hardening of kernel data structures - -# CONFIG_RANDSTRUCT_NONE is not set -CONFIG_RANDSTRUCT_FULL=y -# CONFIG_RANDSTRUCT_PERFORMANCE is not set -CONFIG_RANDSTRUCT=y -CONFIG_GCC_PLUGIN_RANDSTRUCT=y -# end of Kernel hardening options -# end of Security options - -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_SIG=y -CONFIG_CRYPTO_SIG2=y -CONFIG_CRYPTO_SKCIPHER=y -CONFIG_CRYPTO_SKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=y -CONFIG_CRYPTO_RNG2=y -CONFIG_CRYPTO_RNG_DEFAULT=y -CONFIG_CRYPTO_AKCIPHER2=y -CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=m -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -# CONFIG_CRYPTO_USER is not set -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=y -CONFIG_CRYPTO_CRYPTD=y -CONFIG_CRYPTO_AUTHENC=y -# CONFIG_CRYPTO_TEST is not set -CONFIG_CRYPTO_SIMD=y -CONFIG_CRYPTO_ENGINE=m -# end of Crypto core or helper - -# -# Public-key cryptography -# -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_DH_RFC7919_GROUPS=y -CONFIG_CRYPTO_ECC=y -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_ECDSA=y -CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -# end of Public-key cryptography - -# -# Block ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_ARIA=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m -CONFIG_CRYPTO_SM4_GENERIC=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -# end of Block ciphers - -# -# Length-preserving ciphers and modes -# -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_CHACHA20=m -CONFIG_CRYPTO_CBC=y -CONFIG_CRYPTO_CTR=y -CONFIG_CRYPTO_CTS=y -CONFIG_CRYPTO_ECB=y -CONFIG_CRYPTO_HCTR2=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XCTR=m -CONFIG_CRYPTO_XTS=y -CONFIG_CRYPTO_NHPOLY1305=m -# end of Length-preserving ciphers and modes - -# -# AEAD (authenticated encryption with associated data) ciphers -# -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=m -CONFIG_CRYPTO_GENIV=m -CONFIG_CRYPTO_SEQIV=m -CONFIG_CRYPTO_ECHAINIV=m -CONFIG_CRYPTO_ESSIV=m -# end of AEAD (authenticated encryption with associated data) ciphers - -# -# Hashes, digests, and MACs -# -CONFIG_CRYPTO_BLAKE2B=y -CONFIG_CRYPTO_CMAC=m -CONFIG_CRYPTO_GHASH=m -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_POLYVAL=m -CONFIG_CRYPTO_POLY1305=m -# CONFIG_CRYPTO_RMD160 is not set -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=y -CONFIG_CRYPTO_SM3=m -# CONFIG_CRYPTO_SM3_GENERIC is not set -CONFIG_CRYPTO_STREEBOG=m -# CONFIG_CRYPTO_VMAC is not set -# CONFIG_CRYPTO_WP512 is not set -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_XXHASH=y -# end of Hashes, digests, and MACs - -# -# CRCs (cyclic redundancy checks) -# -CONFIG_CRYPTO_CRC32C=y -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRC64_ROCKSOFT=y -# end of CRCs (cyclic redundancy checks) - -# -# Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=y -# CONFIG_CRYPTO_842 is not set -CONFIG_CRYPTO_LZ4=m -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=y -# end of Compression - -# -# Random number generation -# -CONFIG_CRYPTO_ANSI_CPRNG=y -CONFIG_CRYPTO_DRBG_MENU=y -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=y -CONFIG_CRYPTO_JITTERENTROPY=y -CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS=64 -CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE=32 -CONFIG_CRYPTO_JITTERENTROPY_OSR=1 -# end of Random number generation - -# -# Userspace interface -# -CONFIG_CRYPTO_USER_API=y -CONFIG_CRYPTO_USER_API_HASH=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y -CONFIG_CRYPTO_USER_API_RNG=y -# CONFIG_CRYPTO_USER_API_RNG_CAVP is not set -CONFIG_CRYPTO_USER_API_AEAD=m -# CONFIG_CRYPTO_USER_API_ENABLE_OBSOLETE is not set -# end of Userspace interface - -CONFIG_CRYPTO_HASH_INFO=y - -# -# Accelerated Cryptographic Algorithms for CPU (x86) -# -CONFIG_CRYPTO_CURVE25519_X86=m -CONFIG_CRYPTO_AES_NI_INTEL=y -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m -CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_ARIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64=m -CONFIG_CRYPTO_CHACHA20_X86_64=y -CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_AVX2=m -CONFIG_CRYPTO_BLAKE2S_X86=y -CONFIG_CRYPTO_POLYVAL_CLMUL_NI=m -CONFIG_CRYPTO_POLY1305_X86_64=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SM3_AVX_X86_64=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m -CONFIG_CRYPTO_CRC32C_INTEL=y -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -# end of Accelerated Cryptographic Algorithms for CPU (x86) - -CONFIG_CRYPTO_HW=y -# CONFIG_CRYPTO_DEV_PADLOCK is not set -# CONFIG_CRYPTO_DEV_ATMEL_ECC is not set -# CONFIG_CRYPTO_DEV_ATMEL_SHA204A is not set -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m -CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_SP_PSP=y -# CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set -# CONFIG_CRYPTO_DEV_NITROX_CNN55XX is not set -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_4XXX=m -# CONFIG_CRYPTO_DEV_QAT_420XX is not set -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -# CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION is not set -CONFIG_CRYPTO_DEV_VIRTIO=m -# CONFIG_CRYPTO_DEV_SAFEXCEL is not set -# CONFIG_CRYPTO_DEV_AMLOGIC_GXL is not set -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS8_PRIVATE_KEY_PARSER=m -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set -# CONFIG_SIGNED_PE_FILE_VERIFICATION is not set -# CONFIG_FIPS_SIGNATURE_SELFTEST is not set - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -# CONFIG_MODULE_SIG_KEY_TYPE_RSA is not set -CONFIG_MODULE_SIG_KEY_TYPE_ECDSA=y -CONFIG_SYSTEM_TRUSTED_KEYRING=y -CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set -# CONFIG_SECONDARY_TRUSTED_KEYRING is not set -# CONFIG_SYSTEM_BLACKLIST_KEYRING is not set -# end of Certificates for signature checking - -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_RAID6_PQ_BENCHMARK=y -CONFIG_PACKING=y -CONFIG_BITREVERSE=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -# CONFIG_CORDIC is not set -# CONFIG_PRIME_NUMBERS is not set -CONFIG_RATIONAL=y -CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_ARCH_USE_SYM_ANNOTATIONS=y - -# -# Crypto library routines -# -CONFIG_CRYPTO_LIB_UTILS=y -CONFIG_CRYPTO_LIB_AES=y -CONFIG_CRYPTO_LIB_AESCFB=m -CONFIG_CRYPTO_LIB_ARC4=m -CONFIG_CRYPTO_LIB_GF128MUL=y -CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=y -CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=y -CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=y -CONFIG_CRYPTO_LIB_CHACHA_GENERIC=y -CONFIG_CRYPTO_LIB_CHACHA=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_DES=m -CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 -CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=y -CONFIG_CRYPTO_LIB_POLY1305_GENERIC=y -CONFIG_CRYPTO_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRYPTO_LIB_SHA1=y -CONFIG_CRYPTO_LIB_SHA256=y -# end of Crypto library routines - -CONFIG_CRC_CCITT=y -CONFIG_CRC16=y -CONFIG_CRC_T10DIF=y -CONFIG_CRC64_ROCKSOFT=y -CONFIG_CRC_ITU_T=y -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC64=y -# CONFIG_CRC4 is not set -CONFIG_CRC7=m -CONFIG_LIBCRC32C=y -CONFIG_CRC8=m -CONFIG_XXHASH=y -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=m -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=m -CONFIG_ZSTD_COMMON=y -CONFIG_ZSTD_COMPRESS=y -CONFIG_ZSTD_DECOMPRESS=y -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_ARM64=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_RISCV=y -CONFIG_XZ_DEC_MICROLZMA=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_ZSTD=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=m -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_INTERVAL_TREE=y -CONFIG_XARRAY_MULTI=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_CLOSURES=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT=y -CONFIG_HAS_IOPORT_MAP=y -CONFIG_HAS_DMA=y -CONFIG_DMA_OPS_HELPERS=y -CONFIG_NEED_SG_DMA_FLAGS=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_SWIOTLB=y -# CONFIG_SWIOTLB_DYNAMIC is not set -CONFIG_DMA_NEED_SYNC=y -# CONFIG_DMA_API_DEBUG is not set -# CONFIG_DMA_MAP_BENCHMARK is not set -CONFIG_SGL_ALLOC=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -# CONFIG_IRQ_POLL is not set -CONFIG_MPILIB=y -CONFIG_SIGNATURE=y -CONFIG_DIMLIB=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_HAVE_GENERIC_VDSO=y -CONFIG_GENERIC_GETTIMEOFDAY=y -CONFIG_GENERIC_VDSO_TIME_NS=y -CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT=y -CONFIG_VDSO_GETRANDOM=y -CONFIG_FONT_SUPPORT=y -# CONFIG_FONTS is not set -CONFIG_FONT_8x8=y -CONFIG_FONT_8x16=y -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_MEMREGION=y -CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y -CONFIG_ARCH_HAS_COPY_MC=y -CONFIG_ARCH_STACKWALK=y -CONFIG_STACKDEPOT=y -CONFIG_STACKDEPOT_MAX_FRAMES=64 -CONFIG_SBITMAP=y -# CONFIG_LWQ_TEST is not set -# end of Library routines - -CONFIG_FIRMWARE_TABLE=y - -# -# Kernel hacking -# - -# -# printk and dmesg options -# -CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set -# CONFIG_STACKTRACE_BUILD_ID is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=7 -CONFIG_CONSOLE_LOGLEVEL_QUIET=4 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 -# CONFIG_BOOT_PRINTK_DELAY is not set -# CONFIG_DYNAMIC_DEBUG is not set -# CONFIG_DYNAMIC_DEBUG_CORE is not set -CONFIG_SYMBOLIC_ERRNAME=y -CONFIG_DEBUG_BUGVERBOSE=y -# end of printk and dmesg options - -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_MISC=y - -# -# Compile-time checks and compiler options -# -CONFIG_AS_HAS_NON_CONST_ULEB128=y -CONFIG_DEBUG_INFO_NONE=y -# CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT is not set -# CONFIG_DEBUG_INFO_DWARF4 is not set -# CONFIG_DEBUG_INFO_DWARF5 is not set -CONFIG_FRAME_WARN=2048 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_HEADERS_INSTALL is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -# CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE is not set -CONFIG_OBJTOOL=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# end of Compile-time checks and compiler options - -# -# Generic Kernel Debugging Instruments -# -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_FS_ALLOW_ALL=y -# CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set -# CONFIG_DEBUG_FS_ALLOW_NONE is not set -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN=y -# CONFIG_UBSAN is not set -CONFIG_HAVE_ARCH_KCSAN=y -CONFIG_HAVE_KCSAN_COMPILER=y -# CONFIG_KCSAN is not set -# end of Generic Kernel Debugging Instruments - -# -# Networking Debugging -# -# CONFIG_NET_DEV_REFCNT_TRACKER is not set -# CONFIG_NET_NS_REFCNT_TRACKER is not set -# CONFIG_DEBUG_NET is not set -# end of Networking Debugging - -# -# Memory Debugging -# -CONFIG_PAGE_EXTENSION=y -# CONFIG_DEBUG_PAGEALLOC is not set -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_DEBUG_ON is not set -# CONFIG_PAGE_OWNER is not set -# CONFIG_PAGE_TABLE_CHECK is not set -CONFIG_PAGE_POISONING=y -# CONFIG_DEBUG_PAGE_REF is not set -CONFIG_DEBUG_RODATA_TEST=y -CONFIG_ARCH_HAS_DEBUG_WX=y -CONFIG_DEBUG_WX=y -CONFIG_GENERIC_PTDUMP=y -CONFIG_PTDUMP_CORE=y -# CONFIG_PTDUMP_DEBUGFS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_PER_VMA_LOCK_STATS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_SHRINKER_DEBUG is not set -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_VM_PGTABLE is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -CONFIG_DEBUG_VIRTUAL=y -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP=y -# CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is not set -# CONFIG_MEM_ALLOC_PROFILING is not set -CONFIG_HAVE_ARCH_KASAN=y -CONFIG_HAVE_ARCH_KASAN_VMALLOC=y -CONFIG_CC_HAS_KASAN_GENERIC=y -CONFIG_CC_HAS_KASAN_SW_TAGS=y -CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y -# CONFIG_KASAN is not set -CONFIG_HAVE_ARCH_KFENCE=y -CONFIG_KFENCE=y -CONFIG_KFENCE_SAMPLE_INTERVAL=100 -CONFIG_KFENCE_NUM_OBJECTS=255 -CONFIG_KFENCE_DEFERRABLE=y -CONFIG_KFENCE_STRESS_TEST_FAULTS=0 -CONFIG_KFENCE_BUG_ON_DATA_CORRUPTION=y -CONFIG_HAVE_ARCH_KMSAN=y -# end of Memory Debugging - -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Oops, Lockups and Hangs -# -# CONFIG_PANIC_ON_OOPS is not set -CONFIG_PANIC_ON_OOPS_VALUE=0 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_LOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM=y -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set -CONFIG_HAVE_HARDLOCKUP_DETECTOR_BUDDY=y -CONFIG_HARDLOCKUP_DETECTOR=y -# CONFIG_HARDLOCKUP_DETECTOR_PREFER_BUDDY is not set -CONFIG_HARDLOCKUP_DETECTOR_PERF=y -# CONFIG_HARDLOCKUP_DETECTOR_BUDDY is not set -# CONFIG_HARDLOCKUP_DETECTOR_ARCH is not set -CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER=y -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set -CONFIG_WQ_WATCHDOG=y -# CONFIG_WQ_CPU_INTENSIVE_REPORT is not set -# CONFIG_TEST_LOCKUP is not set -# end of Debug Oops, Lockups and Hangs - -# -# Scheduler Debugging -# -# CONFIG_SCHED_DEBUG is not set -CONFIG_SCHED_INFO=y -# CONFIG_SCHEDSTATS is not set -# end of Scheduler Debugging - -# CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_DEBUG_PREEMPT=y - -# -# Lock Debugging (spinlocks, mutexes, etc...) -# -CONFIG_LOCK_DEBUGGING_SUPPORT=y -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -CONFIG_DEBUG_MUTEXES=y -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_RWSEMS is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -# CONFIG_SCF_TORTURE_TEST is not set -# CONFIG_CSD_LOCK_WAIT_DEBUG is not set -# end of Lock Debugging (spinlocks, mutexes, etc...) - -# CONFIG_NMI_CHECK_CPU is not set -# CONFIG_DEBUG_IRQFLAGS is not set -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set - -# -# Debug kernel data structures -# -CONFIG_DEBUG_LIST=y -# CONFIG_DEBUG_PLIST is not set -CONFIG_DEBUG_SG=y -CONFIG_DEBUG_NOTIFIERS=y -# CONFIG_DEBUG_CLOSURES is not set -# CONFIG_DEBUG_MAPLE_TREE is not set -# end of Debug kernel data structures - -# -# RCU Debugging -# -# CONFIG_RCU_SCALE_TEST is not set -# CONFIG_RCU_TORTURE_TEST is not set -# CONFIG_RCU_REF_SCALE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -CONFIG_RCU_EXP_CPU_STALL_TIMEOUT=0 -# CONFIG_RCU_CPU_STALL_CPUTIME is not set -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# end of RCU Debugging - -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -# CONFIG_LATENCYTOP is not set -# CONFIG_DEBUG_CGROUP_REF is not set -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_RETHOOK=y -CONFIG_RETHOOK=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_RETVAL=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_NO_PATCHABLE=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_OBJTOOL_MCOUNT=y -CONFIG_HAVE_OBJTOOL_NOP_MCOUNT=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_HAVE_BUILDTIME_MCOUNT_SORT=y -CONFIG_BUILDTIME_MCOUNT_SORT=y -CONFIG_TRACER_MAX_TRACE=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_TRACING=y -CONFIG_GENERIC_TRACER=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_BOOTTIME_TRACING is not set -CONFIG_FUNCTION_TRACER=y -CONFIG_FUNCTION_GRAPH_TRACER=y -CONFIG_FUNCTION_GRAPH_RETVAL=y -CONFIG_DYNAMIC_FTRACE=y -CONFIG_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_DYNAMIC_FTRACE_WITH_ARGS=y -CONFIG_FPROBE=y -# CONFIG_FUNCTION_PROFILER is not set -# CONFIG_STACK_TRACER is not set -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -# CONFIG_SCHED_TRACER is not set -# CONFIG_HWLAT_TRACER is not set -CONFIG_OSNOISE_TRACER=y -CONFIG_TIMERLAT_TRACER=y -# CONFIG_MMIOTRACE is not set -# CONFIG_FTRACE_SYSCALLS is not set -# CONFIG_TRACER_SNAPSHOT is not set -CONFIG_BRANCH_PROFILE_NONE=y -# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FPROBE_EVENTS=y -CONFIG_KPROBE_EVENTS=y -# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_DYNAMIC_EVENTS=y -CONFIG_PROBE_EVENTS=y -# CONFIG_BPF_KPROBE_OVERRIDE is not set -CONFIG_FTRACE_MCOUNT_RECORD=y -CONFIG_FTRACE_MCOUNT_USE_CC=y -# CONFIG_SYNTH_EVENTS is not set -# CONFIG_USER_EVENTS is not set -# CONFIG_HIST_TRIGGERS is not set -# CONFIG_TRACE_EVENT_INJECT is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_FTRACE_RECORD_RECURSION is not set -# CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING is not set -# CONFIG_FTRACE_STARTUP_TEST is not set -# CONFIG_FTRACE_SORT_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS is not set -# CONFIG_PREEMPTIRQ_DELAY_TEST is not set -# CONFIG_KPROBE_EVENT_GEN_TEST is not set -# CONFIG_RV is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_SAMPLES is not set -CONFIG_HAVE_SAMPLE_FTRACE_DIRECT=y -CONFIG_HAVE_SAMPLE_FTRACE_DIRECT_MULTI=y -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -# CONFIG_STRICT_DEVMEM is not set - -# -# x86 Debugging -# -CONFIG_X86_VERBOSE_BOOTUP=y -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_USB_XDBC is not set -# CONFIG_EFI_PGT_DUMP is not set -# CONFIG_DEBUG_TLBFLUSH is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -# CONFIG_IO_DELAY_0X80 is not set -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -CONFIG_IO_DELAY_NONE=y -# CONFIG_DEBUG_BOOT_PARAMS is not set -# CONFIG_CPA_DEBUG is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -# CONFIG_X86_DEBUG_FPU is not set -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set -# end of x86 Debugging - -# -# Kernel Testing and Coverage -# -# CONFIG_KUNIT is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set -CONFIG_FUNCTION_ERROR_INJECTION=y -# CONFIG_FAULT_INJECTION is not set -CONFIG_ARCH_HAS_KCOV=y -CONFIG_CC_HAS_SANCOV_TRACE_PC=y -# CONFIG_KCOV is not set -CONFIG_RUNTIME_TESTING_MENU=y -# CONFIG_TEST_DHRY is not set -# CONFIG_LKDTM is not set -# CONFIG_TEST_MIN_HEAP is not set -# CONFIG_TEST_DIV64 is not set -# CONFIG_TEST_MULDIV64 is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_TEST_REF_TRACKER is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_REED_SOLOMON_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_SCANF is not set -# CONFIG_TEST_BITMAP is not set -# CONFIG_TEST_UUID is not set -# CONFIG_TEST_XARRAY is not set -# CONFIG_TEST_MAPLE_TREE is not set -# CONFIG_TEST_RHASHTABLE is not set -# CONFIG_TEST_IDA is not set -# CONFIG_TEST_LKM is not set -# CONFIG_TEST_BITOPS is not set -# CONFIG_TEST_VMALLOC is not set -CONFIG_TEST_BPF=m -# CONFIG_TEST_BLACKHOLE_DEV is not set -# CONFIG_FIND_BIT_BENCHMARK is not set -# CONFIG_TEST_FIRMWARE is not set -# CONFIG_TEST_SYSCTL is not set -# CONFIG_TEST_UDELAY is not set -# CONFIG_TEST_STATIC_KEYS is not set -# CONFIG_TEST_KMOD is not set -# CONFIG_TEST_DEBUG_VIRTUAL is not set -# CONFIG_TEST_MEMCAT_P is not set -# CONFIG_TEST_MEMINIT is not set -# CONFIG_TEST_HMM is not set -# CONFIG_TEST_FREE_PAGES is not set -# CONFIG_TEST_FPU is not set -# CONFIG_TEST_CLOCKSOURCE_WATCHDOG is not set -# CONFIG_TEST_OBJPOOL is not set -CONFIG_ARCH_USE_MEMTEST=y -# CONFIG_MEMTEST is not set -# end of Kernel Testing and Coverage - -# -# Rust hacking -# -# end of Rust hacking -# end of Kernel hacking - -# -# Gentoo Linux -# -CONFIG_GENTOO_LINUX=y -CONFIG_GENTOO_LINUX_UDEV=y -CONFIG_GENTOO_LINUX_PORTAGE=y - -# -# Support for init systems, system and service managers -# -CONFIG_GENTOO_LINUX_INIT_SCRIPT=y -CONFIG_GENTOO_LINUX_INIT_SYSTEMD=y -# end of Support for init systems, system and service managers - -CONFIG_GENTOO_KERNEL_SELF_PROTECTION=y -CONFIG_GENTOO_PRINT_FIRMWARE_INFO=y -# end of Gentoo Linux diff --git a/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-upd-from-bcachefs-for-upstream-69a5a13.patch b/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-upd-from-bcachefs-for-upstream-69a5a13.patch deleted file mode 100644 index 4109529..0000000 --- a/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-upd-from-bcachefs-for-upstream-69a5a13.patch +++ /dev/null @@ -1,23844 +0,0 @@ -From 6b415ceff37832db6c4a8bfb64edcae7701c7f8c Mon Sep 17 00:00:00 2001 -From: Alexander Miroshnichenko -Date: Tue, 18 Mar 2025 16:26:30 +0300 -Subject: [PATCH] bcachefs: cherry-pick updates from bcachefs-for-upstream - 69a5a13 -Content-Type: text/plain; charset="utf-8" -Content-Transfer-Encoding: 8bit - -Signed-off-by: Alexander Miroshnichenko ---- - .../filesystems/bcachefs/CodingStyle.rst | 2 +- - .../bcachefs/SubmittingPatches.rst | 98 ++ - Documentation/filesystems/bcachefs/index.rst | 1 + - MAINTAINERS | 1 + - fs/bcachefs/Kconfig | 9 +- - fs/bcachefs/Makefile | 1 + - fs/bcachefs/acl.c | 11 +- - fs/bcachefs/alloc_background.c | 601 +++++++------ - fs/bcachefs/alloc_background.h | 18 +- - fs/bcachefs/alloc_background_format.h | 4 +- - fs/bcachefs/alloc_foreground.c | 312 +++---- - fs/bcachefs/alloc_foreground.h | 4 +- - fs/bcachefs/alloc_types.h | 1 + - fs/bcachefs/backpointers.c | 838 +++++++++++------- - fs/bcachefs/backpointers.h | 97 +- - fs/bcachefs/bbpos.h | 2 +- - fs/bcachefs/bcachefs.h | 70 +- - fs/bcachefs/bcachefs_format.h | 106 ++- - fs/bcachefs/bkey.h | 7 - - fs/bcachefs/bkey_methods.c | 29 +- - fs/bcachefs/bkey_methods.h | 15 +- - fs/bcachefs/bkey_types.h | 28 + - fs/bcachefs/btree_cache.c | 73 +- - fs/bcachefs/btree_cache.h | 14 +- - fs/bcachefs/btree_gc.c | 178 +--- - fs/bcachefs/btree_gc.h | 4 +- - fs/bcachefs/btree_io.c | 229 +++-- - fs/bcachefs/btree_io.h | 6 +- - fs/bcachefs/btree_iter.c | 626 ++++++++----- - fs/bcachefs/btree_iter.h | 148 ++-- - fs/bcachefs/btree_journal_iter.c | 237 ++++- - fs/bcachefs/btree_journal_iter.h | 22 +- - fs/bcachefs/btree_journal_iter_types.h | 36 + - fs/bcachefs/btree_key_cache.c | 76 +- - fs/bcachefs/btree_locking.c | 83 +- - fs/bcachefs/btree_locking.h | 52 +- - fs/bcachefs/btree_node_scan.c | 153 ++-- - fs/bcachefs/btree_node_scan_types.h | 1 - - fs/bcachefs/btree_trans_commit.c | 211 ++--- - fs/bcachefs/btree_types.h | 45 +- - fs/bcachefs/btree_update.c | 70 +- - fs/bcachefs/btree_update.h | 37 +- - fs/bcachefs/btree_update_interior.c | 295 +++--- - fs/bcachefs/btree_update_interior.h | 7 +- - fs/bcachefs/btree_write_buffer.c | 102 ++- - fs/bcachefs/buckets.c | 133 +-- - fs/bcachefs/buckets.h | 30 +- - fs/bcachefs/buckets_types.h | 2 +- - fs/bcachefs/buckets_waiting_for_journal.c | 12 +- - fs/bcachefs/buckets_waiting_for_journal.h | 4 +- - fs/bcachefs/chardev.c | 219 +---- - fs/bcachefs/checksum.c | 10 +- - fs/bcachefs/checksum.h | 2 +- - fs/bcachefs/compress.c | 127 ++- - fs/bcachefs/compress.h | 4 +- - fs/bcachefs/darray.h | 2 +- - fs/bcachefs/data_update.c | 127 +-- - fs/bcachefs/debug.c | 5 +- - fs/bcachefs/dirent.c | 10 +- - fs/bcachefs/dirent.h | 4 +- - fs/bcachefs/disk_accounting.c | 150 ++-- - fs/bcachefs/disk_accounting.h | 75 +- - fs/bcachefs/ec.c | 267 +++--- - fs/bcachefs/ec.h | 5 +- - fs/bcachefs/ec_format.h | 17 + - fs/bcachefs/errcode.h | 21 +- - fs/bcachefs/error.c | 187 ++-- - fs/bcachefs/error.h | 58 +- - fs/bcachefs/extent_update.c | 4 +- - fs/bcachefs/extents.c | 292 ++---- - fs/bcachefs/extents.h | 20 +- - fs/bcachefs/extents_format.h | 15 +- - fs/bcachefs/fs-common.c | 108 ++- - fs/bcachefs/fs-common.h | 2 + - fs/bcachefs/fs-io-buffered.c | 57 +- - fs/bcachefs/fs-io-direct.c | 5 + - fs/bcachefs/fs-io-pagecache.c | 4 +- - fs/bcachefs/fs-io.c | 55 +- - fs/bcachefs/fs-ioctl.c | 7 +- - fs/bcachefs/fs.c | 104 ++- - fs/bcachefs/fs.h | 1 + - fs/bcachefs/fsck.c | 747 ++++++++++------ - fs/bcachefs/fsck.h | 11 + - fs/bcachefs/inode.c | 170 ++-- - fs/bcachefs/inode.h | 45 +- - fs/bcachefs/inode_format.h | 15 +- - fs/bcachefs/io_misc.c | 22 +- - fs/bcachefs/io_read.c | 278 +++--- - fs/bcachefs/io_read.h | 28 +- - fs/bcachefs/io_write.c | 114 ++- - fs/bcachefs/io_write.h | 2 + - fs/bcachefs/journal.c | 323 ++++--- - fs/bcachefs/journal.h | 19 +- - fs/bcachefs/journal_io.c | 223 +++-- - fs/bcachefs/journal_io.h | 2 +- - fs/bcachefs/journal_reclaim.c | 172 +++- - fs/bcachefs/journal_reclaim.h | 3 + - fs/bcachefs/journal_types.h | 22 +- - fs/bcachefs/logged_ops.c | 11 +- - fs/bcachefs/logged_ops_format.h | 5 + - fs/bcachefs/lru.c | 4 +- - fs/bcachefs/lru.h | 2 +- - fs/bcachefs/move.c | 185 ++-- - fs/bcachefs/move.h | 5 +- - fs/bcachefs/movinggc.c | 42 +- - fs/bcachefs/opts.c | 26 +- - fs/bcachefs/opts.h | 47 +- - fs/bcachefs/printbuf.h | 15 +- - fs/bcachefs/quota.c | 2 +- - fs/bcachefs/quota.h | 4 +- - fs/bcachefs/rcu_pending.c | 38 +- - fs/bcachefs/rebalance.c | 264 +++++- - fs/bcachefs/rebalance.h | 30 + - fs/bcachefs/rebalance_format.h | 53 ++ - fs/bcachefs/rebalance_types.h | 2 - - fs/bcachefs/recovery.c | 212 +++-- - fs/bcachefs/recovery.h | 2 +- - fs/bcachefs/recovery_passes.c | 112 ++- - fs/bcachefs/recovery_passes.h | 1 + - fs/bcachefs/recovery_passes_types.h | 92 +- - fs/bcachefs/reflink.c | 496 ++++++++--- - fs/bcachefs/reflink.h | 20 +- - fs/bcachefs/reflink_format.h | 7 +- - fs/bcachefs/sb-clean.c | 6 +- - fs/bcachefs/sb-counters_format.h | 165 ++-- - fs/bcachefs/sb-downgrade.c | 25 +- - fs/bcachefs/sb-errors_format.h | 60 +- - fs/bcachefs/six.c | 32 +- - fs/bcachefs/six.h | 8 +- - fs/bcachefs/snapshot.c | 515 +++++------ - fs/bcachefs/snapshot.h | 17 +- - fs/bcachefs/str_hash.c | 295 ++++++ - fs/bcachefs/str_hash.h | 28 +- - fs/bcachefs/subvolume.c | 73 +- - fs/bcachefs/subvolume.h | 19 +- - fs/bcachefs/subvolume_types.h | 2 +- - fs/bcachefs/super-io.c | 114 ++- - fs/bcachefs/super-io.h | 18 +- - fs/bcachefs/super.c | 76 +- - fs/bcachefs/super.h | 11 +- - fs/bcachefs/sysfs.c | 60 +- - fs/bcachefs/tests.c | 26 +- - fs/bcachefs/trace.h | 117 ++- - fs/bcachefs/util.c | 24 +- - fs/bcachefs/util.h | 34 +- - fs/bcachefs/varint.c | 5 +- - fs/bcachefs/xattr.c | 13 +- - fs/bcachefs/xattr.h | 5 +- - fs/fs_parser.c | 3 +- - include/linux/fs_parser.h | 2 + - include/linux/min_heap.h | 4 +- - 151 files changed, 7816 insertions(+), 4919 deletions(-) - create mode 100644 Documentation/filesystems/bcachefs/SubmittingPatches.rst - create mode 100644 fs/bcachefs/btree_journal_iter_types.h - create mode 100644 fs/bcachefs/rebalance_format.h - create mode 100644 fs/bcachefs/str_hash.c - -diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst -index 01de555e21d8..b29562a6bf55 100644 ---- a/Documentation/filesystems/bcachefs/CodingStyle.rst -+++ b/Documentation/filesystems/bcachefs/CodingStyle.rst -@@ -183,4 +183,4 @@ even better as a code comment. - A good code comment is wonderful, but even better is the comment that didn't - need to exist because the code was so straightforward as to be obvious; - organized into small clean and tidy modules, with clear and descriptive names --for functions and variable, where every line of code has a clear purpose. -+for functions and variables, where every line of code has a clear purpose. -diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst -new file mode 100644 -index 000000000000..026b12ae0d6a ---- /dev/null -+++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst -@@ -0,0 +1,98 @@ -+Submitting patches to bcachefs: -+=============================== -+ -+Patches must be tested before being submitted, either with the xfstests suite -+[0], or the full bcachefs test suite in ktest [1], depending on what's being -+touched. Note that ktest wraps xfstests and will be an easier method to running -+it for most users; it includes single-command wrappers for all the mainstream -+in-kernel local filesystems. -+ -+Patches will undergo more testing after being merged (including -+lockdep/kasan/preempt/etc. variants), these are not generally required to be -+run by the submitter - but do put some thought into what you're changing and -+which tests might be relevant, e.g. are you dealing with tricky memory layout -+work? kasan, are you doing locking work? then lockdep; and ktest includes -+single-command variants for the debug build types you'll most likely need. -+ -+The exception to this rule is incomplete WIP/RFC patches: if you're working on -+something nontrivial, it's encouraged to send out a WIP patch to let people -+know what you're doing and make sure you're on the right track. Just make sure -+it includes a brief note as to what's done and what's incomplete, to avoid -+confusion. -+ -+Rigorous checkpatch.pl adherence is not required (many of its warnings are -+considered out of date), but try not to deviate too much without reason. -+ -+Focus on writing code that reads well and is organized well; code should be -+aesthetically pleasing. -+ -+CI: -+=== -+ -+Instead of running your tests locally, when running the full test suite it's -+prefereable to let a server farm do it in parallel, and then have the results -+in a nice test dashboard (which can tell you which failures are new, and -+presents results in a git log view, avoiding the need for most bisecting). -+ -+That exists [2], and community members may request an account. If you work for -+a big tech company, you'll need to help out with server costs to get access - -+but the CI is not restricted to running bcachefs tests: it runs any ktest test -+(which generally makes it easy to wrap other tests that can run in qemu). -+ -+Other things to think about: -+============================ -+ -+- How will we debug this code? Is there sufficient introspection to diagnose -+ when something starts acting wonky on a user machine? -+ -+ We don't necessarily need every single field of every data structure visible -+ with introspection, but having the important fields of all the core data -+ types wired up makes debugging drastically easier - a bit of thoughtful -+ foresight greatly reduces the need to have people build custom kernels with -+ debug patches. -+ -+ More broadly, think about all the debug tooling that might be needed. -+ -+- Does it make the codebase more or less of a mess? Can we also try to do some -+ organizing, too? -+ -+- Do new tests need to be written? New assertions? How do we know and verify -+ that the code is correct, and what happens if something goes wrong? -+ -+ We don't yet have automated code coverage analysis or easy fault injection - -+ but for now, pretend we did and ask what they might tell us. -+ -+ Assertions are hugely important, given that we don't yet have a systems -+ language that can do ergonomic embedded correctness proofs. Hitting an assert -+ in testing is much better than wandering off into undefined behaviour la-la -+ land - use them. Use them judiciously, and not as a replacement for proper -+ error handling, but use them. -+ -+- Does it need to be performance tested? Should we add new peformance counters? -+ -+ bcachefs has a set of persistent runtime counters which can be viewed with -+ the 'bcachefs fs top' command; this should give users a basic idea of what -+ their filesystem is currently doing. If you're doing a new feature or looking -+ at old code, think if anything should be added. -+ -+- If it's a new on disk format feature - have upgrades and downgrades been -+ tested? (Automated tests exists but aren't in the CI, due to the hassle of -+ disk image management; coordinate to have them run.) -+ -+Mailing list, IRC: -+================== -+ -+Patches should hit the list [3], but much discussion and code review happens on -+IRC as well [4]; many people appreciate the more conversational approach and -+quicker feedback. -+ -+Additionally, we have a lively user community doing excellent QA work, which -+exists primarily on IRC. Please make use of that resource; user feedback is -+important for any nontrivial feature, and documenting it in commit messages -+would be a good idea. -+ -+[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git -+[1]: https://evilpiepirate.org/git/ktest.git/ -+[2]: https://evilpiepirate.org/~testdashboard/ci/ -+[3]: linux-bcachefs@vger.kernel.org -+[4]: irc.oftc.net#bcache, #bcachefs-dev -diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst -index 95fc4b90739e..7db4d7ceab58 100644 ---- a/Documentation/filesystems/bcachefs/index.rst -+++ b/Documentation/filesystems/bcachefs/index.rst -@@ -9,4 +9,5 @@ bcachefs Documentation - :numbered: - - CodingStyle -+ SubmittingPatches - errorcodes -diff --git a/MAINTAINERS b/MAINTAINERS -index de04c7ba8571..e64ff0cd0693 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -3848,6 +3848,7 @@ M: Kent Overstreet - L: linux-bcachefs@vger.kernel.org - S: Supported - C: irc://irc.oftc.net/bcache -+P: Documentation/filesystems/bcachefs/SubmittingPatches.rst - T: git https://evilpiepirate.org/git/bcachefs.git - F: fs/bcachefs/ - F: Documentation/filesystems/bcachefs/ -diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig -index 5bac803ea367..8350b07fe0b4 100644 ---- a/fs/bcachefs/Kconfig -+++ b/fs/bcachefs/Kconfig -@@ -59,6 +59,13 @@ config BCACHEFS_DEBUG - The resulting code will be significantly slower than normal; you - probably shouldn't select this option unless you're a developer. - -+config BCACHEFS_INJECT_TRANSACTION_RESTARTS -+ bool "Randomly inject transaction restarts" -+ depends on BCACHEFS_DEBUG -+ help -+ Randomly inject transaction restarts in a few core paths - may have a -+ significant performance penalty -+ - config BCACHEFS_TESTS - bool "bcachefs unit and performance tests" - depends on BCACHEFS_FS -@@ -89,7 +96,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN - - config BCACHEFS_PATH_TRACEPOINTS - bool "Extra btree_path tracepoints" -- depends on BCACHEFS_FS -+ depends on BCACHEFS_FS && TRACING - help - Enable extra tracepoints for debugging btree_path operations; we don't - normally want these enabled because they happen at very high rates. -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -index 56d20e219f59..d2689388d5e8 100644 ---- a/fs/bcachefs/Makefile -+++ b/fs/bcachefs/Makefile -@@ -82,6 +82,7 @@ bcachefs-y := \ - siphash.o \ - six.o \ - snapshot.o \ -+ str_hash.o \ - subvolume.o \ - super.o \ - super-io.o \ -diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c -index 87f1be9d4db4..99487727ae64 100644 ---- a/fs/bcachefs/acl.c -+++ b/fs/bcachefs/acl.c -@@ -184,11 +184,6 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, - return ERR_PTR(-EINVAL); - } - --#define acl_for_each_entry(acl, acl_e) \ -- for (acl_e = acl->a_entries; \ -- acl_e < acl->a_entries + acl->a_count; \ -- acl_e++) -- - /* - * Convert from in-memory to filesystem representation. - */ -@@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans, - { - struct bkey_i_xattr *xattr; - bch_acl_header *acl_header; -- const struct posix_acl_entry *acl_e; -+ const struct posix_acl_entry *acl_e, *pe; - void *outptr; - unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - -- acl_for_each_entry(acl, acl_e) { -+ FOREACH_ACL_ENTRY(acl_e, acl, pe) { - switch (acl_e->e_tag) { - case ACL_USER: - case ACL_GROUP: -@@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, - - outptr = (void *) acl_header + sizeof(*acl_header); - -- acl_for_each_entry(acl, acl_e) { -+ FOREACH_ACL_ENTRY(acl_e, acl, pe) { - bch_acl_entry *entry = outptr; - - entry->e_tag = cpu_to_le16(acl_e->e_tag); -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -index c84a91572a1d..3ea809990ef1 100644 ---- a/fs/bcachefs/alloc_background.c -+++ b/fs/bcachefs/alloc_background.c -@@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) - } - - int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - int ret = 0; -@@ -213,7 +213,7 @@ int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - } - - int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_alloc_unpacked u; - int ret = 0; -@@ -226,7 +226,7 @@ int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - } - - int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_alloc_unpacked u; - int ret = 0; -@@ -239,7 +239,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - } - - int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bch_alloc_v4 a; - int ret = 0; -@@ -322,9 +322,9 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - void bch2_alloc_v4_swab(struct bkey_s k) - { - struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; -- struct bch_backpointer *bp, *bps; - -- a->journal_seq = swab64(a->journal_seq); -+ a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); -+ a->journal_seq_empty = swab64(a->journal_seq_empty); - a->flags = swab32(a->flags); - a->dirty_sectors = swab32(a->dirty_sectors); - a->cached_sectors = swab32(a->cached_sectors); -@@ -333,13 +333,6 @@ void bch2_alloc_v4_swab(struct bkey_s k) - a->stripe = swab32(a->stripe); - a->nr_external_backpointers = swab32(a->nr_external_backpointers); - a->stripe_sectors = swab32(a->stripe_sectors); -- -- bps = alloc_v4_backpointers(a); -- for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { -- bp->bucket_offset = swab40(bp->bucket_offset); -- bp->bucket_len = swab32(bp->bucket_len); -- bch2_bpos_swab(&bp->pos); -- } - } - - void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -@@ -354,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c - prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); - bch2_prt_data_type(out, a->data_type); - prt_newline(out); -- prt_printf(out, "journal_seq %llu\n", a->journal_seq); -- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); -- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); -- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); -- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); -- prt_printf(out, "cached_sectors %u\n", a->cached_sectors); -- prt_printf(out, "stripe %u\n", a->stripe); -- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); -- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); -- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); -+ prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); -+ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); -+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); -+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); -+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); -+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); -+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors); -+ prt_printf(out, "stripe %u\n", a->stripe); -+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); -+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); -+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); - - if (ca) - prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); -@@ -392,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - *out = (struct bch_alloc_v4) { -- .journal_seq = u.journal_seq, -+ .journal_seq_nonempty = u.journal_seq, - .flags = u.need_discard, - .gen = u.gen, - .oldest_gen = u.oldest_gen, -@@ -517,7 +511,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) - } - - int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -664,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c) - - /* Free space/discard btree: */ - -+static int __need_discard_or_freespace_err(struct btree_trans *trans, -+ struct bkey_s_c alloc_k, -+ bool set, bool discard, bool repair) -+{ -+ struct bch_fs *c = trans->c; -+ enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); -+ enum bch_sb_error_id err_id = discard -+ ? BCH_FSCK_ERR_need_discard_key_wrong -+ : BCH_FSCK_ERR_freespace_key_wrong; -+ enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, alloc_k); -+ -+ int ret = __bch2_fsck_err(NULL, trans, flags, err_id, -+ "bucket incorrectly %sset in %s btree\n" -+ " %s", -+ set ? "" : "un", -+ bch2_btree_id_str(btree), -+ buf.buf); -+ if (ret == -BCH_ERR_fsck_ignore || -+ ret == -BCH_ERR_fsck_errors_not_fixed) -+ ret = 0; -+ -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+#define need_discard_or_freespace_err(...) \ -+ fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) -+ -+#define need_discard_or_freespace_err_on(cond, ...) \ -+ (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) -+ - static int bch2_bucket_do_index(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c alloc_k, - const struct bch_alloc_v4 *a, - bool set) - { -- struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bkey_s_c old; -- struct bkey_i *k; - enum btree_id btree; -- enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; -- enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; -- struct printbuf buf = PRINTBUF; -- int ret; -+ struct bpos pos; - - if (a->data_type != BCH_DATA_free && - a->data_type != BCH_DATA_need_discard) - return 0; - -- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); -- if (IS_ERR(k)) -- return PTR_ERR(k); -- -- bkey_init(&k->k); -- k->k.type = new_type; -- - switch (a->data_type) { - case BCH_DATA_free: - btree = BTREE_ID_freespace; -- k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); -- bch2_key_resize(&k->k, 1); -+ pos = alloc_freespace_pos(alloc_k.k->p, *a); - break; - case BCH_DATA_need_discard: - btree = BTREE_ID_need_discard; -- k->k.p = alloc_k.k->p; -+ pos = alloc_k.k->p; - break; - default: - return 0; - } - -- old = bch2_bkey_get_iter(trans, &iter, btree, -- bkey_start_pos(&k->k), -- BTREE_ITER_intent); -- ret = bkey_err(old); -+ struct btree_iter iter; -+ struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); -+ int ret = bkey_err(old); - if (ret) - return ret; - -- if (ca->mi.freespace_initialized && -- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && -- bch2_trans_inconsistent_on(old.k->type != old_type, trans, -- "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" -- " for %s", -- set ? "setting" : "clearing", -- bch2_btree_id_str(btree), -- iter.pos.inode, -- iter.pos.offset, -- bch2_bkey_types[old.k->type], -- bch2_bkey_types[old_type], -- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -- ret = -EIO; -- goto err; -- } -+ need_discard_or_freespace_err_on(ca->mi.freespace_initialized && -+ !old.k->type != set, -+ trans, alloc_k, set, -+ btree == BTREE_ID_need_discard, false); - -- ret = bch2_trans_update(trans, &iter, k, 0); --err: -+ ret = bch2_btree_bit_mod_iter(trans, &iter, set); -+fsck_err: - bch2_trans_iter_exit(trans, &iter); -- printbuf_exit(&buf); - return ret; - } - -@@ -858,7 +858,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, - if (flags & BTREE_TRIGGER_transactional) { - alloc_data_type_set(new_a, new_a->data_type); - -- if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { -+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - -+ (int) data_type_is_empty(old_a->data_type); -+ -+ if (is_empty_delta < 0) { - new_a->io_time[READ] = bch2_current_io_time(c, READ); - new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); -@@ -928,37 +931,55 @@ int bch2_trigger_alloc(struct btree_trans *trans, - } - - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { -- u64 journal_seq = trans->journal_res.seq; -- u64 bucket_journal_seq = new_a->journal_seq; -+ u64 transaction_seq = trans->journal_res.seq; -+ BUG_ON(!transaction_seq); - -- if ((flags & BTREE_TRIGGER_insert) && -- data_type_is_empty(old_a->data_type) != -- data_type_is_empty(new_a->data_type) && -- new.k->type == KEY_TYPE_alloc_v4) { -- struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; -+ if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, -+ trans, alloc_key_journal_seq_in_future, -+ "bucket journal seq in future (currently at %llu)\n%s", -+ journal_cur_seq(&c->journal), -+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) -+ new_a->journal_seq_nonempty = transaction_seq; - -- /* -- * If the btree updates referring to a bucket weren't flushed -- * before the bucket became empty again, then the we don't have -- * to wait on a journal flush before we can reuse the bucket: -- */ -- v->journal_seq = bucket_journal_seq = -- data_type_is_empty(new_a->data_type) && -- (journal_seq == v->journal_seq || -- bch2_journal_noflush_seq(&c->journal, v->journal_seq)) -- ? 0 : journal_seq; -+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - -+ (int) data_type_is_empty(old_a->data_type); -+ -+ /* -+ * Record journal sequence number of empty -> nonempty transition: -+ * Note that there may be multiple empty -> nonempty -+ * transitions, data in a bucket may be overwritten while we're -+ * still writing to it - so be careful to only record the first: -+ * */ -+ if (is_empty_delta < 0 && -+ new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { -+ new_a->journal_seq_nonempty = transaction_seq; -+ new_a->journal_seq_empty = 0; - } - -- if (!data_type_is_empty(old_a->data_type) && -- data_type_is_empty(new_a->data_type) && -- bucket_journal_seq) { -- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -- c->journal.flushed_seq_ondisk, -- new.k->p.inode, new.k->p.offset, -- bucket_journal_seq); -- if (bch2_fs_fatal_err_on(ret, c, -- "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) -- goto err; -+ /* -+ * Bucket becomes empty: mark it as waiting for a journal flush, -+ * unless updates since empty -> nonempty transition were never -+ * flushed - we may need to ask the journal not to flush -+ * intermediate sequence numbers: -+ */ -+ if (is_empty_delta > 0) { -+ if (new_a->journal_seq_nonempty == transaction_seq || -+ bch2_journal_noflush_seq(&c->journal, -+ new_a->journal_seq_nonempty, -+ transaction_seq)) { -+ new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; -+ } else { -+ new_a->journal_seq_empty = transaction_seq; -+ -+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -+ c->journal.flushed_seq_ondisk, -+ new.k->p.inode, new.k->p.offset, -+ transaction_seq); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "setting bucket_needs_journal_commit: %s", -+ bch2_err_str(ret))) -+ goto err; -+ } - } - - if (new_a->gen != old_a->gen) { -@@ -974,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, - - #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) - #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) --#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) -+#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) - - if (statechange(a->data_type == BCH_DATA_free) && - bucket_flushed(new_a)) -@@ -1006,6 +1027,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, - rcu_read_unlock(); - } - err: -+fsck_err: - printbuf_exit(&buf); - bch2_dev_put(ca); - return ret; -@@ -1045,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos - * btree node min/max is a closed interval, upto takes a half - * open interval: - */ -- k = bch2_btree_iter_peek_upto(&iter2, end); -+ k = bch2_btree_iter_peek_max(&iter2, end); - next = iter2.pos; - bch2_trans_iter_exit(iter->trans, &iter2); - -@@ -1129,7 +1151,6 @@ int bch2_check_alloc_key(struct btree_trans *trans, - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; -- unsigned discard_key_type, freespace_key_type; - unsigned gens_offset; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; -@@ -1149,64 +1170,30 @@ int bch2_check_alloc_key(struct btree_trans *trans, - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - -- discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; - bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(discard_iter); - ret = bkey_err(k); - if (ret) - goto err; - -- if (fsck_err_on(k.k->type != discard_key_type, -- trans, need_discard_key_wrong, -- "incorrect key in need_discard btree (got %s should be %s)\n" -- " %s", -- bch2_bkey_types[k.k->type], -- bch2_bkey_types[discard_key_type], -- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -- struct bkey_i *update = -- bch2_trans_kmalloc(trans, sizeof(*update)); -- -- ret = PTR_ERR_OR_ZERO(update); -- if (ret) -- goto err; -- -- bkey_init(&update->k); -- update->k.type = discard_key_type; -- update->k.p = discard_iter->pos; -- -- ret = bch2_trans_update(trans, discard_iter, update, 0); -+ bool is_discarded = a->data_type == BCH_DATA_need_discard; -+ if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, -+ trans, alloc_k, !is_discarded, true, true)) { -+ ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); - if (ret) - goto err; - } - -- freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - -- if (fsck_err_on(k.k->type != freespace_key_type, -- trans, freespace_key_wrong, -- "incorrect key in freespace btree (got %s should be %s)\n" -- " %s", -- bch2_bkey_types[k.k->type], -- bch2_bkey_types[freespace_key_type], -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -- struct bkey_i *update = -- bch2_trans_kmalloc(trans, sizeof(*update)); -- -- ret = PTR_ERR_OR_ZERO(update); -- if (ret) -- goto err; -- -- bkey_init(&update->k); -- update->k.type = freespace_key_type; -- update->k.p = freespace_iter->pos; -- bch2_key_resize(&update->k, 1); -- -- ret = bch2_trans_update(trans, freespace_iter, update, 0); -+ bool is_free = a->data_type == BCH_DATA_free; -+ if (need_discard_or_freespace_err_on(!!k.k->type != is_free, -+ trans, alloc_k, !is_free, false, true)) { -+ ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); - if (ret) - goto err; - } -@@ -1368,51 +1355,88 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - return ret; - } - --static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, -- struct btree_iter *iter) -+struct check_discard_freespace_key_async { -+ struct work_struct work; -+ struct bch_fs *c; -+ struct bbpos pos; -+}; -+ -+static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); -+ int ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ u8 gen; -+ ret = k.k->type != KEY_TYPE_set -+ ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) -+ : 0; -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static void check_discard_freespace_key_work(struct work_struct *work) -+{ -+ struct check_discard_freespace_key_async *w = -+ container_of(work, struct check_discard_freespace_key_async, work); -+ -+ bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); -+ bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); -+ kfree(w); -+} -+ -+int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, -+ bool async_repair) - { - struct bch_fs *c = trans->c; -- struct btree_iter alloc_iter; -- struct bkey_s_c alloc_k; -- struct bch_alloc_v4 a_convert; -- const struct bch_alloc_v4 *a; -- u64 genbits; -- struct bpos pos; - enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard - ? BCH_DATA_need_discard - : BCH_DATA_free; - struct printbuf buf = PRINTBUF; -- int ret; - -- pos = iter->pos; -- pos.offset &= ~(~0ULL << 56); -- genbits = iter->pos.offset & (~0ULL << 56); -+ struct bpos bucket = iter->pos; -+ bucket.offset &= ~(~0ULL << 56); -+ u64 genbits = iter->pos.offset & (~0ULL << 56); - -- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); -- ret = bkey_err(alloc_k); -+ struct btree_iter alloc_iter; -+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, -+ BTREE_ID_alloc, bucket, -+ async_repair ? BTREE_ITER_cached : 0); -+ int ret = bkey_err(alloc_k); - if (ret) - return ret; - -- if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), -- trans, need_discard_freespace_key_to_invalid_dev_bucket, -- "entry in %s btree for nonexistant dev:bucket %llu:%llu", -- bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) -- goto delete; -+ if (!bch2_dev_bucket_exists(c, bucket)) { -+ if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, -+ "entry in %s btree for nonexistant dev:bucket %llu:%llu", -+ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) -+ goto delete; -+ ret = 1; -+ goto out; -+ } - -- a = bch2_alloc_to_v4(alloc_k, &a_convert); -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); -+ -+ if (a->data_type != state || -+ (state == BCH_DATA_free && -+ genbits != alloc_freespace_genbits(*a))) { -+ if (fsck_err(trans, need_discard_freespace_key_bad, -+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), -+ bch2_btree_id_str(iter->btree_id), -+ iter->pos.inode, -+ iter->pos.offset, -+ a->data_type == state, -+ genbits >> 56, alloc_freespace_genbits(*a) >> 56)) -+ goto delete; -+ ret = 1; -+ goto out; -+ } - -- if (fsck_err_on(a->data_type != state || -- (state == BCH_DATA_free && -- genbits != alloc_freespace_genbits(*a)), -- trans, need_discard_freespace_key_bad, -- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", -- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), -- bch2_btree_id_str(iter->btree_id), -- iter->pos.inode, -- iter->pos.offset, -- a->data_type == state, -- genbits >> 56, alloc_freespace_genbits(*a) >> 56)) -- goto delete; -+ *gen = a->gen; - out: - fsck_err: - bch2_set_btree_iter_dontneed(&alloc_iter); -@@ -1420,11 +1444,40 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran - printbuf_exit(&buf); - return ret; - delete: -- ret = bch2_btree_delete_extent_at(trans, iter, -- iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: -- bch2_trans_commit(trans, NULL, NULL, -- BCH_TRANS_COMMIT_no_enospc); -- goto out; -+ if (!async_repair) { -+ ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_commit; -+ goto out; -+ } else { -+ /* -+ * We can't repair here when called from the allocator path: the -+ * commit will recurse back into the allocator -+ */ -+ struct check_discard_freespace_key_async *w = -+ kzalloc(sizeof(*w), GFP_KERNEL); -+ if (!w) -+ goto out; -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { -+ kfree(w); -+ goto out; -+ } -+ -+ INIT_WORK(&w->work, check_discard_freespace_key_work); -+ w->c = c; -+ w->pos = BBPOS(iter->btree_id, iter->pos); -+ queue_work(c->write_ref_wq, &w->work); -+ goto out; -+ } -+} -+ -+static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) -+{ -+ u8 gen; -+ int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); -+ return ret < 0 ? ret : 0; - } - - /* -@@ -1581,7 +1634,7 @@ int bch2_check_alloc_info(struct bch_fs *c) - ret = for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch, k, -- bch2_check_discard_freespace_key(trans, &iter)); -+ bch2_check_discard_freespace_key_fsck(trans, &iter)); - if (ret) - goto err; - -@@ -1594,7 +1647,7 @@ int bch2_check_alloc_info(struct bch_fs *c) - break; - - ret = bkey_err(k) ?: -- bch2_check_discard_freespace_key(trans, &iter); -+ bch2_check_discard_freespace_key_fsck(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; -@@ -1750,14 +1803,14 @@ struct discard_buckets_state { - u64 open; - u64 need_journal_commit; - u64 discarded; -- u64 need_journal_commit_this_dev; - }; - - static int bch2_discard_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *need_discard_iter, - struct bpos *discard_pos_done, -- struct discard_buckets_state *s) -+ struct discard_buckets_state *s, -+ bool fastpath) - { - struct bch_fs *c = trans->c; - struct bpos pos = need_discard_iter->pos; -@@ -1773,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, - goto out; - } - -- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -- c->journal.flushed_seq_ondisk, -- pos.inode, pos.offset)) { -- s->need_journal_commit++; -- s->need_journal_commit_this_dev++; -+ u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, -+ pos.inode, pos.offset); -+ if (seq_ready > c->journal.flushed_seq_ondisk) { -+ if (seq_ready > c->journal.flushing_seq) -+ s->need_journal_commit++; - goto out; - } - -@@ -1793,80 +1846,64 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, - if (ret) - goto out; - -- if (bch2_bucket_sectors_total(a->v)) { -- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, -- trans, "attempting to discard bucket with dirty data\n%s", -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -- ret = -EIO; -- goto out; -- } -- - if (a->v.data_type != BCH_DATA_need_discard) { -- if (data_type_is_empty(a->v.data_type) && -- BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { -- a->v.gen++; -- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); -- goto write; -+ if (need_discard_or_freespace_err(trans, k, true, true, true)) { -+ ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); -+ if (ret) -+ goto out; -+ goto commit; - } - -- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, -- trans, "bucket incorrectly set in need_discard btree\n" -- "%s", -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -- ret = -EIO; - goto out; - } - -- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { -- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, -- trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", -- a->v.journal_seq, -- c->journal.flushed_seq_ondisk, -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -- ret = -EIO; -- goto out; -- } -- -- if (discard_in_flight_add(ca, iter.pos.offset, true)) -- goto out; -+ if (!fastpath) { -+ if (discard_in_flight_add(ca, iter.pos.offset, true)) -+ goto out; - -- discard_locked = true; -+ discard_locked = true; -+ } - -- if (!bkey_eq(*discard_pos_done, iter.pos) && -- ca->mi.discard && !c->opts.nochanges) { -- /* -- * This works without any other locks because this is the only -- * thread that removes items from the need_discard tree -- */ -- bch2_trans_unlock_long(trans); -- blkdev_issue_discard(ca->disk_sb.bdev, -- k.k->p.offset * ca->mi.bucket_size, -- ca->mi.bucket_size, -- GFP_KERNEL); -+ if (!bkey_eq(*discard_pos_done, iter.pos)) { -+ s->discarded++; - *discard_pos_done = iter.pos; - -- ret = bch2_trans_relock_notrace(trans); -- if (ret) -- goto out; -+ if (ca->mi.discard && !c->opts.nochanges) { -+ /* -+ * This works without any other locks because this is the only -+ * thread that removes items from the need_discard tree -+ */ -+ bch2_trans_unlock_long(trans); -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ k.k->p.offset * ca->mi.bucket_size, -+ ca->mi.bucket_size, -+ GFP_KERNEL); -+ ret = bch2_trans_relock_notrace(trans); -+ if (ret) -+ goto out; -+ } - } - - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); --write: - alloc_data_type_set(&a->v, a->v.data_type); - -- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: -- bch2_trans_commit(trans, NULL, NULL, -- BCH_WATERMARK_btree| -- BCH_TRANS_COMMIT_no_enospc); -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -+ if (ret) -+ goto out; -+commit: -+ ret = bch2_trans_commit(trans, NULL, NULL, -+ BCH_WATERMARK_btree| -+ BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto out; - - count_event(c, bucket_discard); -- s->discarded++; - out: -+fsck_err: - if (discard_locked) - discard_in_flight_remove(ca, iter.pos.offset); -- s->seen++; -+ if (!ret) -+ s->seen++; - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -@@ -1886,11 +1923,14 @@ static void bch2_do_discards_work(struct work_struct *work) - * successful commit: - */ - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, -+ for_each_btree_key_max(trans, iter, - BTREE_ID_need_discard, - POS(ca->dev_idx, 0), - POS(ca->dev_idx, U64_MAX), 0, k, -- bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); -+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); -+ -+ if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) -+ bch2_journal_flush_async(&c->journal, NULL); - - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); -@@ -1923,27 +1963,29 @@ void bch2_do_discards(struct bch_fs *c) - bch2_dev_do_discards(ca); - } - --static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) -+static int bch2_do_discards_fast_one(struct btree_trans *trans, -+ struct bch_dev *ca, -+ u64 bucket, -+ struct bpos *discard_pos_done, -+ struct discard_buckets_state *s) - { -- struct btree_iter iter; -- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); -- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); -- int ret = bkey_err(k); -+ struct btree_iter need_discard_iter; -+ struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, -+ BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); -+ int ret = bkey_err(discard_k); - if (ret) -- goto err; -- -- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); -- ret = PTR_ERR_OR_ZERO(a); -- if (ret) -- goto err; -+ return ret; - -- BUG_ON(a->v.dirty_sectors); -- SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); -- alloc_data_type_set(&a->v, a->v.data_type); -+ if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, -+ trans, discarding_bucket_not_in_need_discard_btree, -+ "attempting to discard bucket %u:%llu not in need_discard btree", -+ ca->dev_idx, bucket)) -+ goto out; - -- ret = bch2_trans_update(trans, &iter, &a->k_i, 0); --err: -- bch2_trans_iter_exit(trans, &iter); -+ ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); -+out: -+fsck_err: -+ bch2_trans_iter_exit(trans, &need_discard_iter); - return ret; - } - -@@ -1951,6 +1993,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work) - { - struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); - struct bch_fs *c = ca->fs; -+ struct discard_buckets_state s = {}; -+ struct bpos discard_pos_done = POS_MAX; -+ struct btree_trans *trans = bch2_trans_get(c); -+ int ret = 0; - - while (1) { - bool got_bucket = false; -@@ -1971,16 +2017,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) - if (!got_bucket) - break; - -- if (ca->mi.discard && !c->opts.nochanges) -- blkdev_issue_discard(ca->disk_sb.bdev, -- bucket_to_sector(ca, bucket), -- ca->mi.bucket_size, -- GFP_KERNEL); -- -- int ret = bch2_trans_commit_do(c, NULL, NULL, -- BCH_WATERMARK_btree| -- BCH_TRANS_COMMIT_no_enospc, -- bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); -+ ret = lockrestart_do(trans, -+ bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); - bch_err_fn(c, ret); - - discard_in_flight_remove(ca, bucket); -@@ -1989,6 +2027,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work) - break; - } - -+ trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); -+ -+ bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); - } -@@ -2030,8 +2071,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, - return 1; - - if (!bch2_dev_bucket_exists(c, bucket)) { -- prt_str(&buf, "lru entry points to invalid bucket"); -- goto err; -+ if (fsck_err(trans, lru_entry_to_invalid_bucket, -+ "lru key points to nonexistent device:bucket %llu:%llu", -+ bucket.inode, bucket.offset)) -+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); -+ goto out; - } - - if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) -@@ -2072,28 +2116,9 @@ static int invalidate_one_bucket(struct btree_trans *trans, - trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); - --*nr_to_invalidate; - out: -+fsck_err: - printbuf_exit(&buf); - return ret; --err: -- prt_str(&buf, "\n lru key: "); -- bch2_bkey_val_to_text(&buf, c, lru_k); -- -- prt_str(&buf, "\n lru entry: "); -- bch2_lru_pos_to_text(&buf, lru_iter->pos); -- -- prt_str(&buf, "\n alloc key: "); -- if (!a) -- bch2_bpos_to_text(&buf, bucket); -- else -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); -- -- bch_err(c, "%s", buf.buf); -- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { -- bch2_inconsistent_error(c); -- ret = -EINVAL; -- } -- -- goto out; - } - - static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, -@@ -2101,7 +2126,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter - { - struct bkey_s_c k; - again: -- k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); -+ k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); - if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); - *wrapped = true; -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -index 163a67b97a40..de25ba4ee94b 100644 ---- a/fs/bcachefs/alloc_background.h -+++ b/fs/bcachefs/alloc_background.h -@@ -8,8 +8,6 @@ - #include "debug.h" - #include "super.h" - --enum bch_validate_flags; -- - /* How out of date a pointer gen is allowed to be: */ - #define BUCKET_GC_GEN_MAX 96U - -@@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s - - int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); - --int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); --int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); --int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); --int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); -+int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); -+int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); -+int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_alloc_v4_swab(struct bkey_s); - void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -@@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - }) - - int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ -@@ -307,6 +309,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, - int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); -+ -+int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); - int bch2_check_alloc_info(struct bch_fs *); - int bch2_check_alloc_to_lru_refs(struct bch_fs *); - void bch2_dev_do_discards(struct bch_dev *); -diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h -index befdaa95c515..740238369a5a 100644 ---- a/fs/bcachefs/alloc_background_format.h -+++ b/fs/bcachefs/alloc_background_format.h -@@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - - struct bch_alloc_v4 { - struct bch_val v; -- __u64 journal_seq; -+ __u64 journal_seq_nonempty; - __u32 flags; - __u8 gen; - __u8 oldest_gen; -@@ -70,7 +70,7 @@ struct bch_alloc_v4 { - __u32 stripe; - __u32 nr_external_backpointers; - /* end of fields in original version of alloc_v4 */ -- __u64 _fragmentation_lru; /* obsolete */ -+ __u64 journal_seq_empty; - __u32 stripe_sectors; - __u32 pad; - } __packed __aligned(8); -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -index 372178c8d416..5a781fb4c794 100644 ---- a/fs/bcachefs/alloc_foreground.c -+++ b/fs/bcachefs/alloc_foreground.c -@@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) - return; - } - -- percpu_down_read(&c->mark_lock); - spin_lock(&ob->lock); -- - ob->valid = false; - ob->data_type = 0; -- - spin_unlock(&ob->lock); -- percpu_up_read(&c->mark_lock); - - spin_lock(&c->freelist_lock); - bch2_open_bucket_hash_remove(c, ob); -@@ -156,6 +152,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) - return ob; - } - -+static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -+{ -+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) -+ return false; -+ -+ return bch2_is_superblock_bucket(ca, b); -+} -+ - static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) - { - BUG_ON(c->open_buckets_partial_nr >= -@@ -175,20 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) - closure_wake_up(&c->freelist_wait); - } - --/* _only_ for allocating the journal on a new device: */ --long bch2_bucket_alloc_new_fs(struct bch_dev *ca) --{ -- while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { -- u64 b = ca->new_fs_bucket_idx++; -- -- if (!is_superblock_bucket(ca, b) && -- (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) -- return b; -- } -- -- return -1; --} -- - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) - { - switch (watermark) { -@@ -206,33 +196,44 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) - } - } - --static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -- u64 bucket, -- enum bch_watermark watermark, -- const struct bch_alloc_v4 *a, -- struct bucket_alloc_state *s, -- struct closure *cl) -+static inline bool may_alloc_bucket(struct bch_fs *c, -+ struct bpos bucket, -+ struct bucket_alloc_state *s) - { -- struct open_bucket *ob; -- -- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { -- s->skipped_nouse++; -- return NULL; -- } -- -- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { -+ if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; -- return NULL; -+ return false; - } - -- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -- c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { -+ u64 journal_seq_ready = -+ bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, -+ bucket.inode, bucket.offset); -+ if (journal_seq_ready > c->journal.flushed_seq_ondisk) { -+ if (journal_seq_ready > c->journal.flushing_seq) -+ s->need_journal_commit++; - s->skipped_need_journal_commit++; -- return NULL; -+ return false; - } - -- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { -+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; -+ return false; -+ } -+ -+ return true; -+} -+ -+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ u64 bucket, u8 gen, -+ enum bch_watermark watermark, -+ struct bucket_alloc_state *s, -+ struct closure *cl) -+{ -+ if (unlikely(is_superblock_bucket(c, ca, bucket))) -+ return NULL; -+ -+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { -+ s->skipped_nouse++; - return NULL; - } - -@@ -254,14 +255,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * - return NULL; - } - -- ob = bch2_open_bucket_alloc(c); -+ struct open_bucket *ob = bch2_open_bucket_alloc(c); - - spin_lock(&ob->lock); -- - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->dev = ca->dev_idx; -- ob->gen = a->gen; -+ ob->gen = gen; - ob->bucket = bucket; - spin_unlock(&ob->lock); - -@@ -276,111 +276,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * - } - - static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, -- enum bch_watermark watermark, u64 free_entry, -+ enum bch_watermark watermark, - struct bucket_alloc_state *s, -- struct bkey_s_c freespace_k, -+ struct btree_iter *freespace_iter, - struct closure *cl) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter = { NULL }; -- struct bkey_s_c k; -- struct open_bucket *ob; -- struct bch_alloc_v4 a_convert; -- const struct bch_alloc_v4 *a; -- u64 b = free_entry & ~(~0ULL << 56); -- unsigned genbits = free_entry >> 56; -- struct printbuf buf = PRINTBUF; -- int ret; -- -- if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { -- prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" -- " freespace key ", -- ca->mi.first_bucket, ca->mi.nbuckets); -- bch2_bkey_val_to_text(&buf, c, freespace_k); -- bch2_trans_inconsistent(trans, "%s", buf.buf); -- ob = ERR_PTR(-EIO); -- goto err; -- } -+ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - -- k = bch2_bkey_get_iter(trans, &iter, -- BTREE_ID_alloc, POS(ca->dev_idx, b), -- BTREE_ITER_cached); -- ret = bkey_err(k); -- if (ret) { -- ob = ERR_PTR(ret); -- goto err; -- } -- -- a = bch2_alloc_to_v4(k, &a_convert); -- -- if (a->data_type != BCH_DATA_free) { -- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { -- ob = NULL; -- goto err; -- } -- -- prt_printf(&buf, "non free bucket in freespace btree\n" -- " freespace key "); -- bch2_bkey_val_to_text(&buf, c, freespace_k); -- prt_printf(&buf, "\n "); -- bch2_bkey_val_to_text(&buf, c, k); -- bch2_trans_inconsistent(trans, "%s", buf.buf); -- ob = ERR_PTR(-EIO); -- goto err; -- } -- -- if (genbits != (alloc_freespace_genbits(*a) >> 56) && -- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { -- prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" -- " freespace key ", -- genbits, alloc_freespace_genbits(*a) >> 56); -- bch2_bkey_val_to_text(&buf, c, freespace_k); -- prt_printf(&buf, "\n "); -- bch2_bkey_val_to_text(&buf, c, k); -- bch2_trans_inconsistent(trans, "%s", buf.buf); -- ob = ERR_PTR(-EIO); -- goto err; -- } -- -- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { -- struct bch_backpointer bp; -- struct bpos bp_pos = POS_MIN; -- -- ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, -- &bp_pos, &bp, -- BTREE_ITER_nopreserve); -- if (ret) { -- ob = ERR_PTR(ret); -- goto err; -- } -+ if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) -+ return NULL; - -- if (!bkey_eq(bp_pos, POS_MAX)) { -- /* -- * Bucket may have data in it - we don't call -- * bc2h_trans_inconnsistent() because fsck hasn't -- * finished yet -- */ -- ob = NULL; -- goto err; -- } -- } -+ u8 gen; -+ int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); -+ if (ret < 0) -+ return ERR_PTR(ret); -+ if (ret) -+ return NULL; - -- ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); -- if (!ob) -- bch2_set_btree_iter_dontneed(&iter); --err: -- if (iter.path) -- bch2_set_btree_iter_dontneed(&iter); -- bch2_trans_iter_exit(trans, &iter); -- printbuf_exit(&buf); -- return ob; -+ return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); - } - - /* - * This path is for before the freespace btree is initialized: -- * -- * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & -- * journal buckets - journal buckets will be < ca->new_fs_bucket_idx - */ - static noinline struct open_bucket * - bch2_bucket_alloc_early(struct btree_trans *trans, -@@ -389,10 +307,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans, - struct bucket_alloc_state *s, - struct closure *cl) - { -+ struct bch_fs *c = trans->c; - struct btree_iter iter, citer; - struct bkey_s_c k, ck; - struct open_bucket *ob = NULL; -- u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); -+ u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; - u64 alloc_start = max(first_bucket, *dev_alloc_cursor); - u64 alloc_cursor = alloc_start; -@@ -415,10 +334,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans, - if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) - break; - -- if (ca->new_fs_bucket_idx && -- is_superblock_bucket(ca, k.k->p.offset)) -- continue; -- - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { -@@ -452,7 +367,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, - - s->buckets_seen++; - -- ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); -+ ob = may_alloc_bucket(c, k.k->p, s) -+ ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, -+ watermark, s, cl) -+ : NULL; - next: - bch2_set_btree_iter_dontneed(&citer); - bch2_trans_iter_exit(trans, &citer); -@@ -489,20 +407,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); - u64 alloc_cursor = alloc_start; - int ret; -- -- BUG_ON(ca->new_fs_bucket_idx); - again: -- for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, -- POS(ca->dev_idx, alloc_cursor), 0, k, ret) { -- if (k.k->p.inode != ca->dev_idx) -- break; -+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, -+ POS(ca->dev_idx, alloc_cursor), -+ POS(ca->dev_idx, U64_MAX), -+ 0, k, ret) { -+ /* -+ * peek normally dosen't trim extents - they can span iter.pos, -+ * which is not what we want here: -+ */ -+ iter.k.size = iter.k.p.offset - iter.pos.offset; - -- for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); -- alloc_cursor < k.k->p.offset; -- alloc_cursor++) { -+ while (iter.k.size) { - s->buckets_seen++; - -- u64 bucket = alloc_cursor & ~(~0ULL << 56); -+ u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { -@@ -511,32 +430,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - goto fail; - - bucket = sector_to_bucket(ca, -- round_up(bucket_to_sector(ca, bucket) + 1, -+ round_up(bucket_to_sector(ca, bucket + 1), - 1ULL << ca->mi.btree_bitmap_shift)); -- u64 genbits = alloc_cursor >> 56; -- alloc_cursor = bucket | (genbits << 56); -+ alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - -- if (alloc_cursor > k.k->p.offset) -- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); -+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); - s->skipped_mi_btree_bitmap++; -- continue; -+ goto next; - } - -- ob = try_alloc_bucket(trans, ca, watermark, -- alloc_cursor, s, k, cl); -+ ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); - if (ob) { -+ if (!IS_ERR(ob)) -+ *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(&iter); - break; - } -- } - -+ iter.k.size--; -+ iter.pos.offset++; -+ } -+next: - if (ob || ret) - break; - } - fail: - bch2_trans_iter_exit(trans, &iter); - -- if (!ob && ret) -+ BUG_ON(ob && ret); -+ -+ if (ret) - ob = ERR_PTR(ret); - - if (!ob && alloc_start > ca->mi.first_bucket) { -@@ -544,8 +467,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - goto again; - } - -- *dev_alloc_cursor = alloc_cursor; -- - return ob; - } - -@@ -595,6 +516,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... - * @cl: if not NULL, closure to be used to wait if buckets not available -+ * @nowait: if true, do not wait for buckets to become available - * @usage: for secondarily also returning the current device usage - * - * Returns: an open_bucket on success, or an ERR_PTR() on failure. -@@ -629,6 +551,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - bch2_dev_do_invalidates(ca); - - if (!avail) { -+ if (watermark > BCH_WATERMARK_normal && -+ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) -+ goto alloc; -+ - if (cl && !waiting) { - closure_wait(&c->freelist_wait, cl); - waiting = true; -@@ -648,7 +574,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); - -- if (s.skipped_need_journal_commit * 2 > avail) -+ if (s.need_journal_commit * 2 > avail) - bch2_journal_flush_async(&c->journal, NULL); - - if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { -@@ -711,9 +637,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, - unsigned i; - - for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) -- ret.devs[ret.nr++] = i; -+ ret.data[ret.nr++] = i; - -- bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); -+ bubble_sort(ret.data, ret.nr, dev_stripe_cmp); - return ret; - } - -@@ -785,18 +711,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct closure *cl) - { - struct bch_fs *c = trans->c; -- struct dev_alloc_list devs_sorted = -- bch2_dev_alloc_list(c, stripe, devs_may_alloc); - int ret = -BCH_ERR_insufficient_devices; - - BUG_ON(*nr_effective >= nr_replicas); - -- for (unsigned i = 0; i < devs_sorted.nr; i++) { -- struct bch_dev_usage usage; -- struct open_bucket *ob; -- -- unsigned dev = devs_sorted.devs[i]; -- struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); -+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); -+ darray_for_each(devs_sorted, i) { -+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); - if (!ca) - continue; - -@@ -805,8 +726,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - continue; - } - -- ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, -- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); -+ struct bch_dev_usage usage; -+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, -+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); - if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - bch2_dev_put(ca); -@@ -850,10 +772,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct closure *cl) - { - struct bch_fs *c = trans->c; -- struct dev_alloc_list devs_sorted; -- struct ec_stripe_head *h; -- struct open_bucket *ob; -- unsigned i, ec_idx; - int ret = 0; - - if (nr_replicas < 2) -@@ -862,34 +780,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, - if (ec_open_bucket(c, ptrs)) - return 0; - -- h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); -+ struct ec_stripe_head *h = -+ bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); - if (IS_ERR(h)) - return PTR_ERR(h); - if (!h) - return 0; - -- devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -- -- for (i = 0; i < devs_sorted.nr; i++) -- for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { -+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -+ darray_for_each(devs_sorted, i) -+ for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { - if (!h->s->blocks[ec_idx]) - continue; - -- ob = c->open_buckets + h->s->blocks[ec_idx]; -- if (ob->dev == devs_sorted.devs[i] && -- !test_and_set_bit(ec_idx, h->s->blocks_allocated)) -- goto got_bucket; -+ struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; -+ if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { -+ ob->ec_idx = ec_idx; -+ ob->ec = h->s; -+ ec_stripe_new_get(h->s, STRIPE_REF_io); -+ -+ ret = add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_replicas, nr_effective, -+ have_cache, ob); -+ goto out; -+ } - } -- goto out_put_head; --got_bucket: -- ob->ec_idx = ec_idx; -- ob->ec = h->s; -- ec_stripe_new_get(h->s, STRIPE_REF_io); -- -- ret = add_new_bucket(c, ptrs, devs_may_alloc, -- nr_replicas, nr_effective, -- have_cache, ob); --out_put_head: -+out: - bch2_ec_stripe_head_put(c, h); - return ret; - } -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -index 1a16fd5bd4f8..f25481a0d1a0 100644 ---- a/fs/bcachefs/alloc_foreground.h -+++ b/fs/bcachefs/alloc_foreground.h -@@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); - - struct dev_alloc_list { - unsigned nr; -- u8 devs[BCH_SB_MEMBERS_MAX]; -+ u8 data[BCH_SB_MEMBERS_MAX]; - }; - - struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, -@@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, - struct bch_devs_mask *); - void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); - --long bch2_bucket_alloc_new_fs(struct bch_dev *); -- - static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) - { - return bch2_dev_have_ref(c, ob->dev); -diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -index 9bbb28e90b93..4aa8ee026cb8 100644 ---- a/fs/bcachefs/alloc_types.h -+++ b/fs/bcachefs/alloc_types.h -@@ -18,6 +18,7 @@ struct bucket_alloc_state { - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; -+ u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; -diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c -index 654a58132a4d..ebeb6a5ff9d2 100644 ---- a/fs/bcachefs/backpointers.c -+++ b/fs/bcachefs/backpointers.c -@@ -14,42 +14,8 @@ - - #include - --static bool extent_matches_bp(struct bch_fs *c, -- enum btree_id btree_id, unsigned level, -- struct bkey_s_c k, -- struct bpos bucket, -- struct bch_backpointer bp) --{ -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- -- rcu_read_lock(); -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -- struct bpos bucket2; -- struct bch_backpointer bp2; -- -- if (p.ptr.cached) -- continue; -- -- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); -- if (!ca) -- continue; -- -- bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); -- if (bpos_eq(bucket, bucket2) && -- !memcmp(&bp, &bp2, sizeof(bp))) { -- rcu_read_unlock(); -- return true; -- } -- } -- rcu_read_unlock(); -- -- return false; --} -- - int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - int ret = 0; -@@ -59,67 +25,70 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - "backpointer level bad: %u >= %u", - bp.v->level, BTREE_MAX_DEPTH); - -- rcu_read_lock(); -- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); -- if (!ca) { -- /* these will be caught by fsck */ -- rcu_read_unlock(); -- return 0; -- } -- -- struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); -- struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); -- rcu_read_unlock(); -- -- bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || -- !bpos_eq(bp.k->p, bp_pos), -- c, backpointer_bucket_offset_wrong, -- "backpointer bucket_offset wrong"); -+ bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, -+ c, backpointer_dev_bad, -+ "backpointer for BCH_SB_MEMBER_INVALID"); - fsck_err: - return ret; - } - --void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) -+void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) - { -- prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", -- bch2_btree_id_str(bp->btree_id), -- bp->level, -- (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), -- (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), -- bp->bucket_len); -- bch2_bpos_to_text(out, bp->pos); --} -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - --void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) --{ - rcu_read_lock(); -- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) { -- struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); -+ u32 bucket_offset; -+ struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - rcu_read_unlock(); -- prt_str(out, "bucket="); -- bch2_bpos_to_text(out, bucket); -- prt_str(out, " "); -+ prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); - } else { - rcu_read_unlock(); -+ prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - } - -- bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); -+ bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); -+ prt_printf(out, " suboffset=%u len=%u gen=%u pos=", -+ (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), -+ bp.v->bucket_len, -+ bp.v->bucket_gen); -+ bch2_bpos_to_text(out, bp.v->pos); - } - - void bch2_backpointer_swab(struct bkey_s k) - { - struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - -- bp.v->bucket_offset = swab40(bp.v->bucket_offset); - bp.v->bucket_len = swab32(bp.v->bucket_len); - bch2_bpos_swab(&bp.v->pos); - } - -+static bool extent_matches_bp(struct bch_fs *c, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, -+ struct bkey_s_c_backpointer bp) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ struct bkey_i_backpointer bp2; -+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); -+ -+ if (bpos_eq(bp.k->p, bp2.k.p) && -+ !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) -+ return true; -+ } -+ -+ return false; -+} -+ - static noinline int backpointer_mod_err(struct btree_trans *trans, -- struct bch_backpointer bp, -- struct bkey_s_c bp_k, - struct bkey_s_c orig_k, -+ struct bkey_i_backpointer *new_bp, -+ struct bkey_s_c found_bp, - bool insert) - { - struct bch_fs *c = trans->c; -@@ -127,12 +96,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, - - if (insert) { - prt_printf(&buf, "existing backpointer found when inserting "); -- bch2_backpointer_to_text(&buf, &bp); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "found "); -- bch2_bkey_val_to_text(&buf, c, bp_k); -+ bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); -@@ -144,11 +113,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "searching for "); -- bch2_backpointer_to_text(&buf, &bp); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - - prt_printf(&buf, "got "); -- bch2_bkey_val_to_text(&buf, c, bp_k); -+ bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); -@@ -167,161 +136,118 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, - } - - int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, -- struct bch_dev *ca, -- struct bpos bucket, -- struct bch_backpointer bp, - struct bkey_s_c orig_k, -+ struct bkey_i_backpointer *bp, - bool insert) - { - struct btree_iter bp_iter; -- struct bkey_s_c k; -- struct bkey_i_backpointer *bp_k; -- int ret; -- -- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); -- ret = PTR_ERR_OR_ZERO(bp_k); -- if (ret) -- return ret; -- -- bkey_backpointer_init(&bp_k->k_i); -- bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); -- bp_k->v = bp; -- -- if (!insert) { -- bp_k->k.type = KEY_TYPE_deleted; -- set_bkey_val_u64s(&bp_k->k, 0); -- } -- -- k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, -- bp_k->k.p, -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, -+ bp->k.p, - BTREE_ITER_intent| - BTREE_ITER_slots| - BTREE_ITER_with_updates); -- ret = bkey_err(k); -+ int ret = bkey_err(k); - if (ret) -- goto err; -+ return ret; - - if (insert - ? k.k->type - : (k.k->type != KEY_TYPE_backpointer || -- memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { -- ret = backpointer_mod_err(trans, bp, k, orig_k, insert); -+ memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { -+ ret = backpointer_mod_err(trans, orig_k, bp, k, insert); - if (ret) - goto err; - } - -- ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); -+ if (!insert) { -+ bp->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&bp->k, 0); -+ } -+ -+ ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); - err: - bch2_trans_iter_exit(trans, &bp_iter); - return ret; - } - --/* -- * Find the next backpointer >= *bp_offset: -- */ --int bch2_get_next_backpointer(struct btree_trans *trans, -- struct bch_dev *ca, -- struct bpos bucket, int gen, -- struct bpos *bp_pos, -- struct bch_backpointer *bp, -- unsigned iter_flags) -+static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) - { -- struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); -- struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; -- struct bkey_s_c k; -- int ret = 0; -- -- if (bpos_ge(*bp_pos, bp_end_pos)) -- goto done; -- -- if (gen >= 0) { -- k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, -- bucket, BTREE_ITER_cached|iter_flags); -- ret = bkey_err(k); -- if (ret) -- goto out; -- -- if (k.k->type != KEY_TYPE_alloc_v4 || -- bkey_s_c_to_alloc_v4(k).v->gen != gen) -- goto done; -- } -- -- *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); -- -- for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, -- *bp_pos, iter_flags, k, ret) { -- if (bpos_ge(k.k->p, bp_end_pos)) -- break; -+ return (likely(!bch2_backpointers_no_use_write_buffer) -+ ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) -+ : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: -+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -+} - -- *bp_pos = k.k->p; -- *bp = *bkey_s_c_to_backpointer(k).v; -- goto out; -- } --done: -- *bp_pos = SPOS_MAX; --out: -- bch2_trans_iter_exit(trans, &bp_iter); -- bch2_trans_iter_exit(trans, &alloc_iter); -- return ret; -+static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, -+ struct bkey_s_c visiting_k, -+ struct bkey_buf *last_flushed) -+{ -+ return likely(!bch2_backpointers_no_use_write_buffer) -+ ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) -+ : 0; - } - --static void backpointer_not_found(struct btree_trans *trans, -- struct bpos bp_pos, -- struct bch_backpointer bp, -- struct bkey_s_c k) -+static int backpointer_target_not_found(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, -+ struct bkey_s_c target_k, -+ struct bkey_buf *last_flushed) - { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; -+ int ret = 0; - - /* - * If we're using the btree write buffer, the backpointer we were - * looking at may have already been deleted - failure to find what it - * pointed to is not an error: - */ -- if (likely(!bch2_backpointers_no_use_write_buffer)) -- return; -- -- struct bpos bucket; -- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) -- return; -+ ret = last_flushed -+ ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) -+ : 0; -+ if (ret) -+ return ret; - - prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", -- bp.level ? "btree node" : "extent"); -- prt_printf(&buf, "bucket: "); -- bch2_bpos_to_text(&buf, bucket); -- prt_printf(&buf, "\n "); -+ bp.v->level ? "btree node" : "extent"); -+ bch2_bkey_val_to_text(&buf, c, bp.s_c); - -- prt_printf(&buf, "backpointer pos: "); -- bch2_bpos_to_text(&buf, bp_pos); - prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, target_k); - -- bch2_backpointer_to_text(&buf, &bp); -- prt_printf(&buf, "\n "); -- bch2_bkey_val_to_text(&buf, c, k); -- if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) -- bch_err_ratelimited(c, "%s", buf.buf); -- else -- bch2_trans_inconsistent(trans, "%s", buf.buf); -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) -+ if (p.ptr.dev == bp.k->p.inode) { -+ prt_printf(&buf, "\n "); -+ struct bkey_i_backpointer bp2; -+ bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); -+ } - -+ if (fsck_err(trans, backpointer_to_missing_ptr, -+ "%s", buf.buf)) -+ ret = bch2_backpointer_del(trans, bp.k->p); -+fsck_err: - printbuf_exit(&buf); -+ return ret; - } - - struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, - struct btree_iter *iter, -- struct bpos bp_pos, -- struct bch_backpointer bp, -- unsigned iter_flags) -+ unsigned iter_flags, -+ struct bkey_buf *last_flushed) - { -- if (likely(!bp.level)) { -- struct bch_fs *c = trans->c; -+ struct bch_fs *c = trans->c; - -- struct bpos bucket; -- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) -- return bkey_s_c_err(-EIO); -+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) -+ return bkey_s_c_null; - -+ if (likely(!bp.v->level)) { - bch2_trans_node_iter_init(trans, iter, -- bp.btree_id, -- bp.pos, -+ bp.v->btree_id, -+ bp.v->pos, - 0, 0, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -@@ -330,67 +256,64 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - return k; - } - -- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) -+ if (k.k && -+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); -- backpointer_not_found(trans, bp_pos, bp, k); -- return bkey_s_c_null; -+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed); -+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null; - } else { -- struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); -+ struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); -+ if (IS_ERR_OR_NULL(b)) -+ return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - -- if (IS_ERR_OR_NULL(b)) { -- bch2_trans_iter_exit(trans, iter); -- return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; -- } - return bkey_i_to_s_c(&b->key); - } - } - - struct btree *bch2_backpointer_get_node(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, - struct btree_iter *iter, -- struct bpos bp_pos, -- struct bch_backpointer bp) -+ struct bkey_buf *last_flushed) - { - struct bch_fs *c = trans->c; - -- BUG_ON(!bp.level); -- -- struct bpos bucket; -- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) -- return ERR_PTR(-EIO); -+ BUG_ON(!bp.v->level); - - bch2_trans_node_iter_init(trans, iter, -- bp.btree_id, -- bp.pos, -+ bp.v->btree_id, -+ bp.v->pos, - 0, -- bp.level - 1, -+ bp.v->level - 1, - 0); - struct btree *b = bch2_btree_iter_peek_node(iter); - if (IS_ERR_OR_NULL(b)) - goto err; - -- BUG_ON(b->c.level != bp.level - 1); -+ BUG_ON(b->c.level != bp.v->level - 1); - -- if (extent_matches_bp(c, bp.btree_id, bp.level, -- bkey_i_to_s_c(&b->key), -- bucket, bp)) -+ if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, -+ bkey_i_to_s_c(&b->key), bp)) - return b; - - if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); - } else { -- backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); -- b = NULL; -+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); -+ b = ret ? ERR_PTR(ret) : NULL; - } - err: - bch2_trans_iter_exit(trans, iter); - return b; - } - --static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, -- struct bkey_s_c k) -+static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, -+ struct bkey_buf *last_flushed) - { -+ if (k.k->type != KEY_TYPE_backpointer) -+ return 0; -+ - struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; - struct bkey_s_c alloc_k; -@@ -399,10 +322,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ - - struct bpos bucket; - if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { -+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); -+ if (ret) -+ goto out; -+ - if (fsck_err(trans, backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -- ret = bch2_btree_delete_at(trans, bp_iter, 0); -+ ret = bch2_backpointer_del(trans, k.k->p); - goto out; - } - -@@ -411,13 +338,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ - if (ret) - goto out; - -- if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, -- trans, backpointer_to_missing_alloc, -- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", -- alloc_iter.pos.inode, alloc_iter.pos.offset, -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -- ret = bch2_btree_delete_at(trans, bp_iter, 0); -- goto out; -+ if (alloc_k.k->type != KEY_TYPE_alloc_v4) { -+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); -+ if (ret) -+ goto out; -+ -+ if (fsck_err(trans, backpointer_to_missing_alloc, -+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", -+ alloc_iter.pos.inode, alloc_iter.pos.offset, -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -+ ret = bch2_backpointer_del(trans, k.k->p); - } - out: - fsck_err: -@@ -429,18 +359,24 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ - /* verify that every backpointer has a corresponding alloc key */ - int bch2_check_btree_backpointers(struct bch_fs *c) - { -+ struct bkey_buf last_flushed; -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_check_btree_backpointer(trans, &iter, k))); -+ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); -+ -+ bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; - } - - struct extents_to_bp_state { -- struct bpos bucket_start; -- struct bpos bucket_end; -+ struct bpos bp_start; -+ struct bpos bp_end; - struct bkey_buf last_flushed; - }; - -@@ -501,9 +437,13 @@ static int check_extent_checksum(struct btree_trans *trans, - goto err; - - prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); -- prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); -+ prt_printf(&buf, "\n "); -+ bch2_btree_id_to_text(&buf, btree); -+ prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent); -- prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); -+ prt_printf(&buf, "\n "); -+ bch2_btree_id_to_text(&buf, o_btree); -+ prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent2); - - struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); -@@ -524,41 +464,25 @@ static int check_extent_checksum(struct btree_trans *trans, - - static int check_bp_exists(struct btree_trans *trans, - struct extents_to_bp_state *s, -- struct bpos bucket, -- struct bch_backpointer bp, -+ struct bkey_i_backpointer *bp, - struct bkey_s_c orig_k) - { - struct bch_fs *c = trans->c; -- struct btree_iter bp_iter = {}; - struct btree_iter other_extent_iter = {}; - struct printbuf buf = PRINTBUF; -- struct bkey_s_c bp_k; -- int ret = 0; - -- struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); -- if (!ca) { -- prt_str(&buf, "extent for nonexistent device:bucket "); -- bch2_bpos_to_text(&buf, bucket); -- prt_str(&buf, "\n "); -- bch2_bkey_val_to_text(&buf, c, orig_k); -- bch_err(c, "%s", buf.buf); -- ret = -BCH_ERR_fsck_repair_unimplemented; -- goto err; -- } -- -- if (bpos_lt(bucket, s->bucket_start) || -- bpos_gt(bucket, s->bucket_end)) -- goto out; -+ if (bpos_lt(bp->k.p, s->bp_start) || -+ bpos_gt(bp->k.p, s->bp_end)) -+ return 0; - -- bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, -- bucket_pos_to_bp(ca, bucket, bp.bucket_offset), -- 0); -- ret = bkey_err(bp_k); -+ struct btree_iter bp_iter; -+ struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); -+ int ret = bkey_err(bp_k); - if (ret) - goto err; - - if (bp_k.k->type != KEY_TYPE_backpointer || -- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { -+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { - ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); - if (ret) - goto err; -@@ -570,7 +494,6 @@ static int check_bp_exists(struct btree_trans *trans, - fsck_err: - bch2_trans_iter_exit(trans, &other_extent_iter); - bch2_trans_iter_exit(trans, &bp_iter); -- bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; - check_existing_bp: -@@ -578,10 +501,10 @@ static int check_bp_exists(struct btree_trans *trans, - if (bp_k.k->type != KEY_TYPE_backpointer) - goto missing; - -- struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; -+ struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); - - struct bkey_s_c other_extent = -- bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); -+ bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); - ret = bkey_err(other_extent); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - ret = 0; -@@ -600,19 +523,23 @@ static int check_bp_exists(struct btree_trans *trans, - bch_err(c, "%s", buf.buf); - - if (other_extent.k->size <= orig_k.k->size) { -- ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); -+ ret = drop_dev_and_update(trans, other_bp.v->btree_id, -+ other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } else { -- ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); -+ ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); - if (ret) - goto err; - goto missing; - } - } - -- ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); -+ ret = check_extent_checksum(trans, -+ other_bp.v->btree_id, other_extent, -+ bp->v.btree_id, orig_k, -+ bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { -@@ -620,7 +547,8 @@ static int check_bp_exists(struct btree_trans *trans, - goto missing; - } - -- ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); -+ ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, -+ other_bp.v->btree_id, other_extent, bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { -@@ -629,7 +557,7 @@ static int check_bp_exists(struct btree_trans *trans, - } - - printbuf_reset(&buf); -- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); -+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_str(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, other_extent); -@@ -638,21 +566,15 @@ static int check_bp_exists(struct btree_trans *trans, - goto err; - missing: - printbuf_reset(&buf); -- prt_printf(&buf, "missing backpointer for btree=%s l=%u ", -- bch2_btree_id_str(bp.btree_id), bp.level); -+ prt_str(&buf, "missing backpointer\n for: "); - bch2_bkey_val_to_text(&buf, c, orig_k); -- prt_printf(&buf, "\n got: "); -+ prt_printf(&buf, "\n want: "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); -+ prt_printf(&buf, "\n got: "); - bch2_bkey_val_to_text(&buf, c, bp_k); - -- struct bkey_i_backpointer n_bp_k; -- bkey_backpointer_init(&n_bp_k.k_i); -- n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); -- n_bp_k.v = bp; -- prt_printf(&buf, "\n want: "); -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); -- - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) -- ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); -+ ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); - - goto out; - } -@@ -663,31 +585,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans, - struct bkey_s_c k) - { - struct bch_fs *c = trans->c; -- struct bkey_ptrs_c ptrs; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; -- int ret; - -- ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -- struct bpos bucket_pos = POS_MIN; -- struct bch_backpointer bp; -- - if (p.ptr.cached) - continue; - -+ if (p.ptr.dev == BCH_SB_MEMBER_INVALID) -+ continue; -+ - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); -- if (ca) -- bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); -+ bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); -+ bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); - rcu_read_unlock(); - -- if (!ca) -- continue; -+ if (check || empty) { -+ struct bkey_i_backpointer bp; -+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - -- ret = check_bp_exists(trans, s, bucket_pos, bp, k); -- if (ret) -- return ret; -+ int ret = check -+ ? check_bp_exists(trans, s, &bp, k) -+ : bch2_bucket_backpointer_mod(trans, k, &bp, true); -+ if (ret) -+ return ret; -+ } - } - - return 0; -@@ -896,54 +820,330 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - return 0; - } - -+enum alloc_sector_counter { -+ ALLOC_dirty, -+ ALLOC_cached, -+ ALLOC_stripe, -+ ALLOC_SECTORS_NR -+}; -+ -+static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) -+{ -+ switch (t) { -+ case BCH_DATA_btree: -+ case BCH_DATA_user: -+ return ALLOC_dirty; -+ case BCH_DATA_cached: -+ return ALLOC_cached; -+ case BCH_DATA_stripe: -+ return ALLOC_stripe; -+ default: -+ BUG(); -+ } -+} -+ -+static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); -+ -+static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, -+ struct bkey_buf *last_flushed) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); -+ bool need_commit = false; -+ -+ if (a->data_type == BCH_DATA_sb || -+ a->data_type == BCH_DATA_journal || -+ a->data_type == BCH_DATA_parity) -+ return 0; -+ -+ u32 sectors[ALLOC_SECTORS_NR]; -+ memset(sectors, 0, sizeof(sectors)); -+ -+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); -+ if (!ca) -+ return 0; -+ -+ struct btree_iter iter; -+ struct bkey_s_c bp_k; -+ int ret = 0; -+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, -+ bucket_pos_to_bp_start(ca, alloc_k.k->p), -+ bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { -+ if (bp_k.k->type != KEY_TYPE_backpointer) -+ continue; -+ -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); -+ -+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && -+ (bp.v->bucket_gen != a->gen || -+ bp.v->pad)) { -+ ret = bch2_backpointer_del(trans, bp_k.k->p); -+ if (ret) -+ break; -+ -+ need_commit = true; -+ continue; -+ } -+ -+ if (bp.v->bucket_gen != a->gen) -+ continue; -+ -+ sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; -+ }; -+ bch2_trans_iter_exit(trans, &iter); -+ if (ret) -+ goto err; -+ -+ if (need_commit) { -+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -+ if (ret) -+ goto err; -+ } -+ -+ /* Cached pointers don't have backpointers: */ -+ -+ if (sectors[ALLOC_dirty] != a->dirty_sectors || -+ sectors[ALLOC_stripe] != a->stripe_sectors) { -+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { -+ ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); -+ if (ret) -+ goto err; -+ } -+ -+ if (sectors[ALLOC_dirty] > a->dirty_sectors || -+ sectors[ALLOC_stripe] > a->stripe_sectors) { -+ ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: -+ -BCH_ERR_transaction_restart_nested; -+ goto err; -+ } -+ -+ if (!sectors[ALLOC_dirty] && -+ !sectors[ALLOC_stripe]) -+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); -+ else -+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); -+ } -+err: -+ bch2_dev_put(ca); -+ return ret; -+} -+ -+static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr_v2: { -+ bool ret = false; -+ -+ rcu_read_lock(); -+ struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; -+ while (pos.inode <= k.k->p.inode) { -+ if (pos.inode >= c->sb.nr_devices) -+ break; -+ -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); -+ if (!ca) -+ goto next; -+ -+ struct bpos bucket = bp_pos_to_bucket(ca, pos); -+ bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, -+ ca->mi.nbuckets, bucket.offset); -+ if (bucket.offset == ca->mi.nbuckets) -+ goto next; -+ -+ ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); -+ if (ret) -+ break; -+next: -+ pos = SPOS(pos.inode + 1, 0, 0); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+ } -+ case KEY_TYPE_btree_ptr: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, -+ enum btree_id btree, unsigned level) -+{ -+ struct btree_iter iter; -+ bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); -+ struct btree *b = bch2_btree_iter_peek_node(&iter); -+ int ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ goto err; -+ -+ if (b) -+ bch2_node_pin(trans->c, b); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, -+ struct bpos start, struct bpos *end) -+{ -+ struct bch_fs *c = trans->c; -+ int ret = 0; -+ -+ struct bkey_buf tmp; -+ bch2_bkey_buf_init(&tmp); -+ -+ bch2_btree_cache_unpin(c); -+ -+ *end = SPOS_MAX; -+ -+ s64 mem_may_pin = mem_may_pin_bytes(c); -+ struct btree_iter iter; -+ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, -+ 0, 1, BTREE_ITER_prefetch); -+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({ -+ if (!backpointer_node_has_missing(c, k)) -+ continue; -+ -+ mem_may_pin -= c->opts.btree_node_size; -+ if (mem_may_pin <= 0) -+ break; -+ -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ struct btree_path *path = btree_iter_path(trans, &iter); -+ -+ BUG_ON(path->level != 1); -+ -+ bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); -+ })); -+ if (ret) -+ return ret; -+ -+ struct bpos pinned = SPOS_MAX; -+ mem_may_pin = mem_may_pin_bytes(c); -+ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, -+ 0, 1, BTREE_ITER_prefetch); -+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({ -+ if (!backpointer_node_has_missing(c, k)) -+ continue; -+ -+ mem_may_pin -= c->opts.btree_node_size; -+ if (mem_may_pin <= 0) { -+ *end = pinned; -+ break; -+ } -+ -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ struct btree_path *path = btree_iter_path(trans, &iter); -+ -+ BUG_ON(path->level != 1); -+ -+ int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); -+ -+ if (!ret2) -+ pinned = tmp.k->k.p; -+ -+ ret; -+ })); -+ if (ret) -+ return ret; -+ -+ return ret; -+} -+ - int bch2_check_extents_to_backpointers(struct bch_fs *c) - { -+ int ret = 0; -+ -+ /* -+ * Can't allow devices to come/go/resize while we have bucket bitmaps -+ * allocated -+ */ -+ lockdep_assert_held(&c->state_lock); -+ -+ for_each_member_device(c, ca) { -+ BUG_ON(ca->bucket_backpointer_mismatches); -+ ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), -+ sizeof(unsigned long), -+ GFP_KERNEL); -+ ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), -+ sizeof(unsigned long), -+ GFP_KERNEL); -+ if (!ca->bucket_backpointer_mismatches || -+ !ca->bucket_backpointer_empty) { -+ bch2_dev_put(ca); -+ ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; -+ goto err_free_bitmaps; -+ } -+ } -+ - struct btree_trans *trans = bch2_trans_get(c); -- struct extents_to_bp_state s = { .bucket_start = POS_MIN }; -- int ret; -+ struct extents_to_bp_state s = { .bp_start = POS_MIN }; - - bch2_bkey_buf_init(&s.last_flushed); - bkey_init(&s.last_flushed.k->k); - -+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, -+ POS_MIN, BTREE_ITER_prefetch, k, ({ -+ check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); -+ })); -+ if (ret) -+ goto err; -+ -+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; -+ for_each_member_device(c, ca) { -+ nr_buckets += ca->mi.nbuckets; -+ nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); -+ nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); -+ } -+ -+ if (!nr_mismatches && !nr_empty) -+ goto err; -+ -+ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", -+ nr_mismatches + nr_empty, nr_buckets); -+ - while (1) { -- struct bbpos end; -- ret = bch2_get_btree_in_memory_pos(trans, -- BIT_ULL(BTREE_ID_backpointers), -- BIT_ULL(BTREE_ID_backpointers), -- BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); -+ ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); - if (ret) - break; - -- s.bucket_end = end.pos; -- -- if ( bpos_eq(s.bucket_start, POS_MIN) && -- !bpos_eq(s.bucket_end, SPOS_MAX)) -+ if ( bpos_eq(s.bp_start, POS_MIN) && -+ !bpos_eq(s.bp_end, SPOS_MAX)) - bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - -- if (!bpos_eq(s.bucket_start, POS_MIN) || -- !bpos_eq(s.bucket_end, SPOS_MAX)) { -+ if (!bpos_eq(s.bp_start, POS_MIN) || -+ !bpos_eq(s.bp_end, SPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_extents_to_backpointers(): "); -- bch2_bpos_to_text(&buf, s.bucket_start); -+ bch2_bpos_to_text(&buf, s.bp_start); - prt_str(&buf, "-"); -- bch2_bpos_to_text(&buf, s.bucket_end); -+ bch2_bpos_to_text(&buf, s.bp_end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_extents_to_backpointers_pass(trans, &s); -- if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) -+ if (ret || bpos_eq(s.bp_end, SPOS_MAX)) - break; - -- s.bucket_start = bpos_successor(s.bucket_end); -+ s.bp_start = bpos_successor(s.bp_end); - } -+err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&s.last_flushed, c); -- - bch2_btree_cache_unpin(c); -+err_free_bitmaps: -+ for_each_member_device(c, ca) { -+ kvfree(ca->bucket_backpointer_empty); -+ ca->bucket_backpointer_empty = NULL; -+ kvfree(ca->bucket_backpointer_mismatches); -+ ca->bucket_backpointer_mismatches = NULL; -+ } - - bch_err_fn(c, ret); - return ret; -@@ -959,44 +1159,43 @@ static int check_one_backpointer(struct btree_trans *trans, - return 0; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); -- struct bch_fs *c = trans->c; -- struct btree_iter iter; - struct bbpos pos = bp_to_bbpos(*bp.v); -- struct bkey_s_c k; -- struct printbuf buf = PRINTBUF; -- int ret; - - if (bbpos_cmp(pos, start) < 0 || - bbpos_cmp(pos, end) > 0) - return 0; - -- k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); -- ret = bkey_err(k); -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); -+ int ret = bkey_err(k); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - return 0; - if (ret) - return ret; - -- if (!k.k) { -- ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); -- if (ret) -- goto out; -- -- if (fsck_err(trans, backpointer_to_missing_ptr, -- "backpointer for missing %s\n %s", -- bp.v->level ? "btree node" : "extent", -- (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { -- ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); -- goto out; -- } -- } --out: --fsck_err: - bch2_trans_iter_exit(trans, &iter); -- printbuf_exit(&buf); - return ret; - } - -+static int check_bucket_backpointers_to_extents(struct btree_trans *trans, -+ struct bch_dev *ca, struct bpos bucket) -+{ -+ u32 restart_count = trans->restart_count; -+ struct bkey_buf last_flushed; -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ -+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, -+ bucket_pos_to_bp_start(ca, bucket), -+ bucket_pos_to_bp_end(ca, bucket), -+ 0, k, -+ check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) -+ ); -+ -+ bch2_bkey_buf_exit(&last_flushed, trans->c); -+ return ret ?: trans_was_restarted(trans, restart_count); -+} -+ - static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - struct bbpos start, - struct bbpos end) -@@ -1009,9 +1208,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - bkey_init(&last_flushed.k->k); - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - -- int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, -- POS_MIN, BTREE_ITER_prefetch, k, -- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, -+ POS_MIN, BTREE_ITER_prefetch, k, ({ - progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, k, &last_flushed); - })); -diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h -index 3b29fdf519dd..060dad1521ee 100644 ---- a/fs/bcachefs/backpointers.h -+++ b/fs/bcachefs/backpointers.h -@@ -18,14 +18,14 @@ static inline u64 swab40(u64 x) - ((x & 0xff00000000ULL) >> 32)); - } - --int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); --void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); --void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, -+ struct bkey_validate_context); -+void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - void bch2_backpointer_swab(struct bkey_s); - - #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_validate = bch2_backpointer_validate, \ -- .val_to_text = bch2_backpointer_k_to_text, \ -+ .val_to_text = bch2_backpointer_to_text, \ - .swab = bch2_backpointer_swab, \ - .min_val_size = 32, \ - }) -@@ -43,22 +43,24 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos - return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); - } - -+static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, -+ u32 *bucket_offset) -+{ -+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; -+ -+ return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); -+} -+ - static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) - { - rcu_read_lock(); -- struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); - if (ca) - *bucket = bp_pos_to_bucket(ca, bp_pos); - rcu_read_unlock(); - return ca != NULL; - } - --static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) --{ -- return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), -- c, "backpointer for missing device %llu", bp_pos.inode); --} -- - static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -@@ -80,31 +82,35 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, - return ret; - } - --int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, -- struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); -+static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket) -+{ -+ return bucket_pos_to_bp(ca, bucket, 0); -+} -+ -+static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) -+{ -+ return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); -+} -+ -+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, -+ struct bkey_s_c, -+ struct bkey_i_backpointer *, -+ bool); - - static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, -- struct bch_dev *ca, -- struct bpos bucket, -- struct bch_backpointer bp, - struct bkey_s_c orig_k, -+ struct bkey_i_backpointer *bp, - bool insert) - { - if (unlikely(bch2_backpointers_no_use_write_buffer)) -- return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); -- -- struct bkey_i_backpointer bp_k; -- -- bkey_backpointer_init(&bp_k.k_i); -- bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); -- bp_k.v = bp; -+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); - - if (!insert) { -- bp_k.k.type = KEY_TYPE_deleted; -- set_bkey_val_u64s(&bp_k.k, 0); -+ bp->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&bp->k, 0); - } - -- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); -+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); - } - - static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, -@@ -134,44 +140,29 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, - } - } - --static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, -+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, -- struct bpos *bucket_pos, struct bch_backpointer *bp, -- u64 sectors) -+ struct bkey_i_backpointer *bp) - { -- u32 bucket_offset; -- *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); -- *bp = (struct bch_backpointer) { -+ bkey_backpointer_init(&bp->k_i); -+ bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); -+ bp->v = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, - .data_type = bch2_bkey_ptr_data_type(k, p, entry), -- .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + -- p.crc.offset, -- .bucket_len = sectors, -+ .bucket_gen = p.ptr.gen, -+ .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p), - .pos = k.k->p, - }; - } - --static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, -- enum btree_id btree_id, unsigned level, -- struct bkey_s_c k, struct extent_ptr_decoded p, -- const union bch_extent_entry *entry, -- struct bpos *bucket_pos, struct bch_backpointer *bp) --{ -- u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); -- -- __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); --} -- --int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, -- struct bpos *, struct bch_backpointer *, unsigned); --struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, -- struct bpos, struct bch_backpointer, -- unsigned); --struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, -- struct bpos, struct bch_backpointer); -+struct bkey_buf; -+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, -+ struct btree_iter *, unsigned, struct bkey_buf *); -+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, -+ struct btree_iter *, struct bkey_buf *); - - int bch2_check_btree_backpointers(struct bch_fs *); - int bch2_check_extents_to_backpointers(struct bch_fs *); -diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h -index be2edced5213..63abe17f35ea 100644 ---- a/fs/bcachefs/bbpos.h -+++ b/fs/bcachefs/bbpos.h -@@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) - - static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) - { -- prt_str(out, bch2_btree_id_str(pos.btree)); -+ bch2_btree_id_to_text(out, pos.btree); - prt_char(out, ':'); - bch2_bpos_to_text(out, pos.pos); - } -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -index e94a83b8113e..161cf2f05d2a 100644 ---- a/fs/bcachefs/bcachefs.h -+++ b/fs/bcachefs/bcachefs.h -@@ -205,6 +205,7 @@ - #include - - #include "bcachefs_format.h" -+#include "btree_journal_iter_types.h" - #include "disk_accounting_types.h" - #include "errcode.h" - #include "fifo.h" -@@ -293,6 +294,8 @@ do { \ - - #define bch_info(c, fmt, ...) \ - bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_info_ratelimited(c, fmt, ...) \ -+ bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) - #define bch_notice(c, fmt, ...) \ - bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) - #define bch_warn(c, fmt, ...) \ -@@ -352,6 +355,12 @@ do { \ - bch_info(c, fmt, ##__VA_ARGS__); \ - } while (0) - -+#define bch_verbose_ratelimited(c, fmt, ...) \ -+do { \ -+ if ((c)->opts.verbose) \ -+ bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ -+} while (0) -+ - #define pr_verbose_init(opts, fmt, ...) \ - do { \ - if (opt_get(opts, verbose)) \ -@@ -538,20 +547,20 @@ struct bch_dev { - - /* - * Buckets: -- * Per-bucket arrays are protected by c->mark_lock, bucket_lock and -- * gc_gens_lock, for device resize - holding any is sufficient for -- * access: Or rcu_read_lock(), but only for dev_ptr_stale(): -+ * Per-bucket arrays are protected by either rcu_read_lock or -+ * state_lock, for device resize. - */ - GENRADIX(struct bucket) buckets_gc; - struct bucket_gens __rcu *bucket_gens; - u8 *oldest_gen; - unsigned long *buckets_nouse; -- struct rw_semaphore bucket_lock; -+ -+ unsigned long *bucket_backpointer_mismatches; -+ unsigned long *bucket_backpointer_empty; - - struct bch_dev_usage __percpu *usage; - - /* Allocator: */ -- u64 new_fs_bucket_idx; - u64 alloc_cursor[3]; - - unsigned nr_open_buckets; -@@ -606,6 +615,7 @@ struct bch_dev { - x(going_ro) \ - x(write_disable_complete) \ - x(clean_shutdown) \ -+ x(recovery_running) \ - x(fsck_running) \ - x(initial_gc_unfixed) \ - x(need_delete_dead_snapshots) \ -@@ -650,28 +660,6 @@ struct journal_seq_blacklist_table { - } entries[]; - }; - --struct journal_keys { -- /* must match layout in darray_types.h */ -- size_t nr, size; -- struct journal_key { -- u64 journal_seq; -- u32 journal_offset; -- enum btree_id btree_id:8; -- unsigned level:8; -- bool allocated; -- bool overwritten; -- struct bkey_i *k; -- } *data; -- /* -- * Gap buffer: instead of all the empty space in the array being at the -- * end of the buffer - from @nr to @size - the empty space is at @gap. -- * This means that sequential insertions are O(n) instead of O(n^2). -- */ -- size_t gap; -- atomic_t ref; -- bool initial_ref_held; --}; -- - struct btree_trans_buf { - struct btree_trans *trans; - }; -@@ -680,6 +668,7 @@ struct btree_trans_buf { - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - - #define BCH_WRITE_REFS() \ -+ x(journal) \ - x(trans) \ - x(write) \ - x(promote) \ -@@ -692,6 +681,7 @@ struct btree_trans_buf { - x(dio_write) \ - x(discard) \ - x(discard_fast) \ -+ x(check_discard_freespace_key) \ - x(invalidate) \ - x(delete_dead_snapshots) \ - x(gc_gens) \ -@@ -734,6 +724,12 @@ struct bch_fs { - #else - struct percpu_ref writes; - #endif -+ /* -+ * Certain operations are only allowed in single threaded mode, during -+ * recovery, and we want to assert that this is the case: -+ */ -+ struct task_struct *recovery_task; -+ - /* - * Analagous to c->writes, for asynchronous ops that don't necessarily - * need fs to be read-write -@@ -764,6 +760,8 @@ struct bch_fs { - __uuid_t user_uuid; - - u16 version; -+ u16 version_incompat; -+ u16 version_incompat_allowed; - u16 version_min; - u16 version_upgrade_complete; - -@@ -834,9 +832,10 @@ struct bch_fs { - struct work_struct btree_interior_update_work; - - struct workqueue_struct *btree_node_rewrite_worker; -- -- struct list_head pending_node_rewrites; -- struct mutex pending_node_rewrites_lock; -+ struct list_head btree_node_rewrites; -+ struct list_head btree_node_rewrites_pending; -+ spinlock_t btree_node_rewrites_lock; -+ struct closure_waitlist btree_node_rewrites_wait; - - /* btree_io.c: */ - spinlock_t btree_write_error_lock; -@@ -967,8 +966,7 @@ struct bch_fs { - struct rhashtable promote_table; - - mempool_t compression_bounce[2]; -- mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; -- mempool_t decompress_workspace; -+ mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - size_t zstd_workspace_size; - - struct crypto_shash *sha256; -@@ -1027,6 +1025,7 @@ struct bch_fs { - struct list_head vfs_inodes_list; - struct mutex vfs_inodes_lock; - struct rhashtable vfs_inodes_table; -+ struct rhltable vfs_inodes_by_inum_table; - - /* VFS IO PATH - fs-io.c */ - struct bio_set writepage_bioset; -@@ -1048,10 +1047,12 @@ struct bch_fs { - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_recovery_pass; -+ enum bch_recovery_pass next_recovery_pass; - /* bitmask of recovery passes that we actually ran */ - u64 recovery_passes_complete; - /* never rewinds version of curr_recovery_pass */ - enum bch_recovery_pass recovery_pass_done; -+ spinlock_t recovery_pass_lock; - struct semaphore online_fsck_mutex; - - /* DEBUG JUNK */ -@@ -1062,9 +1063,6 @@ struct bch_fs { - struct btree_node *verify_ondisk; - struct mutex verify_lock; - -- u64 *unused_inode_hints; -- unsigned inode_shard_bits; -- - /* - * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them -@@ -1086,8 +1084,6 @@ struct bch_fs { - u64 counters_on_mount[BCH_COUNTER_NR]; - u64 __percpu *counters; - -- unsigned copy_gc_enabled:1; -- - struct bch2_time_stats times[BCH_TIME_STAT_NR]; - - struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; -diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -index 5004f6ba997c..f70f0108401f 100644 ---- a/fs/bcachefs/bcachefs_format.h -+++ b/fs/bcachefs/bcachefs_format.h -@@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) - x(snapshot_tree, 31) \ - x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) \ -- x(accounting, 34) -+ x(accounting, 34) \ -+ x(inode_alloc_cursor, 35) - - enum bch_bkey_type { - #define x(name, nr) KEY_TYPE_##name = nr, -@@ -463,7 +464,8 @@ struct bch_backpointer { - __u8 btree_id; - __u8 level; - __u8 data_type; -- __u64 bucket_offset:40; -+ __u8 bucket_gen; -+ __u32 pad; - __u32 bucket_len; - struct bpos pos; - } __packed __aligned(8); -@@ -499,8 +501,6 @@ struct bch_sb_field { - #include "disk_groups_format.h" - #include "extents_format.h" - #include "ec_format.h" --#include "dirent_format.h" --#include "disk_groups_format.h" - #include "inode_format.h" - #include "journal_seq_blacklist_format.h" - #include "logged_ops_format.h" -@@ -679,7 +679,14 @@ struct bch_sb_field_ext { - x(disk_accounting_v3, BCH_VERSION(1, 10)) \ - x(disk_accounting_inum, BCH_VERSION(1, 11)) \ - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ -- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) -+ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ -+ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ -+ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ -+ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ -+ x(inode_depth, BCH_VERSION(1, 17)) \ -+ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ -+ x(autofix_errors, BCH_VERSION(1, 19)) \ -+ x(directory_size, BCH_VERSION(1, 20)) - - enum bcachefs_metadata_version { - bcachefs_metadata_version_min = 9, -@@ -844,6 +851,10 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, - struct bch_sb, flags[5], 0, 16); - LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, - struct bch_sb, flags[5], 16, 32); -+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); -+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, -+ struct bch_sb, flags[5], 48, 64); -+LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); - - static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) - { -@@ -896,21 +907,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u - x(new_varint, 15) \ - x(journal_no_flush, 16) \ - x(alloc_v2, 17) \ -- x(extents_across_btree_nodes, 18) -+ x(extents_across_btree_nodes, 18) \ -+ x(incompat_version_field, 19) - - #define BCH_SB_FEATURES_ALWAYS \ -- ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ -- (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ -- (1ULL << BCH_FEATURE_btree_updates_journalled)|\ -- (1ULL << BCH_FEATURE_alloc_v2)|\ -- (1ULL << BCH_FEATURE_extents_across_btree_nodes)) -+ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ -+ BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ -+ BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ -+ BIT_ULL(BCH_FEATURE_alloc_v2)|\ -+ BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) - - #define BCH_SB_FEATURES_ALL \ - (BCH_SB_FEATURES_ALWAYS| \ -- (1ULL << BCH_FEATURE_new_siphash)| \ -- (1ULL << BCH_FEATURE_btree_ptr_v2)| \ -- (1ULL << BCH_FEATURE_new_varint)| \ -- (1ULL << BCH_FEATURE_journal_no_flush)) -+ BIT_ULL(BCH_FEATURE_new_siphash)| \ -+ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ -+ BIT_ULL(BCH_FEATURE_new_varint)| \ -+ BIT_ULL(BCH_FEATURE_journal_no_flush)) - - enum bch_sb_feature { - #define x(f, n) BCH_FEATURE_##f, -@@ -1032,7 +1044,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) - x(crc64, 2) \ - x(xxhash, 3) - --enum bch_csum_opts { -+enum bch_csum_opt { - #define x(t, n) BCH_CSUM_OPT_##t = n, - BCH_CSUM_OPTS() - #undef x -@@ -1221,6 +1233,15 @@ struct jset_entry_log { - u8 d[]; - } __packed __aligned(8); - -+static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) -+{ -+ unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); -+ -+ while (b && !l->d[b - 1]) -+ --b; -+ return b; -+} -+ - struct jset_entry_datetime { - struct jset_entry entry; - __le64 seconds; -@@ -1268,14 +1289,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); - /* Btree: */ - - enum btree_id_flags { -- BTREE_ID_EXTENTS = BIT(0), -- BTREE_ID_SNAPSHOTS = BIT(1), -- BTREE_ID_SNAPSHOT_FIELD = BIT(2), -- BTREE_ID_DATA = BIT(3), -+ BTREE_IS_extents = BIT(0), -+ BTREE_IS_snapshots = BIT(1), -+ BTREE_IS_snapshot_field = BIT(2), -+ BTREE_IS_data = BIT(3), -+ BTREE_IS_write_buffer = BIT(4), - }; - - #define BCH_BTREE_IDS() \ -- x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ -+ x(extents, 0, \ -+ BTREE_IS_extents| \ -+ BTREE_IS_snapshots| \ -+ BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_error)| \ - BIT_ULL(KEY_TYPE_cookie)| \ -@@ -1283,17 +1308,20 @@ enum btree_id_flags { - BIT_ULL(KEY_TYPE_reservation)| \ - BIT_ULL(KEY_TYPE_reflink_p)| \ - BIT_ULL(KEY_TYPE_inline_data)) \ -- x(inodes, 1, BTREE_ID_SNAPSHOTS, \ -+ x(inodes, 1, \ -+ BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_inode)| \ - BIT_ULL(KEY_TYPE_inode_v2)| \ - BIT_ULL(KEY_TYPE_inode_v3)| \ - BIT_ULL(KEY_TYPE_inode_generation)) \ -- x(dirents, 2, BTREE_ID_SNAPSHOTS, \ -+ x(dirents, 2, \ -+ BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_dirent)) \ -- x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ -+ x(xattrs, 3, \ -+ BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ -@@ -1307,7 +1335,9 @@ enum btree_id_flags { - BIT_ULL(KEY_TYPE_quota)) \ - x(stripes, 6, 0, \ - BIT_ULL(KEY_TYPE_stripe)) \ -- x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ -+ x(reflink, 7, \ -+ BTREE_IS_extents| \ -+ BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_reflink_v)| \ - BIT_ULL(KEY_TYPE_indirect_inline_data)| \ - BIT_ULL(KEY_TYPE_error)) \ -@@ -1315,28 +1345,38 @@ enum btree_id_flags { - BIT_ULL(KEY_TYPE_subvolume)) \ - x(snapshots, 9, 0, \ - BIT_ULL(KEY_TYPE_snapshot)) \ -- x(lru, 10, 0, \ -+ x(lru, 10, \ -+ BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ -- x(freespace, 11, BTREE_ID_EXTENTS, \ -+ x(freespace, 11, \ -+ BTREE_IS_extents, \ - BIT_ULL(KEY_TYPE_set)) \ - x(need_discard, 12, 0, \ - BIT_ULL(KEY_TYPE_set)) \ -- x(backpointers, 13, 0, \ -+ x(backpointers, 13, \ -+ BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_backpointer)) \ - x(bucket_gens, 14, 0, \ - BIT_ULL(KEY_TYPE_bucket_gens)) \ - x(snapshot_trees, 15, 0, \ - BIT_ULL(KEY_TYPE_snapshot_tree)) \ -- x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ -+ x(deleted_inodes, 16, \ -+ BTREE_IS_snapshot_field| \ -+ BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(logged_ops, 17, 0, \ - BIT_ULL(KEY_TYPE_logged_op_truncate)| \ -- BIT_ULL(KEY_TYPE_logged_op_finsert)) \ -- x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ -+ BIT_ULL(KEY_TYPE_logged_op_finsert)| \ -+ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ -+ x(rebalance_work, 18, \ -+ BTREE_IS_snapshot_field| \ -+ BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ - x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) \ -- x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ -+ x(accounting, 20, \ -+ BTREE_IS_snapshot_field| \ -+ BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_accounting)) \ - - enum btree_id { -@@ -1361,6 +1401,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: -+ case BTREE_ID_lru: -+ case BTREE_ID_accounting: - return true; - default: - return false; -diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h -index 41df24a53d97..054e2d5e8448 100644 ---- a/fs/bcachefs/bkey.h -+++ b/fs/bcachefs/bkey.h -@@ -9,13 +9,6 @@ - #include "util.h" - #include "vstructs.h" - --enum bch_validate_flags { -- BCH_VALIDATE_write = BIT(0), -- BCH_VALIDATE_commit = BIT(1), -- BCH_VALIDATE_journal = BIT(2), -- BCH_VALIDATE_silent = BIT(3), --}; -- - #if 0 - - /* -diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c -index e7ac227ba7e8..15c93576b5c2 100644 ---- a/fs/bcachefs/bkey_methods.c -+++ b/fs/bcachefs/bkey_methods.c -@@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = { - }; - - static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - return 0; - } -@@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - }) - - static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -59,7 +59,7 @@ static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - }) - - static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - return 0; - } -@@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, - }) - - static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - return 0; - } -@@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = { - }; - - int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; -@@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - if (!ops->key_validate) - return 0; - -- ret = ops->key_validate(c, k, flags); -+ ret = ops->key_validate(c, k, from); - fsck_err: - return ret; - } -@@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) - } - - int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, -- enum btree_node_type type, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { -+ enum btree_node_type type = __btree_node_type(from.level, from.btree); -+ - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - -@@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - return 0; - - bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && -- (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && -+ (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), - c, bkey_invalid_type_for_btree, - "invalid key type for btree %s (%s)", -@@ -228,15 +229,15 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - } - - int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, -- enum btree_node_type type, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { -- return __bch2_bkey_validate(c, k, type, flags) ?: -- bch2_bkey_val_validate(c, k, flags); -+ return __bch2_bkey_validate(c, k, from) ?: -+ bch2_bkey_val_validate(c, k, from); - } - - int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, -- struct bkey_s_c k, enum bch_validate_flags flags) -+ struct bkey_s_c k, -+ struct bkey_validate_context from) - { - int ret = 0; - -diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h -index 018fb72e32d3..bf34111cdf00 100644 ---- a/fs/bcachefs/bkey_methods.h -+++ b/fs/bcachefs/bkey_methods.h -@@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; - */ - struct bkey_ops { - int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags); -+ struct bkey_validate_context from); - void (*val_to_text)(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - void (*swab)(struct bkey_s); -@@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) - : &bch2_bkey_null_ops; - } - --int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); --int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, -- enum bch_validate_flags); --int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, -- enum bch_validate_flags); -+int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); -+int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); -+int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context from); - - void bch2_bpos_to_text(struct printbuf *, struct bpos); - void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h -index c9ae9e42b385..b4f328f9853c 100644 ---- a/fs/bcachefs/bkey_types.h -+++ b/fs/bcachefs/bkey_types.h -@@ -210,4 +210,32 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ - BCH_BKEY_TYPES(); - #undef x - -+enum bch_validate_flags { -+ BCH_VALIDATE_write = BIT(0), -+ BCH_VALIDATE_commit = BIT(1), -+ BCH_VALIDATE_silent = BIT(2), -+}; -+ -+#define BKEY_VALIDATE_CONTEXTS() \ -+ x(unknown) \ -+ x(superblock) \ -+ x(journal) \ -+ x(btree_root) \ -+ x(btree_node) \ -+ x(commit) -+ -+struct bkey_validate_context { -+ enum { -+#define x(n) BKEY_VALIDATE_##n, -+ BKEY_VALIDATE_CONTEXTS() -+#undef x -+ } from:8; -+ enum bch_validate_flags flags:8; -+ u8 level; -+ enum btree_id btree; -+ bool root:1; -+ unsigned journal_offset; -+ u64 journal_seq; -+}; -+ - #endif /* _BCACHEFS_BKEY_TYPES_H */ -diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -index 7123019ab3bc..1ec1f90e0eb3 100644 ---- a/fs/bcachefs/btree_cache.c -+++ b/fs/bcachefs/btree_cache.c -@@ -24,7 +24,10 @@ do { \ - } while (0) - - const char * const bch2_btree_node_flags[] = { --#define x(f) #f, -+ "typebit", -+ "typebit", -+ "typebit", -+#define x(f) [BTREE_NODE_##f] = #f, - BTREE_FLAGS() - #undef x - NULL -@@ -200,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) - return NULL; - } - -- bch2_btree_lock_init(&b->c, 0); -+ bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - - __bch2_btree_node_to_freelist(bc, b); - return b; -@@ -222,7 +225,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); -- BUG_ON(!__btree_node_pinned(bc, b)); - if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { - set_btree_node_pinned(b); - list_move(&b->list, &bc->live[1].list); -@@ -326,7 +328,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - -- bch2_btree_node_hash_remove(&c->btree_cache, b); -+ __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -@@ -793,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea - } - - b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); -- if (!b) { -+ if (b) { -+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); -+ } else { - mutex_unlock(&bc->lock); - bch2_trans_unlock(trans); - b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - goto err; -+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - mutex_lock(&bc->lock); - } - -- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); -- - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); - -@@ -1004,16 +1007,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) - return; - - prt_printf(&buf, -- "btree node header doesn't match ptr\n" -- "btree %s level %u\n" -- "ptr: ", -- bch2_btree_id_str(b->c.btree_id), b->c.level); -+ "btree node header doesn't match ptr: "); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_str(&buf, "\nptr: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - -- prt_printf(&buf, "\nheader: btree %s level %llu\n" -- "min ", -- bch2_btree_id_str(BTREE_NODE_ID(b->data)), -- BTREE_NODE_LEVEL(b->data)); -+ prt_str(&buf, "\nheader: "); -+ bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); -+ prt_str(&buf, "\nmin "); - bch2_bpos_to_text(&buf, b->data->min_key); - - prt_printf(&buf, "\nmax "); -@@ -1133,7 +1134,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); -- return ERR_PTR(-BCH_ERR_btree_node_read_error); -+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); -@@ -1223,7 +1224,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); -- return ERR_PTR(-BCH_ERR_btree_node_read_error); -+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); -@@ -1305,7 +1306,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, - - if (unlikely(btree_node_read_error(b))) { - six_unlock_read(&b->c.lock); -- b = ERR_PTR(-BCH_ERR_btree_node_read_error); -+ b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - goto out; - } - -@@ -1398,13 +1399,31 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) - prt_printf(out, "(unknown btree %u)", btree); - } - -+void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) -+{ -+ prt_str(out, "btree="); -+ bch2_btree_id_to_text(out, btree); -+ prt_printf(out, " level=%u", level); -+} -+ -+void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, -+ enum btree_id btree, unsigned level, struct bkey_s_c k) -+{ -+ bch2_btree_id_to_text(out, btree); -+ prt_printf(out, " level %u/", level); -+ struct btree_root *r = bch2_btree_id_root(c, btree); -+ if (r) -+ prt_printf(out, "%u", r->level); -+ else -+ prt_printf(out, "(unknown)"); -+ prt_printf(out, "\n "); -+ -+ bch2_bkey_val_to_text(out, c, k); -+} -+ - void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) - { -- prt_printf(out, "%s level %u/%u\n ", -- bch2_btree_id_str(b->c.btree_id), -- b->c.level, -- bch2_btree_id_root(c, b->c.btree_id)->level); -- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); - } - - void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -@@ -1478,8 +1497,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); - prt_newline(out); - -- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) -- prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); -+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { -+ bch2_btree_id_to_text(out, i); -+ prt_printf(out, "\t"); -+ prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); -+ prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); -+ } - - prt_newline(out); - prt_printf(out, "freed:\t%zu\n", bc->nr_freed); -diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h -index 66e86d1a178d..ca3c1b145330 100644 ---- a/fs/bcachefs/btree_cache.h -+++ b/fs/bcachefs/btree_cache.h -@@ -128,19 +128,27 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i - } else { - unsigned idx = id - BTREE_ID_NR; - -- EBUG_ON(idx >= c->btree_roots_extra.nr); -+ /* This can happen when we're called from btree_node_scan */ -+ if (idx >= c->btree_roots_extra.nr) -+ return NULL; -+ - return &c->btree_roots_extra.data[idx]; - } - } - - static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) - { -- return bch2_btree_id_root(c, b->c.btree_id)->b; -+ struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); -+ -+ return r ? r->b : NULL; - } - --const char *bch2_btree_id_str(enum btree_id); -+const char *bch2_btree_id_str(enum btree_id); /* avoid */ - void bch2_btree_id_to_text(struct printbuf *, enum btree_id); -+void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); - -+void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, -+ enum btree_id, unsigned, struct bkey_s_c); - void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); - void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); - void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -index 81dcf9e512c0..dd1d9b74076e 100644 ---- a/fs/bcachefs/btree_gc.c -+++ b/fs/bcachefs/btree_gc.c -@@ -29,6 +29,7 @@ - #include "move.h" - #include "recovery_passes.h" - #include "reflink.h" -+#include "recovery.h" - #include "replicas.h" - #include "super-io.h" - #include "trace.h" -@@ -56,8 +57,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) - { - prt_str(out, bch2_gc_phase_strs[p->phase]); - prt_char(out, ' '); -- bch2_btree_id_to_text(out, p->btree); -- prt_printf(out, " l=%u ", p->level); -+ bch2_btree_id_level_to_text(out, p->btree, p->level); -+ prt_char(out, ' '); - bch2_bpos_to_text(out, p->pos); - } - -@@ -209,8 +210,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * - if (bpos_eq(expected_start, cur->data->min_key)) - return 0; - -- prt_printf(&buf, " at btree %s level %u:\n parent: ", -- bch2_btree_id_str(b->c.btree_id), b->c.level); -+ prt_printf(&buf, " at "); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_printf(&buf, ":\n parent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - if (prev) { -@@ -277,8 +279,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, - if (bpos_eq(child->key.k.p, b->key.k.p)) - return 0; - -- prt_printf(&buf, "at btree %s level %u:\n parent: ", -- bch2_btree_id_str(b->c.btree_id), b->c.level); -+ prt_printf(&buf, " at "); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_printf(&buf, ":\n parent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\n child: "); -@@ -341,14 +344,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct - ret = PTR_ERR_OR_ZERO(cur); - - printbuf_reset(&buf); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); -+ prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), -- trans, btree_node_unreadable, -- "Topology repair: unreadable btree node at btree %s level %u:\n" -+ trans, btree_node_read_error, -+ "Topology repair: unreadable btree node at\n" - " %s", -- bch2_btree_id_str(b->c.btree_id), -- b->c.level - 1, - buf.buf)) { - bch2_btree_node_evict(trans, cur_k.k); - cur = NULL; -@@ -357,11 +360,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct - if (ret) - break; - -- if (!btree_id_is_alloc(b->c.btree_id)) { -- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); -- if (ret) -- break; -- } -+ ret = bch2_btree_lost_data(c, b->c.btree_id); -+ if (ret) -+ break; - continue; - } - -@@ -370,7 +371,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct - break; - - if (bch2_btree_node_is_stale(c, cur)) { -- bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); -+ bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, -@@ -478,14 +479,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct - } - - printbuf_reset(&buf); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - if (mustfix_fsck_err_on(!have_child, - trans, btree_node_topology_interior_node_empty, -- "empty interior btree node at btree %s level %u\n" -- " %s", -- bch2_btree_id_str(b->c.btree_id), -- b->c.level, buf.buf)) -+ "empty interior btree node at %s", buf.buf)) - ret = DROP_THIS_NODE; - err: - fsck_err: -@@ -511,6 +511,7 @@ int bch2_check_topology(struct bch_fs *c) - { - struct btree_trans *trans = bch2_trans_get(c); - struct bpos pulled_from_scan = POS_MIN; -+ struct printbuf buf = PRINTBUF; - int ret = 0; - - bch2_trans_srcu_unlock(trans); -@@ -519,19 +520,22 @@ int bch2_check_topology(struct bch_fs *c) - struct btree_root *r = bch2_btree_id_root(c, i); - bool reconstructed_root = false; - -+ printbuf_reset(&buf); -+ bch2_btree_id_to_text(&buf, i); -+ - if (r->error) { -- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); -+ ret = bch2_btree_lost_data(c, i); - if (ret) - break; - reconstruct_root: -- bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); -+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - - r->alive = false; - r->error = 0; - - if (!bch2_btree_has_scanned_nodes(c, i)) { - mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, -- "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); -+ "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); -@@ -560,13 +564,14 @@ int bch2_check_topology(struct bch_fs *c) - if (!reconstructed_root) - goto reconstruct_root; - -- bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); -+ bch_err(c, "empty btree root %s", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - r->alive = false; - ret = 0; - } - } - fsck_err: -+ printbuf_exit(&buf); - bch2_trans_put(trans); - return ret; - } -@@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c) - { - struct btree_trans *trans = bch2_trans_get(c); - enum btree_id ids[BTREE_ID_NR]; -+ struct printbuf buf = PRINTBUF; - unsigned i; - int ret = 0; - -@@ -727,14 +733,9 @@ static int bch2_gc_btrees(struct bch_fs *c) - continue; - - ret = bch2_gc_btree(trans, btree, true); -- -- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), -- trans, btree_node_read_error, -- "btree node read error for %s", -- bch2_btree_id_str(btree))) -- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - } --fsck_err: -+ -+ printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -@@ -802,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, - old = bch2_alloc_to_v4(k, &old_convert); - gc = new = *old; - -- percpu_down_read(&c->mark_lock); - __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); - - old_gc = gc; -@@ -813,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, - gc.data_type = old->data_type; - gc.dirty_sectors = old->dirty_sectors; - } -- percpu_up_read(&c->mark_lock); - - /* - * gc.data_type doesn't yet include need_discard & need_gc_gen states - -@@ -831,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, - * safe w.r.t. transaction restarts, so fixup the gc_bucket so - * we don't run it twice: - */ -- percpu_down_read(&c->mark_lock); - struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); - gc_m->data_type = gc.data_type; - gc_m->dirty_sectors = gc.dirty_sectors; -- percpu_up_read(&c->mark_lock); - } - - if (fsck_err_on(new.data_type != gc.data_type, -@@ -895,11 +892,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c) - - for_each_member_device(c, ca) { - ret = bch2_trans_run(c, -- for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, -+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_slots|BTREE_ITER_prefetch, k, -- NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_key(trans, &iter, ca, k))); - if (ret) { - bch2_dev_put(ca); -@@ -928,98 +925,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) - return ret; - } - --static int bch2_gc_write_reflink_key(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bkey_s_c k, -- size_t *idx) --{ -- struct bch_fs *c = trans->c; -- const __le64 *refcount = bkey_refcount_c(k); -- struct printbuf buf = PRINTBUF; -- struct reflink_gc *r; -- int ret = 0; -- -- if (!refcount) -- return 0; -- -- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && -- r->offset < k.k->p.offset) -- ++*idx; -- -- if (!r || -- r->offset != k.k->p.offset || -- r->size != k.k->size) { -- bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -- return -EINVAL; -- } -- -- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), -- trans, reflink_v_refcount_wrong, -- "reflink key has wrong refcount:\n" -- " %s\n" -- " should be %u", -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf), -- r->refcount)) { -- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); -- ret = PTR_ERR_OR_ZERO(new); -- if (ret) -- goto out; -- -- if (!r->refcount) -- new->k.type = KEY_TYPE_deleted; -- else -- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); -- ret = bch2_trans_update(trans, iter, new, 0); -- } --out: --fsck_err: -- printbuf_exit(&buf); -- return ret; --} -- --static int bch2_gc_reflink_done(struct bch_fs *c) --{ -- size_t idx = 0; -- -- int ret = bch2_trans_run(c, -- for_each_btree_key_commit(trans, iter, -- BTREE_ID_reflink, POS_MIN, -- BTREE_ITER_prefetch, k, -- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_gc_write_reflink_key(trans, &iter, k, &idx))); -- c->reflink_gc_nr = 0; -- return ret; --} -- --static int bch2_gc_reflink_start(struct bch_fs *c) --{ -- c->reflink_gc_nr = 0; -- -- int ret = bch2_trans_run(c, -- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, -- BTREE_ITER_prefetch, k, ({ -- const __le64 *refcount = bkey_refcount_c(k); -- -- if (!refcount) -- continue; -- -- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, -- c->reflink_gc_nr++, GFP_KERNEL); -- if (!r) { -- ret = -BCH_ERR_ENOMEM_gc_reflink_start; -- break; -- } -- -- r->offset = k.k->p.offset; -- r->size = k.k->size; -- r->refcount = 0; -- 0; -- }))); -- -- bch_err_fn(c, ret); -- return ret; --} -- - static int bch2_gc_write_stripes_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -@@ -1171,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, - if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) - return -EROFS; - -- percpu_down_read(&c->mark_lock); - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); -@@ -1180,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, - - if (dev_ptr_stale(ca, ptr) > 16) { - rcu_read_unlock(); -- percpu_up_read(&c->mark_lock); - goto update; - } - } -@@ -1195,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, - *gen = ptr->gen; - } - rcu_read_unlock(); -- percpu_up_read(&c->mark_lock); - return 0; - update: - u = bch2_bkey_make_mut(trans, iter, &k, 0); -@@ -1224,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev - return ret; - - a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; -- alloc_data_type_set(&a_mut->v, a_mut->v.data_type); - - return bch2_trans_update(trans, iter, &a_mut->k_i, 0); - } -@@ -1337,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c) - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); - } - --void bch2_fs_gc_init(struct bch_fs *c) -+void bch2_fs_btree_gc_exit(struct bch_fs *c) - { -- seqcount_init(&c->gc_pos_lock); -+} - -+int bch2_fs_btree_gc_init(struct bch_fs *c) -+{ -+ seqcount_init(&c->gc_pos_lock); - INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); -+ -+ init_rwsem(&c->gc_lock); -+ mutex_init(&c->gc_gens_lock); -+ return 0; - } -diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h -index 8a47e8bd0791..9693a90a48a2 100644 ---- a/fs/bcachefs/btree_gc.h -+++ b/fs/bcachefs/btree_gc.h -@@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); - - int bch2_gc_gens(struct bch_fs *); - void bch2_gc_gens_async(struct bch_fs *); --void bch2_fs_gc_init(struct bch_fs *); -+ -+void bch2_fs_btree_gc_exit(struct bch_fs *); -+int bch2_fs_btree_gc_init(struct bch_fs *); - - #endif /* _BCACHEFS_BTREE_GC_H */ -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -index 839d68802e42..756736f9243d 100644 ---- a/fs/bcachefs/btree_io.c -+++ b/fs/bcachefs/btree_io.c -@@ -25,9 +25,8 @@ - - static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) - { -- prt_printf(out, "btree=%s l=%u seq %llux\n", -- bch2_btree_id_str(BTREE_NODE_ID(bn)), -- (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); -+ bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); -+ prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); - prt_str(out, "min: "); - bch2_bpos_to_text(out, bn->min_key); - prt_newline(out); -@@ -490,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) - if (b->nsets == MAX_BSETS && - !btree_node_write_in_flight(b) && - should_compact_all(c, b)) { -- bch2_btree_node_write(c, b, SIX_LOCK_write, -- BTREE_WRITE_init_next_bset); -+ bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, -+ BTREE_WRITE_init_next_bset); - reinit_iter = true; - } - -@@ -832,13 +831,32 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, - return ret; - } - -+static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, -+ struct bkey_s_c k, -+ enum bch_validate_flags flags) -+{ -+ return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { -+ .from = BKEY_VALIDATE_btree_node, -+ .level = b->c.level, -+ .btree = b->c.btree_id, -+ .flags = flags -+ }); -+} -+ - static int bset_key_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, -- bool updated_range, int rw) -+ bool updated_range, -+ enum bch_validate_flags flags) - { -- return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: -- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: -- (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); -+ struct bkey_validate_context from = (struct bkey_validate_context) { -+ .from = BKEY_VALIDATE_btree_node, -+ .level = b->c.level, -+ .btree = b->c.btree_id, -+ .flags = flags, -+ }; -+ return __bch2_bkey_validate(c, k, from) ?: -+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: -+ (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0); - } - - static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, -@@ -855,7 +873,21 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, - - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); -- return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); -+ return !__bch2_bkey_validate(c, u.s_c, -+ (struct bkey_validate_context) { -+ .from = BKEY_VALIDATE_btree_node, -+ .level = b->c.level, -+ .btree = b->c.btree_id, -+ .flags = BCH_VALIDATE_silent -+ }); -+} -+ -+static inline int btree_node_read_bkey_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ return bch2_bkey_cmp_packed(b, l, r) -+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); - } - - static int validate_bset_keys(struct bch_fs *c, struct btree *b, -@@ -918,7 +950,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - -- if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { - struct bkey up = bkey_unpack_key(b, prev); - - printbuf_reset(&buf); -@@ -964,7 +996,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, - } - got_good_key: - le16_add_cpu(&i->u64s, -next_good_key); -- memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); -+ memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); -+ set_btree_node_need_rewrite(b); - } - fsck_err: - printbuf_exit(&buf); -@@ -1038,39 +1071,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - - while (b->written < (ptr_written ?: btree_sectors(c))) { - unsigned sectors; -- struct nonce nonce; - bool first = !b->written; -- bool csum_bad; - -- if (!b->written) { -+ if (first) { -+ bne = NULL; - i = &b->data->keys; -+ } else { -+ bne = write_block(b); -+ i = &bne->keys; - -- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -- -BCH_ERR_btree_node_read_err_want_retry, -- c, ca, b, i, NULL, -- bset_unknown_csum, -- "unknown checksum type %llu", BSET_CSUM_TYPE(i)); -- -- nonce = btree_nonce(i, b->written << 9); -+ if (i->seq != b->data->keys.seq) -+ break; -+ } - -- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); -- csum_bad = bch2_crc_cmp(b->data->csum, csum); -- if (csum_bad) -- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -+ struct nonce nonce = btree_nonce(i, b->written << 9); -+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - -- btree_err_on(csum_bad, -- -BCH_ERR_btree_node_read_err_want_retry, -- c, ca, b, i, NULL, -- bset_bad_csum, -- "%s", -- (printbuf_reset(&buf), -- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), -- buf.buf)); -- -- ret = bset_encrypt(c, i, b->written << 9); -- if (bch2_fs_fatal_err_on(ret, c, -- "decrypting btree node: %s", bch2_err_str(ret))) -- goto fsck_err; -+ btree_err_on(!good_csum_type, -+ bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) -+ ? -BCH_ERR_btree_node_read_err_must_retry -+ : -BCH_ERR_btree_node_read_err_want_retry, -+ c, ca, b, i, NULL, -+ bset_unknown_csum, -+ "unknown checksum type %llu", BSET_CSUM_TYPE(i)); -+ -+ if (first) { -+ if (good_csum_type) { -+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); -+ bool csum_bad = bch2_crc_cmp(b->data->csum, csum); -+ if (csum_bad) -+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -+ -+ btree_err_on(csum_bad, -+ -BCH_ERR_btree_node_read_err_want_retry, -+ c, ca, b, i, NULL, -+ bset_bad_csum, -+ "%s", -+ (printbuf_reset(&buf), -+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), -+ buf.buf)); -+ -+ ret = bset_encrypt(c, i, b->written << 9); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "decrypting btree node: %s", bch2_err_str(ret))) -+ goto fsck_err; -+ } - - btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -@@ -1081,37 +1126,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - - sectors = vstruct_sectors(b->data, c->block_bits); - } else { -- bne = write_block(b); -- i = &bne->keys; -- -- if (i->seq != b->data->keys.seq) -- break; -- -- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -- -BCH_ERR_btree_node_read_err_want_retry, -- c, ca, b, i, NULL, -- bset_unknown_csum, -- "unknown checksum type %llu", BSET_CSUM_TYPE(i)); -- -- nonce = btree_nonce(i, b->written << 9); -- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -- csum_bad = bch2_crc_cmp(bne->csum, csum); -- if (ca && csum_bad) -- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -- -- btree_err_on(csum_bad, -- -BCH_ERR_btree_node_read_err_want_retry, -- c, ca, b, i, NULL, -- bset_bad_csum, -- "%s", -- (printbuf_reset(&buf), -- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), -- buf.buf)); -- -- ret = bset_encrypt(c, i, b->written << 9); -- if (bch2_fs_fatal_err_on(ret, c, -- "decrypting btree node: %s", bch2_err_str(ret))) -- goto fsck_err; -+ if (good_csum_type) { -+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ bool csum_bad = bch2_crc_cmp(bne->csum, csum); -+ if (ca && csum_bad) -+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -+ -+ btree_err_on(csum_bad, -+ -BCH_ERR_btree_node_read_err_want_retry, -+ c, ca, b, i, NULL, -+ bset_bad_csum, -+ "%s", -+ (printbuf_reset(&buf), -+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), -+ buf.buf)); -+ -+ ret = bset_encrypt(c, i, b->written << 9); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "decrypting btree node: %s", bch2_err_str(ret))) -+ goto fsck_err; -+ } - - sectors = vstruct_sectors(bne, c->block_bits); - } -@@ -1152,7 +1186,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - le64_to_cpu(i->journal_seq), - b->written, b->written + sectors, ptr_written); - -- b->written += sectors; -+ b->written = min(b->written + sectors, btree_sectors(c)); - - if (blacklisted && !first) - continue; -@@ -1216,7 +1250,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); - -- ret = bch2_bkey_val_validate(c, u.s_c, READ); -+ ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); - if (ret == -BCH_ERR_fsck_delete_bkey || - (bch2_inject_invalid_keys && - !bversion_cmp(u.k->bversion, MAX_VERSION))) { -@@ -1226,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - set_btree_bset_end(b, b->set); -+ set_btree_node_need_rewrite(b); - continue; - } - if (ret) -@@ -1339,13 +1374,18 @@ static void btree_node_read_work(struct work_struct *work) - rb->start_time); - bio_put(&rb->bio); - -- if (saw_error && -+ if ((saw_error || -+ btree_node_need_rewrite(b)) && - !btree_node_read_error(b) && - c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { -- printbuf_reset(&buf); -- bch2_bpos_to_text(&buf, b->key.k.p); -- bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", -- __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); -+ if (saw_error) { -+ printbuf_reset(&buf); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_str(&buf, " "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -+ bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", -+ __func__, buf.buf); -+ } - - bch2_btree_node_rewrite_async(c, b); - } -@@ -1933,7 +1973,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - bool saw_error; - - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), -- BKEY_TYPE_btree, WRITE); -+ (struct bkey_validate_context) { -+ .from = BKEY_VALIDATE_btree_node, -+ .level = b->c.level + 1, -+ .btree = b->c.btree_id, -+ .flags = BCH_VALIDATE_write, -+ }); - if (ret) { - bch2_fs_inconsistent(c, "invalid btree node key before write"); - return ret; -@@ -2300,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - } - } - -+void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, -+ enum six_lock_type lock_type_held, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (lock_type_held == SIX_LOCK_intent || -+ (lock_type_held == SIX_LOCK_read && -+ six_lock_tryupgrade(&b->c.lock))) { -+ __bch2_btree_node_write(c, b, flags); -+ -+ /* don't cycle lock unnecessarily: */ -+ if (btree_node_just_written(b) && -+ six_trylock_write(&b->c.lock)) { -+ bch2_btree_post_write_cleanup(c, b); -+ __bch2_btree_node_unlock_write(trans, b); -+ } -+ -+ if (lock_type_held == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ } else { -+ __bch2_btree_node_write(c, b, flags); -+ if (lock_type_held == SIX_LOCK_write && -+ btree_node_just_written(b)) -+ bch2_btree_post_write_cleanup(c, b); -+ } -+} -+ - static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) - { - struct bucket_table *tbl; -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -index 9b01ca3de907..6f9e4a6dacf7 100644 ---- a/fs/bcachefs/btree_io.h -+++ b/fs/bcachefs/btree_io.h -@@ -144,11 +144,13 @@ enum btree_write_flags { - void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); - void bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type, unsigned); -+void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, -+ enum six_lock_type, unsigned); - --static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, -+static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, - enum six_lock_type lock_held) - { -- bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); -+ bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); - } - - bool bch2_btree_flush_all_reads(struct bch_fs *); -diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -index eef9b89c561d..e32fce4fd258 100644 ---- a/fs/bcachefs/btree_iter.c -+++ b/fs/bcachefs/btree_iter.c -@@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) - BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && - iter->pos.snapshot != iter->snapshot); - -- BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || -- bkey_gt(iter->pos, iter->k.p)); -+ BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : -+ !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) : -+ (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || -+ bkey_gt(iter->pos, iter->k.p))); - } - - static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) -@@ -327,7 +329,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k - void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) - { -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - struct btree_path *path; - struct trans_for_each_path_inorder_iter iter; -@@ -697,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans, - bch2_trans_revalidate_updates_in_node(trans, b); - } - -+void bch2_trans_node_drop(struct btree_trans *trans, -+ struct btree *b) -+{ -+ struct btree_path *path; -+ unsigned i, level = b->c.level; -+ -+ trans_for_each_path(trans, path, i) -+ if (path->l[level].b == b) { -+ btree_node_unlock(trans, path, level); -+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -+ } -+} -+ - /* - * A btree node has been modified in such a way as to invalidate iterators - fix - * them: -@@ -720,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, - unsigned long trace_ip) - { - struct bch_fs *c = trans->c; -- struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; -+ struct btree_root *r = bch2_btree_id_root(c, path->btree_id); - enum six_lock_type lock_type; - unsigned i; - int ret; -@@ -728,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, - EBUG_ON(path->nodes_locked); - - while (1) { -- b = READ_ONCE(*rootp); -+ struct btree *b = READ_ONCE(r->b); -+ if (unlikely(!b)) { -+ BUG_ON(!r->error); -+ return r->error; -+ } -+ - path->level = READ_ONCE(b->c.level); - - if (unlikely(path->level < depth_want)) { -@@ -748,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, - ret = btree_node_lock(trans, path, &b->c, - path->level, lock_type, trace_ip); - if (unlikely(ret)) { -- if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) -- continue; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - BUG(); - } - -- if (likely(b == READ_ONCE(*rootp) && -+ if (likely(b == READ_ONCE(r->b) && - b->c.level == path->level && - !race_fault())) { - for (i = 0; i < path->level; i++) -@@ -825,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p - - bch2_bkey_buf_init(&tmp); - -+ jiter->fail_if_too_many_whiteouts = true; -+ - while (nr-- && !ret) { - if (!bch2_btree_node_relock(trans, path, path->level)) - break; -@@ -1000,7 +1020,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) - - bch2_trans_unlock(trans); - cond_resched(); -- trans_set_locked(trans); -+ trans_set_locked(trans, false); - - if (unlikely(trans->memory_allocation_failure)) { - struct closure cl; -@@ -1267,7 +1287,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, - { - int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - -- bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - EBUG_ON(!trans->paths[path_idx].ref); - - trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); -@@ -1427,17 +1447,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ - (void *) trans->last_begin_ip); - } - --void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) -+static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) - { -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct printbuf buf = PRINTBUF; -+ bch2_prt_backtrace(&buf, &trans->last_restarted_trace); -+ panic("in transaction restart: %s, last restarted by\n%s", -+ bch2_err_str(trans->restarted), -+ buf.buf); -+#else - panic("in transaction restart: %s, last restarted by %pS\n", - bch2_err_str(trans->restarted), - (void *) trans->last_restarted_ip); -+#endif - } - --void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) -+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) - { -- panic("trans should be locked, unlocked by %pS\n", -- (void *) trans->last_unlock_ip); -+ if (trans->restarted) -+ bch2_trans_in_restart_error(trans); -+ -+ if (!trans->locked) -+ panic("trans should be locked, unlocked by %pS\n", -+ (void *) trans->last_unlock_ip); -+ -+ BUG(); - } - - noinline __cold -@@ -1450,10 +1484,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) - trans_for_each_update(trans, i) { - struct bkey_s_c old = { &i->old_k, i->old_v }; - -- prt_printf(buf, "update: btree=%s cached=%u %pS\n", -- bch2_btree_id_str(i->btree_id), -- i->cached, -- (void *) i->ip_allocated); -+ prt_str(buf, "update: btree="); -+ bch2_btree_id_to_text(buf, i->btree_id); -+ prt_printf(buf, " cached=%u %pS\n", -+ i->cached, -+ (void *) i->ip_allocated); - - prt_printf(buf, " old "); - bch2_bkey_val_to_text(buf, trans->c, old); -@@ -1486,13 +1521,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra - { - struct btree_path *path = trans->paths + path_idx; - -- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", -+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", - path_idx, path->ref, path->intent_ref, - path->preserve ? 'P' : ' ', - path->should_be_locked ? 'S' : ' ', -- path->cached ? 'C' : 'B', -- bch2_btree_id_str(path->btree_id), -- path->level); -+ path->cached ? 'C' : 'B'); -+ bch2_btree_id_level_to_text(out, path->btree_id, path->level); -+ prt_str(out, " pos "); - bch2_bpos_to_text(out, path->pos); - - if (!path->cached && btree_node_locked(path, path->level)) { -@@ -1717,8 +1752,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, - struct trans_for_each_path_inorder_iter iter; - btree_path_idx_t path_pos = 0, path_idx; - -- bch2_trans_verify_not_unlocked(trans); -- bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_trans_verify_locks(trans); - - btree_trans_sort_paths(trans); -@@ -1833,7 +1867,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * - !bkey_eq(path->pos, ck->key.pos)); - - *u = ck->k->k; -- k = bkey_i_to_s_c(ck->k); -+ k = (struct bkey_s_c) { u, &ck->k->v }; - } - - return k; -@@ -1843,7 +1877,6 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * - return (struct bkey_s_c) { u, NULL }; - } - -- - void bch2_set_btree_iter_dontneed(struct btree_iter *iter) - { - struct btree_trans *trans = iter->trans; -@@ -1870,7 +1903,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) - struct btree_trans *trans = iter->trans; - int ret; - -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - iter->path = bch2_btree_path_set_pos(trans, iter->path, - btree_iter_search_key(iter), -@@ -1945,7 +1978,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) - int ret; - - EBUG_ON(trans->paths[iter->path].cached); -- bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -@@ -2101,7 +2134,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, - { - struct btree_path *path = btree_iter_path(trans, iter); - -- return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, -+ return bch2_journal_keys_peek_max(trans->c, iter->btree_id, - path->level, - path->pos, - end_pos, -@@ -2124,21 +2157,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, - } - - static noinline --struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bkey_s_c k) -+void btree_trans_peek_journal(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c *k) - { - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, -- k.k ? k.k->p : path_l(path)->b->key.k.p); -- -+ k->k ? k->k->p : path_l(path)->b->key.k.p); - if (next_journal) { - iter->k = next_journal->k; -- k = bkey_i_to_s_c(next_journal); -+ *k = bkey_i_to_s_c(next_journal); - } -+} - -- return k; -+static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end_pos) -+{ -+ struct btree_path *path = btree_iter_path(trans, iter); -+ -+ return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, -+ path->level, -+ path->pos, -+ end_pos, -+ &iter->journal_idx); -+} -+ -+static noinline -+void btree_trans_peek_prev_journal(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c *k) -+{ -+ struct btree_path *path = btree_iter_path(trans, iter); -+ struct bkey_i *next_journal = -+ bch2_btree_journal_peek_prev(trans, iter, -+ k->k ? k->k->p : path_l(path)->b->key.k.p); -+ -+ if (next_journal) { -+ iter->k = next_journal->k; -+ *k = bkey_i_to_s_c(next_journal); -+ } - } - - /* -@@ -2154,8 +2213,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos - struct bkey_s_c k; - int ret; - -- bch2_trans_verify_not_in_restart(trans); -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if ((iter->flags & BTREE_ITER_key_cache_fill) && - bpos_eq(iter->pos, pos)) -@@ -2181,13 +2239,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos - if (unlikely(ret)) - return bkey_s_c_err(ret); - -- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); -- - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); -- if (k.k && !bkey_err(k)) { -- iter->k = u; -- k.k = &iter->k; -- } -+ if (!k.k) -+ return k; -+ -+ if ((iter->flags & BTREE_ITER_all_snapshots) && -+ !bpos_eq(pos, k.k->p)) -+ return bkey_s_c_null; -+ -+ iter->k = u; -+ k.k = &iter->k; -+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - return k; - } - -@@ -2201,8 +2263,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - bch2_btree_iter_verify(iter); - - while (1) { -- struct btree_path_level *l; -- - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); -@@ -2212,17 +2272,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); -- goto out; -+ break; - } - - struct btree_path *path = btree_iter_path(trans, iter); -- l = path_l(path); -+ struct btree_path_level *l = path_l(path); - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); - k = bkey_s_c_null; -- goto out; -+ break; - } - - btree_path_set_should_be_locked(trans, path); -@@ -2233,15 +2293,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { - k = k2; -- ret = bkey_err(k); -- if (ret) { -+ if (bkey_err(k)) { - bch2_btree_iter_set_pos(iter, iter->pos); -- goto out; -+ break; - } - } - - if (unlikely(iter->flags & BTREE_ITER_with_journal)) -- k = btree_trans_peek_journal(trans, iter, k); -+ btree_trans_peek_journal(trans, iter, &k); - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) -@@ -2270,42 +2329,46 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - /* End of btree: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); - k = bkey_s_c_null; -- goto out; -+ break; - } - } --out: -- bch2_btree_iter_verify(iter); - -+ bch2_btree_iter_verify(iter); - return k; - } - - /** -- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to -+ * bch2_btree_iter_peek_max() - returns first key greater than or equal to - * iterator's current position - * @iter: iterator to peek from - * @end: search limit: returns keys less than or equal to @end - * - * Returns: key if found, or an error extractable with bkey_err(). - */ --struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) -+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) - { - struct btree_trans *trans = iter->trans; - struct bpos search_key = btree_iter_search_key(iter); - struct bkey_s_c k; -- struct bpos iter_pos; -+ struct bpos iter_pos = iter->pos; - int ret; - -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); -+ bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); - -+ ret = trans_maybe_inject_restart(trans, _RET_IP_); -+ if (unlikely(ret)) { -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ - if (iter->update_path) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - } - -- bch2_btree_iter_verify_entry_exit(iter); -- - while (1) { - k = __bch2_btree_iter_peek(iter, search_key); - if (unlikely(!k.k)) -@@ -2313,75 +2376,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e - if (unlikely(bkey_err(k))) - goto out_no_locked; - -- /* -- * We need to check against @end before FILTER_SNAPSHOTS because -- * if we get to a different inode that requested we might be -- * seeing keys for a different snapshot tree that will all be -- * filtered out. -- * -- * But we can't do the full check here, because bkey_start_pos() -- * isn't monotonically increasing before FILTER_SNAPSHOTS, and -- * that's what we check against in extents mode: -- */ -- if (unlikely(!(iter->flags & BTREE_ITER_is_extents) -- ? bkey_gt(k.k->p, end) -- : k.k->p.inode > end.inode)) -- goto end; -+ if (iter->flags & BTREE_ITER_filter_snapshots) { -+ /* -+ * We need to check against @end before FILTER_SNAPSHOTS because -+ * if we get to a different inode that requested we might be -+ * seeing keys for a different snapshot tree that will all be -+ * filtered out. -+ * -+ * But we can't do the full check here, because bkey_start_pos() -+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and -+ * that's what we check against in extents mode: -+ */ -+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents) -+ ? bkey_gt(k.k->p, end) -+ : k.k->p.inode > end.inode)) -+ goto end; -+ -+ if (iter->update_path && -+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { -+ bch2_path_put_nokeep(trans, iter->update_path, -+ iter->flags & BTREE_ITER_intent); -+ iter->update_path = 0; -+ } - -- if (iter->update_path && -- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { -- bch2_path_put_nokeep(trans, iter->update_path, -- iter->flags & BTREE_ITER_intent); -- iter->update_path = 0; -- } -+ if ((iter->flags & BTREE_ITER_intent) && -+ !(iter->flags & BTREE_ITER_is_extents) && -+ !iter->update_path) { -+ struct bpos pos = k.k->p; - -- if ((iter->flags & BTREE_ITER_filter_snapshots) && -- (iter->flags & BTREE_ITER_intent) && -- !(iter->flags & BTREE_ITER_is_extents) && -- !iter->update_path) { -- struct bpos pos = k.k->p; -+ if (pos.snapshot < iter->snapshot) { -+ search_key = bpos_successor(k.k->p); -+ continue; -+ } - -- if (pos.snapshot < iter->snapshot) { -- search_key = bpos_successor(k.k->p); -- continue; -- } -+ pos.snapshot = iter->snapshot; - -- pos.snapshot = iter->snapshot; -+ /* -+ * advance, same as on exit for iter->path, but only up -+ * to snapshot -+ */ -+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); -+ iter->update_path = iter->path; -+ -+ iter->update_path = bch2_btree_path_set_pos(trans, -+ iter->update_path, pos, -+ iter->flags & BTREE_ITER_intent, -+ _THIS_IP_); -+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); -+ if (unlikely(ret)) { -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ } - - /* -- * advance, same as on exit for iter->path, but only up -- * to snapshot -+ * We can never have a key in a leaf node at POS_MAX, so -+ * we don't have to check these successor() calls: - */ -- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); -- iter->update_path = iter->path; -- -- iter->update_path = bch2_btree_path_set_pos(trans, -- iter->update_path, pos, -- iter->flags & BTREE_ITER_intent, -- _THIS_IP_); -- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); -- if (unlikely(ret)) { -- k = bkey_s_c_err(ret); -- goto out_no_locked; -+ if (!bch2_snapshot_is_ancestor(trans->c, -+ iter->snapshot, -+ k.k->p.snapshot)) { -+ search_key = bpos_successor(k.k->p); -+ continue; - } -- } -- -- /* -- * We can never have a key in a leaf node at POS_MAX, so -- * we don't have to check these successor() calls: -- */ -- if ((iter->flags & BTREE_ITER_filter_snapshots) && -- !bch2_snapshot_is_ancestor(trans->c, -- iter->snapshot, -- k.k->p.snapshot)) { -- search_key = bpos_successor(k.k->p); -- continue; -- } - -- if (bkey_whiteout(k.k) && -- !(iter->flags & BTREE_ITER_all_snapshots)) { -- search_key = bkey_successor(iter, k.k->p); -- continue; -+ if (bkey_whiteout(k.k) && -+ !(iter->flags & BTREE_ITER_key_cache_fill)) { -+ search_key = bkey_successor(iter, k.k->p); -+ continue; -+ } - } - - /* -@@ -2451,127 +2514,210 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) - return bch2_btree_iter_peek(iter); - } - --/** -- * bch2_btree_iter_peek_prev() - returns first key less than or equal to -- * iterator's current position -- * @iter: iterator to peek from -- * -- * Returns: key if found, or an error extractable with bkey_err(). -- */ --struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) - { - struct btree_trans *trans = iter->trans; -- struct bpos search_key = iter->pos; -- struct bkey_s_c k; -- struct bkey saved_k; -- const struct bch_val *saved_v; -- btree_path_idx_t saved_path = 0; -- int ret; -- -- bch2_trans_verify_not_unlocked(trans); -- EBUG_ON(btree_iter_path(trans, iter)->cached || -- btree_iter_path(trans, iter)->level); -- -- if (iter->flags & BTREE_ITER_with_journal) -- return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); -+ struct bkey_s_c k, k2; - - bch2_btree_iter_verify(iter); -- bch2_btree_iter_verify_entry_exit(iter); -- -- if (iter->flags & BTREE_ITER_filter_snapshots) -- search_key.snapshot = U32_MAX; - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, -- iter->flags & BTREE_ITER_intent, -- btree_iter_ip_allocated(iter)); -+ iter->flags & BTREE_ITER_intent, -+ btree_iter_ip_allocated(iter)); - -- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); -- goto out_no_locked; -+ break; - } - - struct btree_path *path = btree_iter_path(trans, iter); -+ struct btree_path_level *l = path_l(path); -+ -+ if (unlikely(!l->b)) { -+ /* No btree nodes at requested level: */ -+ bch2_btree_iter_set_pos(iter, SPOS_MAX); -+ k = bkey_s_c_null; -+ break; -+ } -+ -+ btree_path_set_should_be_locked(trans, path); -+ -+ k = btree_path_level_peek_all(trans->c, l, &iter->k); -+ if (!k.k || bpos_gt(k.k->p, search_key)) { -+ k = btree_path_level_prev(trans, path, l, &iter->k); - -- k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); -- if (!k.k || -- ((iter->flags & BTREE_ITER_is_extents) -- ? bpos_ge(bkey_start_pos(k.k), search_key) -- : bpos_gt(k.k->p, search_key))) -- k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); -+ BUG_ON(k.k && bpos_gt(k.k->p, search_key)); -+ } -+ -+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && -+ k.k && -+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { -+ k = k2; -+ if (bkey_err(k2)) { -+ bch2_btree_iter_set_pos(iter, iter->pos); -+ break; -+ } -+ } -+ -+ if (unlikely(iter->flags & BTREE_ITER_with_journal)) -+ btree_trans_peek_prev_journal(trans, iter, &k); - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, &k); - -- if (likely(k.k)) { -- if (iter->flags & BTREE_ITER_filter_snapshots) { -- if (k.k->p.snapshot == iter->snapshot) -- goto got_key; -+ if (likely(k.k && !bkey_deleted(k.k))) { -+ break; -+ } else if (k.k) { -+ search_key = bpos_predecessor(k.k->p); -+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { -+ /* Advance to previous leaf node: */ -+ search_key = bpos_predecessor(path->l[0].b->data->min_key); -+ } else { -+ /* Start of btree: */ -+ bch2_btree_iter_set_pos(iter, POS_MIN); -+ k = bkey_s_c_null; -+ break; -+ } -+ } - -+ bch2_btree_iter_verify(iter); -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to -+ * iterator's current position -+ * @iter: iterator to peek from -+ * @end: search limit: returns keys greater than or equal to @end -+ * -+ * Returns: key if found, or an error extractable with bkey_err(). -+ */ -+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) -+{ -+ if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && -+ !bkey_eq(iter->pos, POS_MAX)) { -+ /* -+ * bkey_start_pos(), for extents, is not monotonically -+ * increasing until after filtering for snapshots: -+ * -+ * Thus, for extents we need to search forward until we find a -+ * real visible extents - easiest to just use peek_slot() (which -+ * internally uses peek() for extents) -+ */ -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) -+ return k; -+ -+ if (!bkey_deleted(k.k) && -+ (!(iter->flags & BTREE_ITER_is_extents) || -+ bkey_lt(bkey_start_pos(k.k), iter->pos))) -+ return k; -+ } -+ -+ struct btree_trans *trans = iter->trans; -+ struct bpos search_key = iter->pos; -+ struct bkey_s_c k; -+ btree_path_idx_t saved_path = 0; -+ -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); -+ bch2_btree_iter_verify_entry_exit(iter); -+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); -+ -+ int ret = trans_maybe_inject_restart(trans, _RET_IP_); -+ if (unlikely(ret)) { -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ -+ while (1) { -+ k = __bch2_btree_iter_peek_prev(iter, search_key); -+ if (unlikely(!k.k)) -+ goto end; -+ if (unlikely(bkey_err(k))) -+ goto out_no_locked; -+ -+ if (iter->flags & BTREE_ITER_filter_snapshots) { -+ struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; -+ if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { -+ /* -+ * If we have a saved candidate, and we're past -+ * the last possible snapshot overwrite, return -+ * it: -+ */ -+ bch2_path_put_nokeep(trans, iter->path, -+ iter->flags & BTREE_ITER_intent); -+ iter->path = saved_path; -+ saved_path = 0; -+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); -+ break; -+ } -+ -+ /* -+ * We need to check against @end before FILTER_SNAPSHOTS because -+ * if we get to a different inode that requested we might be -+ * seeing keys for a different snapshot tree that will all be -+ * filtered out. -+ */ -+ if (unlikely(bkey_lt(k.k->p, end))) -+ goto end; -+ -+ if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { -+ search_key = bpos_predecessor(k.k->p); -+ continue; -+ } -+ -+ if (k.k->p.snapshot != iter->snapshot) { - /* -- * If we have a saved candidate, and we're no -- * longer at the same _key_ (not pos), return -- * that candidate -+ * Have a key visible in iter->snapshot, but -+ * might have overwrites: - save it and keep -+ * searching. Unless it's a whiteout - then drop -+ * our previous saved candidate: - */ -- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { -- bch2_path_put_nokeep(trans, iter->path, -- iter->flags & BTREE_ITER_intent); -- iter->path = saved_path; -+ if (saved_path) { -+ bch2_path_put_nokeep(trans, saved_path, -+ iter->flags & BTREE_ITER_intent); - saved_path = 0; -- iter->k = saved_k; -- k.v = saved_v; -- goto got_key; - } - -- if (bch2_snapshot_is_ancestor(trans->c, -- iter->snapshot, -- k.k->p.snapshot)) { -- if (saved_path) -- bch2_path_put_nokeep(trans, saved_path, -- iter->flags & BTREE_ITER_intent); -+ if (!bkey_whiteout(k.k)) { - saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); -- path = btree_iter_path(trans, iter); -- trace_btree_path_save_pos(trans, path, trans->paths + saved_path); -- saved_k = *k.k; -- saved_v = k.v; -+ trace_btree_path_save_pos(trans, -+ trans->paths + iter->path, -+ trans->paths + saved_path); - } - - search_key = bpos_predecessor(k.k->p); - continue; - } --got_key: -- if (bkey_whiteout(k.k) && -- !(iter->flags & BTREE_ITER_all_snapshots)) { -+ -+ if (bkey_whiteout(k.k)) { - search_key = bkey_predecessor(iter, k.k->p); -- if (iter->flags & BTREE_ITER_filter_snapshots) -- search_key.snapshot = U32_MAX; -+ search_key.snapshot = U32_MAX; - continue; - } -- -- btree_path_set_should_be_locked(trans, path); -- break; -- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { -- /* Advance to previous leaf node: */ -- search_key = bpos_predecessor(path->l[0].b->data->min_key); -- } else { -- /* Start of btree: */ -- bch2_btree_iter_set_pos(iter, POS_MIN); -- k = bkey_s_c_null; -- goto out_no_locked; - } -- } - -- EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); -+ EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : -+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) : -+ bkey_gt(k.k->p, iter->pos)); -+ -+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : -+ iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) : -+ bkey_lt(k.k->p, end))) -+ goto end; -+ -+ break; -+ } - - /* Extents can straddle iter->pos: */ -- if (bkey_lt(k.k->p, iter->pos)) -- iter->pos = k.k->p; -+ iter->pos = bpos_min(iter->pos, k.k->p);; - - if (iter->flags & BTREE_ITER_filter_snapshots) - iter->pos.snapshot = iter->snapshot; -@@ -2581,8 +2727,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) - - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); -- - return k; -+end: -+ bch2_btree_iter_set_pos(iter, end); -+ k = bkey_s_c_null; -+ goto out_no_locked; - } - - /** -@@ -2607,11 +2756,17 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - struct bkey_s_c k; - int ret; - -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); - -+ ret = trans_maybe_inject_restart(trans, _RET_IP_); -+ if (unlikely(ret)) { -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ - /* extents can't span inode numbers: */ - if ((iter->flags & BTREE_ITER_is_extents) && - unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { -@@ -2632,6 +2787,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - goto out_no_locked; - } - -+ struct btree_path *path = btree_iter_path(trans, iter); -+ if (unlikely(!btree_path_node(path, path->level))) -+ return bkey_s_c_null; -+ - if ((iter->flags & BTREE_ITER_cached) || - !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { - k = bkey_s_c_null; -@@ -2658,6 +2817,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); - if (unlikely(!k.k)) - goto out_no_locked; -+ -+ if (unlikely(k.k->type == KEY_TYPE_whiteout && -+ (iter->flags & BTREE_ITER_filter_snapshots) && -+ !(iter->flags & BTREE_ITER_key_cache_fill))) -+ iter->k.type = KEY_TYPE_deleted; - } else { - struct bpos next; - struct bpos end = iter->pos; -@@ -2671,7 +2835,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - struct btree_iter iter2; - - bch2_trans_copy_iter(&iter2, iter); -- k = bch2_btree_iter_peek_upto(&iter2, end); -+ k = bch2_btree_iter_peek_max(&iter2, end); - - if (k.k && !bkey_err(k)) { - swap(iter->key_cache_path, iter2.key_cache_path); -@@ -2682,7 +2846,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - } else { - struct bpos pos = iter->pos; - -- k = bch2_btree_iter_peek_upto(iter, end); -+ k = bch2_btree_iter_peek_max(iter, end); - if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(iter, pos); - else -@@ -2902,7 +3066,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, - unsigned flags) - { - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, -- bch2_btree_iter_flags(trans, btree_id, flags), -+ bch2_btree_iter_flags(trans, btree_id, 0, flags), - _RET_IP_); - } - -@@ -2918,8 +3082,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, - flags |= BTREE_ITER_snapshot_field; - flags |= BTREE_ITER_all_snapshots; - -+ if (!depth && btree_id_cached(trans->c, btree_id)) -+ flags |= BTREE_ITER_with_key_cache; -+ - bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, -- __bch2_btree_iter_flags(trans, btree_id, flags), -+ bch2_btree_iter_flags(trans, btree_id, depth, flags), - _RET_IP_); - - iter->min_depth = depth; -@@ -2957,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) - - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - -+ ret = trans_maybe_inject_restart(trans, _RET_IP_); -+ if (ret) -+ return ERR_PTR(ret); -+ - struct btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); - -@@ -3014,7 +3185,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) - - if (old_bytes) { - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); -- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); -+ return ERR_PTR(btree_trans_restart_ip(trans, -+ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); - } - out_change_top: - p = trans->mem + trans->mem_top; -@@ -3122,14 +3294,22 @@ u32 bch2_trans_begin(struct btree_trans *trans) - - trans->last_begin_ip = _RET_IP_; - -- trans_set_locked(trans); -+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS -+ if (trans->restarted) { -+ trans->restart_count_this_trans++; -+ } else { -+ trans->restart_count_this_trans = 0; -+ } -+#endif -+ -+ trans_set_locked(trans, false); - - if (trans->restarted) { - bch2_btree_path_traverse_all(trans); - trans->notrace_relock_fail = false; - } - -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - return trans->restart_count; - } - -@@ -3228,7 +3408,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; -- trans_set_locked(trans); -+ trans_set_locked(trans, false); - - closure_init_stack_release(&trans->ref); - return trans; -@@ -3262,6 +3442,9 @@ void bch2_trans_put(struct btree_trans *trans) - { - struct bch_fs *c = trans->c; - -+ if (trans->restarted) -+ bch2_trans_in_restart_error(trans); -+ - bch2_trans_unlock(trans); - - trans_for_each_update(trans, i) -@@ -3285,6 +3468,10 @@ void bch2_trans_put(struct btree_trans *trans) - closure_return_sync(&trans->ref); - trans->locking_wait.task = NULL; - -+#ifdef CONFIG_BCACHEFS_DEBUG -+ darray_exit(&trans->last_restarted_trace); -+#endif -+ - unsigned long *paths_allocated = trans->paths_allocated; - trans->paths_allocated = NULL; - trans->paths = NULL; -@@ -3338,8 +3525,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, - pid = owner ? owner->pid : 0; - rcu_read_unlock(); - -- prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', -- b->level, bch2_btree_id_str(b->btree_id)); -+ prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); -+ bch2_btree_id_to_text(out, b->btree_id); -+ prt_printf(out, " l=%u:", b->level); - bch2_bpos_to_text(out, btree_node_pos(b)); - - prt_printf(out, "\t locks %u:%u:%u held by pid %u", -@@ -3378,11 +3566,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) - if (!path->nodes_locked) - continue; - -- prt_printf(out, " path %u %c l=%u %s:", -- idx, -- path->cached ? 'c' : 'b', -- path->level, -- bch2_btree_id_str(path->btree_id)); -+ prt_printf(out, " path %u %c ", -+ idx, -+ path->cached ? 'c' : 'b'); -+ bch2_btree_id_to_text(out, path->btree_id); -+ prt_printf(out, " l=%u:", path->level); - bch2_bpos_to_text(out, path->pos); - prt_newline(out); - -@@ -3488,7 +3676,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) - #ifdef CONFIG_LOCKDEP - fs_reclaim_acquire(GFP_KERNEL); - struct btree_trans *trans = bch2_trans_get(c); -- trans_set_locked(trans); -+ trans_set_locked(trans, false); - bch2_trans_put(trans); - fs_reclaim_release(GFP_KERNEL); - #endif -diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h -index 0bda054f80d7..b96157f3dc9c 100644 ---- a/fs/bcachefs/btree_iter.h -+++ b/fs/bcachefs/btree_iter.h -@@ -23,6 +23,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path - { - unsigned idx = path - trans->paths; - -+ EBUG_ON(idx >= trans->nr_paths); - EBUG_ON(!test_bit(idx, trans->paths_allocated)); - if (unlikely(path->ref == U8_MAX)) { - bch2_dump_trans_paths_updates(trans); -@@ -36,6 +37,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path - - static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) - { -+ EBUG_ON(path - trans->paths >= trans->nr_paths); - EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); - EBUG_ON(!path->ref); - EBUG_ON(!path->intent_ref && intent); -@@ -234,12 +236,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, - btree_path_idx_t, - unsigned, unsigned long); - --static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); -+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); - - static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - btree_path_idx_t path, unsigned flags) - { -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) - return 0; -@@ -324,38 +326,45 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, - bch2_trans_restart_error(trans, restart_count); - } - --void __noreturn bch2_trans_in_restart_error(struct btree_trans *); -+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); - --static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) -+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) - { -- if (trans->restarted) -- bch2_trans_in_restart_error(trans); --} -- --void __noreturn bch2_trans_unlocked_error(struct btree_trans *); -- --static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) --{ -- if (!trans->locked) -- bch2_trans_unlocked_error(trans); -+ if (trans->restarted || !trans->locked) -+ bch2_trans_unlocked_or_in_restart_error(trans); - } - - __always_inline --static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) -+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) - { - BUG_ON(err <= 0); - BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); - - trans->restarted = err; -- trans->last_restarted_ip = _THIS_IP_; -+ trans->last_restarted_ip = ip; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ darray_exit(&trans->last_restarted_trace); -+ bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); -+#endif - return -err; - } - - __always_inline - static int btree_trans_restart(struct btree_trans *trans, int err) - { -- btree_trans_restart_nounlock(trans, err); -- return -err; -+ return btree_trans_restart_ip(trans, err, _THIS_IP_); -+} -+ -+static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) -+{ -+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS -+ if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { -+ trace_and_count(trans->c, trans_restart_injected, trans, ip); -+ return btree_trans_restart_ip(trans, -+ BCH_ERR_transaction_restart_fault_inject, ip); -+ } -+#endif -+ return 0; - } - - bool bch2_btree_node_upgrade(struct btree_trans *, -@@ -375,6 +384,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, - void bch2_trans_downgrade(struct btree_trans *); - - void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); -+void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); - void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); - - int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); -@@ -384,15 +394,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); - struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); - struct btree *bch2_btree_iter_next_node(struct btree_iter *); - --struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); -+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); - struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); - - static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) - { -- return bch2_btree_iter_peek_upto(iter, SPOS_MAX); -+ return bch2_btree_iter_peek_max(iter, SPOS_MAX); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); -+ -+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+{ -+ return bch2_btree_iter_peek_prev_min(iter, POS_MIN); - } - --struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); - struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); - - struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -@@ -443,10 +459,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna - - void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); - --static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, -- unsigned btree_id, -- unsigned flags) -+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, -+ unsigned btree_id, -+ unsigned level, -+ unsigned flags) - { -+ if (level || !btree_id_cached(trans->c, btree_id)) { -+ flags &= ~BTREE_ITER_cached; -+ flags &= ~BTREE_ITER_with_key_cache; -+ } else if (!(flags & BTREE_ITER_cached)) -+ flags |= BTREE_ITER_with_key_cache; -+ - if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && - btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_is_extents; -@@ -465,19 +488,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, - return flags; - } - --static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, -- unsigned btree_id, -- unsigned flags) --{ -- if (!btree_id_cached(trans->c, btree_id)) { -- flags &= ~BTREE_ITER_cached; -- flags &= ~BTREE_ITER_with_key_cache; -- } else if (!(flags & BTREE_ITER_cached)) -- flags |= BTREE_ITER_with_key_cache; -- -- return __bch2_btree_iter_flags(trans, btree_id, flags); --} -- - static inline void bch2_trans_iter_init_common(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -@@ -514,7 +524,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, - if (__builtin_constant_p(btree_id) && - __builtin_constant_p(flags)) - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, -- bch2_btree_iter_flags(trans, btree_id, flags), -+ bch2_btree_iter_flags(trans, btree_id, 0, flags), - _THIS_IP_); - else - bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); -@@ -593,13 +603,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, - bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ - _btree_id, _pos, _flags, KEY_TYPE_##_type)) - -+static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) -+{ -+ unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); -+ memcpy(dst_v, src_k.v, b); -+ if (unlikely(b < dst_size)) -+ memset(dst_v + b, 0, dst_size - b); -+} -+ - #define bkey_val_copy(_dst_v, _src_k) \ - do { \ -- unsigned b = min_t(unsigned, sizeof(*_dst_v), \ -- bkey_val_bytes(_src_k.k)); \ -- memcpy(_dst_v, _src_k.v, b); \ -- if (b < sizeof(*_dst_v)) \ -- memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ -+ BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ -+ __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ - } while (0) - - static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, -@@ -608,17 +623,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, - unsigned val_size, void *val) - { - struct btree_iter iter; -- struct bkey_s_c k; -- int ret; -- -- k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); -- ret = bkey_err(k); -+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); -+ int ret = bkey_err(k); - if (!ret) { -- unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); -- -- memcpy(val, k.v, b); -- if (unlikely(b < sizeof(*val))) -- memset((void *) val + b, 0, sizeof(*val) - b); -+ __bkey_val_copy(val, val_size, k); - bch2_trans_iter_exit(trans, &iter); - } - -@@ -677,12 +685,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, - bch2_btree_iter_peek(iter); - } - --static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, -+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, - struct bpos end, - unsigned flags) - { - if (!(flags & BTREE_ITER_slots)) -- return bch2_btree_iter_peek_upto(iter, end); -+ return bch2_btree_iter_peek_max(iter, end); - - if (bkey_gt(iter->pos, end)) - return bkey_s_c_null; -@@ -743,10 +751,10 @@ transaction_restart: \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ -- _ret2 ?: trans_was_restarted(_trans, _restart_count); \ -+ _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ - }) - --#define for_each_btree_key_upto_continue(_trans, _iter, \ -+#define for_each_btree_key_max_continue(_trans, _iter, \ - _end, _flags, _k, _do) \ - ({ \ - struct bkey_s_c _k; \ -@@ -754,7 +762,7 @@ transaction_restart: \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ -- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ -+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ - _end, (_flags)); \ - if (!(_k).k) \ - break; \ -@@ -768,9 +776,9 @@ transaction_restart: \ - }) - - #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ -- for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) -+ for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) - --#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ -+#define for_each_btree_key_max(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ - ({ \ - bch2_trans_begin(trans); \ -@@ -779,12 +787,12 @@ transaction_restart: \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ -- for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ -+ for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ - }) - - #define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ -- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ -+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ - SPOS_MAX, _flags, _k, _do) - - #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ -@@ -828,33 +836,33 @@ transaction_restart: \ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - --#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ -+#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ - _start, _end, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ -- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ -+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - - struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); - --#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ -+#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ -- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ -+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - --#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ -+#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ - for (; \ -- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ -+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - - #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ -- for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ -+ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ - SPOS_MAX, _flags, _k, _ret) - - #define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ -@@ -866,7 +874,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); - bch2_btree_iter_rewind(&(_iter))) - - #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ -- for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) -+ for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) - - /* - * This should not be used in a fastpath, without first trying _do in -diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c -index c1657182c275..6d25e3f85ce8 100644 ---- a/fs/bcachefs/btree_journal_iter.c -+++ b/fs/bcachefs/btree_journal_iter.c -@@ -16,6 +16,17 @@ - * operations for the regular btree iter code to use: - */ - -+static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) -+{ -+ size_t gap_size = keys->size - keys->nr; -+ -+ BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); -+ -+ if (pos >= keys->gap) -+ pos -= gap_size; -+ return pos; -+} -+ - static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) - { - size_t gap_size = keys->size - keys->nr; -@@ -61,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, - } - - /* Returns first non-overwritten key >= search key: */ --struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, -+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) - { -@@ -84,27 +95,92 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree - } - } - -+ struct bkey_i *ret = NULL; -+ rcu_read_lock(); /* for overwritten_ranges */ -+ - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) -- return NULL; -+ break; - - if (k->overwritten) { -- (*idx)++; -+ if (k->overwritten_range) -+ *idx = rcu_dereference(k->overwritten_range)->end; -+ else -+ *idx += 1; - continue; - } - -- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) -- return k->k; -+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { -+ ret = k->k; -+ break; -+ } - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; -+ rcu_read_unlock(); - goto search; - } - } - -- return NULL; -+ rcu_read_unlock(); -+ return ret; -+} -+ -+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos, -+ struct bpos end_pos, size_t *idx) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ unsigned iters = 0; -+ struct journal_key *k; -+ -+ BUG_ON(*idx > keys->nr); -+search: -+ if (!*idx) -+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); -+ -+ while (*idx && -+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { -+ (*idx)++; -+ iters++; -+ if (iters == 10) { -+ *idx = 0; -+ goto search; -+ } -+ } -+ -+ struct bkey_i *ret = NULL; -+ rcu_read_lock(); /* for overwritten_ranges */ -+ -+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { -+ if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) -+ break; -+ -+ if (k->overwritten) { -+ if (k->overwritten_range) -+ *idx = rcu_dereference(k->overwritten_range)->start - 1; -+ else -+ *idx -= 1; -+ continue; -+ } -+ -+ if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { -+ ret = k->k; -+ break; -+ } -+ -+ --(*idx); -+ iters++; -+ if (iters == 10) { -+ *idx = 0; -+ goto search; -+ } -+ } -+ -+ rcu_read_unlock(); -+ return ret; - } - - struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, -@@ -112,11 +188,12 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree - { - size_t idx = 0; - -- return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -+ return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); - } - - static void journal_iter_verify(struct journal_iter *iter) - { -+#ifdef CONFIG_BCACHEFS_DEBUG - struct journal_keys *keys = iter->keys; - size_t gap_size = keys->size - keys->nr; - -@@ -126,10 +203,10 @@ static void journal_iter_verify(struct journal_iter *iter) - if (iter->idx < keys->size) { - struct journal_key *k = keys->data + iter->idx; - -- int cmp = cmp_int(k->btree_id, iter->btree_id) ?: -- cmp_int(k->level, iter->level); -- BUG_ON(cmp < 0); -+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); -+ BUG_ON(cmp > 0); - } -+#endif - } - - static void journal_iters_fix(struct bch_fs *c) -@@ -182,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ -- .journal_seq = U32_MAX, -+ .journal_seq = U64_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); -@@ -290,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, - bkey_deleted(&keys->data[idx].k->k)); - } - -+static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) -+{ -+ struct journal_key *k = keys->data + pos; -+ size_t idx = pos_to_idx(keys, pos); -+ -+ k->overwritten = true; -+ -+ struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; -+ struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; -+ -+ bool prev_overwritten = prev && prev->overwritten; -+ bool next_overwritten = next && next->overwritten; -+ -+ struct journal_key_range_overwritten *prev_range = -+ prev_overwritten ? prev->overwritten_range : NULL; -+ struct journal_key_range_overwritten *next_range = -+ next_overwritten ? next->overwritten_range : NULL; -+ -+ BUG_ON(prev_range && prev_range->end != idx); -+ BUG_ON(next_range && next_range->start != idx + 1); -+ -+ if (prev_range && next_range) { -+ prev_range->end = next_range->end; -+ -+ keys->data[pos].overwritten_range = prev_range; -+ for (size_t i = next_range->start; i < next_range->end; i++) { -+ struct journal_key *ip = keys->data + idx_to_pos(keys, i); -+ BUG_ON(ip->overwritten_range != next_range); -+ ip->overwritten_range = prev_range; -+ } -+ -+ kfree_rcu_mightsleep(next_range); -+ } else if (prev_range) { -+ prev_range->end++; -+ k->overwritten_range = prev_range; -+ if (next_overwritten) { -+ prev_range->end++; -+ next->overwritten_range = prev_range; -+ } -+ } else if (next_range) { -+ next_range->start--; -+ k->overwritten_range = next_range; -+ if (prev_overwritten) { -+ next_range->start--; -+ prev->overwritten_range = next_range; -+ } -+ } else if (prev_overwritten || next_overwritten) { -+ struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); -+ if (!r) -+ return; -+ -+ r->start = idx - (size_t) prev_overwritten; -+ r->end = idx + 1 + (size_t) next_overwritten; -+ -+ rcu_assign_pointer(k->overwritten_range, r); -+ if (prev_overwritten) -+ prev->overwritten_range = r; -+ if (next_overwritten) -+ next->overwritten_range = r; -+ } -+} -+ - void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos pos) - { -@@ -299,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - if (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && -- bpos_eq(keys->data[idx].k->k.p, pos)) -- keys->data[idx].overwritten = true; -+ bpos_eq(keys->data[idx].k->k.p, pos) && -+ !keys->data[idx].overwritten) { -+ mutex_lock(&keys->overwrite_lock); -+ __bch2_journal_key_overwritten(keys, idx); -+ mutex_unlock(&keys->overwrite_lock); -+ } - } - - static void bch2_journal_iter_advance(struct journal_iter *iter) -@@ -314,24 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) - - static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) - { -+ struct bkey_s_c ret = bkey_s_c_null; -+ - journal_iter_verify(iter); - -+ rcu_read_lock(); - while (iter->idx < iter->keys->size) { - struct journal_key *k = iter->keys->data + iter->idx; - -- int cmp = cmp_int(k->btree_id, iter->btree_id) ?: -- cmp_int(k->level, iter->level); -- if (cmp > 0) -+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); -+ if (cmp < 0) - break; - BUG_ON(cmp); - -- if (!k->overwritten) -- return bkey_i_to_s_c(k->k); -+ if (!k->overwritten) { -+ ret = bkey_i_to_s_c(k->k); -+ break; -+ } - -- bch2_journal_iter_advance(iter); -+ if (k->overwritten_range) -+ iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); -+ else -+ bch2_journal_iter_advance(iter); - } -+ rcu_read_unlock(); - -- return bkey_s_c_null; -+ return ret; - } - - static void bch2_journal_iter_exit(struct journal_iter *iter) -@@ -382,6 +533,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter - : (level > 1 ? 1 : 16); - - iter.prefetch = false; -+ iter.fail_if_too_many_whiteouts = true; - bch2_bkey_buf_init(&tmp); - - while (nr--) { -@@ -400,6 +552,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter - struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) - { - struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; -+ size_t iters = 0; - - if (iter->prefetch && iter->journal.level) - btree_and_journal_iter_prefetch(iter); -@@ -407,6 +560,11 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * - if (iter->at_end) - return bkey_s_c_null; - -+ iters++; -+ -+ if (iters > 20 && iter->fail_if_too_many_whiteouts) -+ return bkey_s_c_null; -+ - while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_lt(btree_k.k->p, iter->pos)) - bch2_journal_iter_advance_btree(iter); -@@ -481,16 +639,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, - - /* sort and dedup all keys in the journal: */ - --void bch2_journal_entries_free(struct bch_fs *c) --{ -- struct journal_replay **i; -- struct genradix_iter iter; -- -- genradix_for_each(&c->journal_entries, iter, i) -- kvfree(*i); -- genradix_free(&c->journal_entries); --} -- - /* - * When keys compare equal, oldest compares first: - */ -@@ -515,15 +663,26 @@ void bch2_journal_keys_put(struct bch_fs *c) - - move_gap(keys, keys->nr); - -- darray_for_each(*keys, i) -+ darray_for_each(*keys, i) { -+ if (i->overwritten_range && -+ (i == &darray_last(*keys) || -+ i->overwritten_range != i[1].overwritten_range)) -+ kfree(i->overwritten_range); -+ - if (i->allocated) - kfree(i->k); -+ } - - kvfree(keys->data); - keys->data = NULL; - keys->nr = keys->gap = keys->size = 0; - -- bch2_journal_entries_free(c); -+ struct journal_replay **i; -+ struct genradix_iter iter; -+ -+ genradix_for_each(&c->journal_entries, iter, i) -+ kvfree(*i); -+ genradix_free(&c->journal_entries); - } - - static void __journal_keys_sort(struct journal_keys *keys) -@@ -628,8 +787,20 @@ void bch2_journal_keys_dump(struct bch_fs *c) - - darray_for_each(*keys, i) { - printbuf_reset(&buf); -+ prt_printf(&buf, "btree="); -+ bch2_btree_id_to_text(&buf, i->btree_id); -+ prt_printf(&buf, " l=%u ", i->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); -- pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); -+ pr_err("%s", buf.buf); - } - printbuf_exit(&buf); - } -+ -+void bch2_fs_journal_keys_init(struct bch_fs *c) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ -+ atomic_set(&keys->ref, 1); -+ keys->initial_ref_held = true; -+ mutex_init(&keys->overwrite_lock); -+} -diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h -index 1653de9d609b..2a3082919b8d 100644 ---- a/fs/bcachefs/btree_journal_iter.h -+++ b/fs/bcachefs/btree_journal_iter.h -@@ -26,16 +26,24 @@ struct btree_and_journal_iter { - struct bpos pos; - bool at_end; - bool prefetch; -+ bool fail_if_too_many_whiteouts; - }; - -+static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, -+ unsigned l_level, -+ const struct journal_key *r) -+{ -+ return -cmp_int(l_level, r->level) ?: -+ cmp_int(l_btree_id, r->btree_id); -+} -+ - static inline int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) - { -- return (cmp_int(l_btree_id, r->btree_id) ?: -- cmp_int(l_level, r->level) ?: -- bpos_cmp(l_pos, r->k->k.p)); -+ return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: -+ bpos_cmp(l_pos, r->k->k.p); - } - - static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -@@ -43,7 +51,9 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); - } - --struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, -+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos, struct bpos, size_t *); -+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); - struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, - unsigned, struct bpos); -@@ -79,8 +89,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c) - c->journal_keys.initial_ref_held = false; - } - --void bch2_journal_entries_free(struct bch_fs *); -- - int bch2_journal_keys_sort(struct bch_fs *); - - void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, -@@ -89,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, - - void bch2_journal_keys_dump(struct bch_fs *); - -+void bch2_fs_journal_keys_init(struct bch_fs *); -+ - #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ -diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h -new file mode 100644 -index 000000000000..8b773823704f ---- /dev/null -+++ b/fs/bcachefs/btree_journal_iter_types.h -@@ -0,0 +1,36 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H -+#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H -+ -+struct journal_key_range_overwritten { -+ size_t start, end; -+}; -+ -+struct journal_key { -+ u64 journal_seq; -+ u32 journal_offset; -+ enum btree_id btree_id:8; -+ unsigned level:8; -+ bool allocated; -+ bool overwritten; -+ struct journal_key_range_overwritten __rcu * -+ overwritten_range; -+ struct bkey_i *k; -+}; -+ -+struct journal_keys { -+ /* must match layout in darray_types.h */ -+ size_t nr, size; -+ struct journal_key *data; -+ /* -+ * Gap buffer: instead of all the empty space in the array being at the -+ * end of the buffer - from @nr to @size - the empty space is at @gap. -+ * This means that sequential insertions are O(n) instead of O(n^2). -+ */ -+ size_t gap; -+ atomic_t ref; -+ bool initial_ref_held; -+ struct mutex overwrite_lock; -+}; -+ -+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ -diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -index 244610b1d0b5..edce59433375 100644 ---- a/fs/bcachefs/btree_key_cache.c -+++ b/fs/bcachefs/btree_key_cache.c -@@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k - } - - if (ck) { -- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); -+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - ck->c.cached = true; - goto lock; - } -@@ -197,7 +197,9 @@ bkey_cached_reuse(struct btree_key_cache *c) - return ck; - } - --static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, -+static int btree_key_cache_create(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_path *ck_path, - struct bkey_s_c k) - { - struct bch_fs *c = trans->c; -@@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * - key_u64s = min(256U, (key_u64s * 3) / 2); - key_u64s = roundup_pow_of_two(key_u64s); - -- struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); -+ struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); - int ret = PTR_ERR_OR_ZERO(ck); - if (ret) - return ret; -@@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * - ck = bkey_cached_reuse(bc); - if (unlikely(!ck)) { - bch_err(c, "error allocating memory for key cache item, btree %s", -- bch2_btree_id_str(path->btree_id)); -+ bch2_btree_id_str(ck_path->btree_id)); - return -BCH_ERR_ENOMEM_btree_key_cache_create; - } - } - - ck->c.level = 0; -- ck->c.btree_id = path->btree_id; -- ck->key.btree_id = path->btree_id; -- ck->key.pos = path->pos; -+ ck->c.btree_id = ck_path->btree_id; -+ ck->key.btree_id = ck_path->btree_id; -+ ck->key.pos = ck_path->pos; - ck->flags = 1U << BKEY_CACHED_ACCESSED; - - if (unlikely(key_u64s > ck->u64s)) { -- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); -+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - struct bkey_i *new_k = allocate_dropping_locks(trans, ret, - kmalloc(key_u64s * sizeof(u64), _gfp)); -@@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * - - bkey_reassemble(ck->k, k); - -+ ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); -+ if (unlikely(ret)) -+ goto err; -+ - ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); -+ -+ bch2_btree_node_unlock_write(trans, path, path_l(path)->b); -+ - if (unlikely(ret)) /* raced with another fill? */ - goto err; - - atomic_long_inc(&bc->nr_keys); - six_unlock_write(&ck->c.lock); - -- enum six_lock_type lock_want = __btree_lock_want(path, 0); -+ enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); - if (lock_want == SIX_LOCK_read) - six_lock_downgrade(&ck->c.lock); -- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); -- path->uptodate = BTREE_ITER_UPTODATE; -+ btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); -+ ck_path->uptodate = BTREE_ITER_UPTODATE; - return 0; - err: - bkey_cached_free(bc, ck); -- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); -+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - return ret; - } -@@ -283,7 +292,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, - unsigned flags) - { - if (flags & BTREE_ITER_cached_nofill) { -- ck_path->uptodate = BTREE_ITER_UPTODATE; -+ ck_path->l[0].b = NULL; - return 0; - } - -@@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, - int ret; - - bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, -+ BTREE_ITER_intent| - BTREE_ITER_key_cache_fill| - BTREE_ITER_cached_nofill); - iter.flags &= ~BTREE_ITER_with_journal; -@@ -306,9 +316,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, - if (unlikely(ret)) - goto out; - -- ret = btree_key_cache_create(trans, ck_path, k); -+ ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); - if (ret) - goto err; -+ -+ if (trace_key_cache_fill_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bpos_to_text(&buf, ck_path->pos); -+ prt_char(&buf, ' '); -+ bch2_bkey_val_to_text(&buf, trans->c, k); -+ trace_key_cache_fill(trans, buf.buf); -+ printbuf_exit(&buf); -+ } - out: - /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(&iter); -@@ -424,8 +444,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, - !test_bit(JOURNAL_space_low, &c->journal.flags)) - commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - -- ret = bch2_btree_iter_traverse(&b_iter) ?: -- bch2_trans_update(trans, &b_iter, ck->k, -+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); -+ ret = bkey_err(btree_k); -+ if (ret) -+ goto err; -+ -+ /* * Check that we're not violating cache coherency rules: */ -+ BUG_ON(bkey_deleted(btree_k.k)); -+ -+ ret = bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_key_cache_reclaim| - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun) ?: -@@ -433,7 +460,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - commit_flags); -- -+err: - bch2_fs_fatal_err_on(ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && -@@ -586,8 +613,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, - bkey_cached_free(bc, ck); - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); -- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -- path->should_be_locked = false; -+ -+ struct btree_path *path2; -+ unsigned i; -+ trans_for_each_path(trans, path2, i) -+ if (path2->l[0].b == (void *) ck) { -+ __bch2_btree_path_unlock(trans, path2); -+ path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); -+ path2->should_be_locked = false; -+ btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); -+ } -+ -+ bch2_trans_verify_locks(trans); - } - - static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, -@@ -711,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) - rcu_read_unlock(); - mutex_lock(&bc->table.mutex); - mutex_unlock(&bc->table.mutex); -- rcu_read_lock(); - continue; - } - for (i = 0; i < tbl->size; i++) -diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c -index efe2a007b482..caef65adeae4 100644 ---- a/fs/bcachefs/btree_locking.c -+++ b/fs/bcachefs/btree_locking.c -@@ -7,9 +7,10 @@ - static struct lock_class_key bch2_btree_node_lock_key; - - void bch2_btree_lock_init(struct btree_bkey_cached_common *b, -- enum six_lock_init_flags flags) -+ enum six_lock_init_flags flags, -+ gfp_t gfp) - { -- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); -+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); - lockdep_set_notrack_class(&b->lock); - } - -@@ -109,6 +110,12 @@ static noinline void lock_graph_pop_all(struct lock_graph *g) - lock_graph_up(g); - } - -+static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) -+{ -+ while (g->g + g->nr > i) -+ lock_graph_up(g); -+} -+ - static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) - { - g->g[g->nr++] = (struct trans_waiting_for_lock) { -@@ -124,15 +131,20 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) - __lock_graph_down(g, trans); - } - --static bool lock_graph_remove_non_waiters(struct lock_graph *g) -+static bool lock_graph_remove_non_waiters(struct lock_graph *g, -+ struct trans_waiting_for_lock *from) - { - struct trans_waiting_for_lock *i; - -- for (i = g->g + 1; i < g->g + g->nr; i++) -+ if (from->trans->locking != from->node_want) { -+ lock_graph_pop_from(g, from); -+ return true; -+ } -+ -+ for (i = from + 1; i < g->g + g->nr; i++) - if (i->trans->locking != i->node_want || - i->trans->locking_wait.start_time != i[-1].lock_start_time) { -- while (g->g + g->nr > i) -- lock_graph_up(g); -+ lock_graph_pop_from(g, i); - return true; - } - -@@ -179,13 +191,14 @@ static int btree_trans_abort_preference(struct btree_trans *trans) - return 3; - } - --static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) -+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, -+ struct trans_waiting_for_lock *from) - { - struct trans_waiting_for_lock *i, *abort = NULL; - unsigned best = 0, pref; - int ret; - -- if (lock_graph_remove_non_waiters(g)) -+ if (lock_graph_remove_non_waiters(g, from)) - return 0; - - /* Only checking, for debugfs: */ -@@ -195,7 +208,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) - goto out; - } - -- for (i = g->g; i < g->g + g->nr; i++) { -+ for (i = from; i < g->g + g->nr; i++) { - pref = btree_trans_abort_preference(i->trans); - if (pref > best) { - abort = i; -@@ -229,8 +242,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) - ret = abort_lock(g, abort); - out: - if (ret) -- while (g->nr) -- lock_graph_up(g); -+ lock_graph_pop_all(g); -+ else -+ lock_graph_pop_from(g, abort); - return ret; - } - -@@ -243,7 +257,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, - for (i = g->g; i < g->g + g->nr; i++) - if (i->trans == trans) { - closure_put(&trans->ref); -- return break_cycle(g, cycle); -+ return break_cycle(g, cycle, i); - } - - if (g->nr == ARRAY_SIZE(g->g)) { -@@ -252,8 +266,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, - if (orig_trans->lock_may_not_fail) - return 0; - -- while (g->nr) -- lock_graph_up(g); -+ lock_graph_pop_all(g); - - if (cycle) - return 0; -@@ -281,7 +294,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) - - g.nr = 0; - -- if (trans->lock_must_abort) { -+ if (trans->lock_must_abort && !trans->lock_may_not_fail) { - if (cycle) - return -1; - -@@ -336,7 +349,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) - * structures - which means it can't be blocked - * waiting on a lock: - */ -- if (!lock_graph_remove_non_waiters(&g)) { -+ if (!lock_graph_remove_non_waiters(&g, g.g)) { - /* - * If lock_graph_remove_non_waiters() - * didn't do anything, it must be -@@ -512,7 +525,6 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) - { - struct btree *b = path->l[level].b; -- struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); - - if (!is_btree_node(path, level)) - return false; -@@ -536,24 +548,11 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, - if (race_fault()) - return false; - -- if (btree_node_locked(path, level)) { -- bool ret; -- -- six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); -- ret = six_lock_tryupgrade(&b->c.lock); -- six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); -- -- if (ret) -- goto success; -- } else { -- if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) -- goto success; -- } -+ if (btree_node_locked(path, level) -+ ? six_lock_tryupgrade(&b->c.lock) -+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) -+ goto success; - -- /* -- * Do we already have an intent lock via another path? If so, just bump -- * lock count: -- */ - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); -@@ -782,7 +781,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) - return bch2_trans_relock_fail(trans, path, &f, trace); - } - -- trans_set_locked(trans); -+ trans_set_locked(trans, true); - out: - bch2_trans_verify_locks(trans); - return 0; -@@ -818,6 +817,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans) - bch2_trans_srcu_unlock(trans); - } - -+void bch2_trans_unlock_write(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ unsigned i; -+ -+ trans_for_each_path(trans, path, i) -+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) -+ if (btree_node_write_locked(path, l)) -+ bch2_btree_node_unlock_write(trans, path, path->l[l].b); -+} -+ - int __bch2_trans_mutex_lock(struct btree_trans *trans, - struct mutex *lock) - { -@@ -856,6 +866,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) - (want == BTREE_NODE_UNLOCKED || - have != BTREE_NODE_WRITE_LOCKED) && - want != have); -+ -+ BUG_ON(btree_node_locked(path, l) && -+ path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); - } - } - -diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h -index 7c07f9fa9add..b33ab7af8440 100644 ---- a/fs/bcachefs/btree_locking.h -+++ b/fs/bcachefs/btree_locking.h -@@ -13,9 +13,10 @@ - #include "btree_iter.h" - #include "six.h" - --void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); -+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); - - void bch2_trans_unlock_noassert(struct btree_trans *); -+void bch2_trans_unlock_write(struct btree_trans *); - - static inline bool is_btree_node(struct btree_path *path, unsigned l) - { -@@ -75,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path, - path->nodes_locked |= (type + 1) << (level << 1); - } - --static inline void mark_btree_node_unlocked(struct btree_path *path, -- unsigned level) --{ -- EBUG_ON(btree_node_write_locked(path, level)); -- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); --} -- - static inline void mark_btree_node_locked(struct btree_trans *trans, - struct btree_path *path, - unsigned level, -@@ -124,19 +118,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, - - /* unlock: */ - -+void bch2_btree_node_unlock_write(struct btree_trans *, -+ struct btree_path *, struct btree *); -+ - static inline void btree_node_unlock(struct btree_trans *trans, - struct btree_path *path, unsigned level) - { - int lock_type = btree_node_locked_type(path, level); - - EBUG_ON(level >= BTREE_MAX_DEPTH); -- EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); - - if (lock_type != BTREE_NODE_UNLOCKED) { -+ if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { -+ bch2_btree_node_unlock_write(trans, path, path->l[level].b); -+ lock_type = BTREE_NODE_INTENT_LOCKED; -+ } - six_unlock_type(&path->l[level].b->c.lock, lock_type); - btree_trans_lock_hold_time_update(trans, path, level); -+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); - } -- mark_btree_node_unlocked(path, level); - } - - static inline int btree_path_lowest_level_locked(struct btree_path *path) -@@ -162,36 +162,40 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans, - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -+static inline void -+__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) -+{ -+ if (!b->c.lock.write_lock_recurse) { -+ struct btree_path *linked; -+ unsigned i; -+ -+ trans_for_each_path_with_node(trans, b, linked, i) -+ linked->l[b->c.level].lock_seq++; -+ } -+ -+ six_unlock_write(&b->c.lock); -+} -+ - static inline void - bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) - { -- struct btree_path *linked; -- unsigned i; -- - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); - EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); -- -- trans_for_each_path_with_node(trans, b, linked, i) -- linked->l[b->c.level].lock_seq++; -- -- six_unlock_write(&b->c.lock); -+ __bch2_btree_node_unlock_write(trans, b); - } - --void bch2_btree_node_unlock_write(struct btree_trans *, -- struct btree_path *, struct btree *); -- - int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); - - /* lock: */ - --static inline void trans_set_locked(struct btree_trans *trans) -+static inline void trans_set_locked(struct btree_trans *trans, bool try) - { - if (!trans->locked) { -- lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); -+ lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); - trans->locked = true; - trans->last_unlock_ip = 0; - -@@ -282,7 +286,7 @@ static inline int btree_node_lock(struct btree_trans *trans, - int ret = 0; - - EBUG_ON(level >= BTREE_MAX_DEPTH); -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (likely(six_trylock_type(&b->lock, type)) || - btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || -diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c -index 30131c3bdd97..a7f06deee13c 100644 ---- a/fs/bcachefs/btree_node_scan.c -+++ b/fs/bcachefs/btree_node_scan.c -@@ -12,6 +12,7 @@ - #include "recovery_passes.h" - - #include -+#include - #include - - struct find_btree_nodes_worker { -@@ -22,17 +23,15 @@ struct find_btree_nodes_worker { - - static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) - { -- prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", -- bch2_btree_id_str(n->btree_id), n->level, n->seq, -- n->journal_seq, n->cookie); -+ bch2_btree_id_level_to_text(out, n->btree_id, n->level); -+ prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", -+ n->seq, n->journal_seq, n->cookie); - bch2_bpos_to_text(out, n->min_key); - prt_str(out, "-"); - bch2_bpos_to_text(out, n->max_key); - - if (n->range_updated) - prt_str(out, " range updated"); -- if (n->overwritten) -- prt_str(out, " overwritten"); - - for (unsigned i = 0; i < n->nr_ptrs; i++) { - prt_char(out, ' '); -@@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r) - -found_btree_node_cmp_time(l, r); - } - -+static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) -+{ -+ return found_btree_node_cmp_pos(l, r) < 0; -+} -+ -+static inline void found_btree_node_swap(void *_l, void *_r, void *arg) -+{ -+ struct found_btree_node *l = _l; -+ struct found_btree_node *r = _r; -+ -+ swap(*l, *r); -+} -+ -+static const struct min_heap_callbacks found_btree_node_heap_cbs = { -+ .less = found_btree_node_cmp_pos_less, -+ .swp = found_btree_node_swap, -+}; -+ - static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct bio *bio, struct btree_node *bn, u64 offset) - { -@@ -159,6 +176,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - return; - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { -+ if (!c->chacha20) -+ return; -+ - struct nonce nonce = btree_nonce(&bn->keys, 0); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - -@@ -292,55 +312,48 @@ static int read_btree_nodes(struct find_btree_nodes *f) - return f->ret ?: ret; - } - --static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) -+static bool nodes_overlap(const struct found_btree_node *l, -+ const struct found_btree_node *r) - { -- while (n + 1 < end && -- found_btree_node_cmp_pos(n, n + 1) > 0) { -- swap(n[0], n[1]); -- n++; -- } -+ return (l->btree_id == r->btree_id && -+ l->level == r->level && -+ bpos_gt(l->max_key, r->min_key)); - } - - static int handle_overwrites(struct bch_fs *c, -- struct found_btree_node *start, -- struct found_btree_node *end) -+ struct found_btree_node *l, -+ found_btree_nodes *nodes_heap) - { -- struct found_btree_node *n; --again: -- for (n = start + 1; -- n < end && -- n->btree_id == start->btree_id && -- n->level == start->level && -- bpos_lt(n->min_key, start->max_key); -- n++) { -- int cmp = found_btree_node_cmp_time(start, n); -+ struct found_btree_node *r; -+ -+ while ((r = min_heap_peek(nodes_heap)) && -+ nodes_overlap(l, r)) { -+ int cmp = found_btree_node_cmp_time(l, r); - - if (cmp > 0) { -- if (bpos_cmp(start->max_key, n->max_key) >= 0) -- n->overwritten = true; -+ if (bpos_cmp(l->max_key, r->max_key) >= 0) -+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - else { -- n->range_updated = true; -- n->min_key = bpos_successor(start->max_key); -- n->range_updated = true; -- bubble_up(n, end); -- goto again; -+ r->range_updated = true; -+ r->min_key = bpos_successor(l->max_key); -+ r->range_updated = true; -+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); - } - } else if (cmp < 0) { -- BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); -+ BUG_ON(bpos_eq(l->min_key, r->min_key)); - -- start->max_key = bpos_predecessor(n->min_key); -- start->range_updated = true; -- } else if (n->level) { -- n->overwritten = true; -+ l->max_key = bpos_predecessor(r->min_key); -+ l->range_updated = true; -+ } else if (r->level) { -+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - } else { -- if (bpos_cmp(start->max_key, n->max_key) >= 0) -- n->overwritten = true; -+ if (bpos_cmp(l->max_key, r->max_key) >= 0) -+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - else { -- n->range_updated = true; -- n->min_key = bpos_successor(start->max_key); -- n->range_updated = true; -- bubble_up(n, end); -- goto again; -+ r->range_updated = true; -+ r->min_key = bpos_successor(l->max_key); -+ r->range_updated = true; -+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); - } - } - } -@@ -352,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) - { - struct find_btree_nodes *f = &c->found_btree_nodes; - struct printbuf buf = PRINTBUF; -+ found_btree_nodes nodes_heap = {}; - size_t dst; - int ret = 0; - -@@ -406,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) - bch2_print_string_as_lines(KERN_INFO, buf.buf); - } - -- dst = 0; -- darray_for_each(f->nodes, i) { -- if (i->overwritten) -- continue; -+ swap(nodes_heap, f->nodes); -+ -+ { -+ /* darray must have same layout as a heap */ -+ min_heap_char real_heap; -+ BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); -+ BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); -+ BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); -+ BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); -+ } -+ -+ min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); - -- ret = handle_overwrites(c, i, &darray_top(f->nodes)); -+ if (nodes_heap.nr) { -+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - -- BUG_ON(i->overwritten); -- f->nodes.data[dst++] = *i; -+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } -- f->nodes.nr = dst; - -- if (c->opts.verbose) { -+ while (true) { -+ ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); -+ if (ret) -+ goto err; -+ -+ if (!nodes_heap.nr) -+ break; -+ -+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); -+ if (ret) -+ goto err; -+ -+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); -+ } -+ -+ for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) -+ BUG_ON(nodes_overlap(n, n + 1)); -+ -+ if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); -+ } else { -+ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); - } - - eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); - err: -+ darray_exit(&nodes_heap); - printbuf_exit(&buf); - return ret; - } -@@ -499,7 +541,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - -- prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); -+ prt_str(&buf, "recovery "); -+ bch2_btree_id_level_to_text(&buf, btree, level); -+ prt_str(&buf, " "); - bch2_bpos_to_text(&buf, node_min); - prt_str(&buf, " - "); - bch2_bpos_to_text(&buf, node_max); -@@ -533,7 +577,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); - -- BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); -+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), -+ (struct bkey_validate_context) { -+ .from = BKEY_VALIDATE_btree_node, -+ .level = level + 1, -+ .btree = btree, -+ })); - - ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); - if (ret) -diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h -index b6c36c45d0be..2811b6857c97 100644 ---- a/fs/bcachefs/btree_node_scan_types.h -+++ b/fs/bcachefs/btree_node_scan_types.h -@@ -6,7 +6,6 @@ - - struct found_btree_node { - bool range_updated:1; -- bool overwritten:1; - u8 btree_id; - u8 level; - unsigned sectors_written; -diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c -index 9bf471fa4361..c4f524b2ca9a 100644 ---- a/fs/bcachefs/btree_trans_commit.c -+++ b/fs/bcachefs/btree_trans_commit.c -@@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) - return 0; - } - --static inline void bch2_trans_unlock_write(struct btree_trans *trans) -+static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) - { - if (likely(trans->write_locked)) { - trans_for_each_update(trans, i) -@@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - new |= 1 << BTREE_NODE_need_write; - } while (!try_cmpxchg(&b->flags, &old, new)); - -- btree_node_write_if_need(c, b, SIX_LOCK_read); -+ btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_put(trans); -@@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) - { - return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, -- trans->journal_u64s, flags); -+ trans->journal_u64s, flags, trans); - } - - #define JSET_ENTRY_LOG_U64s 4 -@@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, - struct bkey_i *new_k; - int ret; - -- bch2_trans_unlock_write(trans); -+ bch2_trans_unlock_updates_write(trans); - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); -@@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, - old, flags); - } - --static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, -- bool overwrite) -+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) - { - verify_update_old_key(trans, i); - -@@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ - return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_insert| - BTREE_TRIGGER_overwrite|flags) ?: 1; -- } else if (overwrite && !i->overwrite_trigger_run) { -+ } else if (!i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; -- } else if (!overwrite && !i->insert_trigger_run) { -+ } else if (!i->insert_trigger_run) { - i->insert_trigger_run = true; - return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; - } else { -@@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ - } - - static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, -- unsigned btree_id_start) -+ unsigned *btree_id_updates_start) - { -- for (int overwrite = 1; overwrite >= 0; --overwrite) { -- bool trans_trigger_run; -+ bool trans_trigger_run; - -- /* -- * Running triggers will append more updates to the list of updates as -- * we're walking it: -- */ -- do { -- trans_trigger_run = false; -- -- for (unsigned i = btree_id_start; -- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; -- i++) { -- if (trans->updates[i].btree_id != btree_id) -- continue; -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; - -- int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); -- if (ret < 0) -- return ret; -- if (ret) -- trans_trigger_run = true; -+ for (unsigned i = *btree_id_updates_start; -+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; -+ i++) { -+ if (trans->updates[i].btree_id < btree_id) { -+ *btree_id_updates_start = i; -+ continue; - } -- } while (trans_trigger_run); -- } -+ -+ int ret = run_one_trans_trigger(trans, trans->updates + i); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ trans_trigger_run = true; -+ } -+ } while (trans_trigger_run); -+ -+ trans_for_each_update(trans, i) -+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && -+ i->btree_id == btree_id && -+ btree_node_type_has_trans_triggers(i->bkey_type) && -+ (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; - } - - static int bch2_trans_commit_run_triggers(struct btree_trans *trans) - { -- unsigned btree_id = 0, btree_id_start = 0; -+ unsigned btree_id = 0, btree_id_updates_start = 0; - int ret = 0; - - /* -@@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) - if (btree_id == BTREE_ID_alloc) - continue; - -- while (btree_id_start < trans->nr_updates && -- trans->updates[btree_id_start].btree_id < btree_id) -- btree_id_start++; -- -- ret = run_btree_triggers(trans, btree_id, btree_id_start); -+ ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); - if (ret) - return ret; - } - -- for (unsigned idx = 0; idx < trans->nr_updates; idx++) { -- struct btree_insert_entry *i = trans->updates + idx; -- -- if (i->btree_id > BTREE_ID_alloc) -- break; -- if (i->btree_id == BTREE_ID_alloc) { -- ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); -- if (ret) -- return ret; -- break; -- } -- } -+ btree_id_updates_start = 0; -+ ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); -+ if (ret) -+ return ret; - - #ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) -@@ -609,14 +602,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) - return 0; - } - --static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) --{ -- return (struct bversion) { -- .hi = res->seq >> 32, -- .lo = (res->seq << 32) | (res->offset + offset), -- }; --} -- - static inline int - bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, -@@ -627,12 +612,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - unsigned u64s = 0; - int ret = 0; - -- bch2_trans_verify_not_unlocked(trans); -- bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); -- return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } - - /* -@@ -701,25 +685,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct jset_entry *entry = trans->journal_entries; - - percpu_down_read(&c->mark_lock); -- - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && - entry->start->k.type == KEY_TYPE_accounting) { -- BUG_ON(!trans->journal_res.ref); -- -- struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); -- -- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, -- (u64 *) entry - (u64 *) trans->journal_entries); -- BUG_ON(bversion_zero(a->k.bversion)); -- -- if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { -- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); -- if (ret) -- goto revert_fs_usage; -- } -+ ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); -+ if (ret) -+ goto revert_fs_usage; - } - percpu_up_read(&c->mark_lock); - -@@ -739,33 +712,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - goto fatal_err; - } - -- trans_for_each_update(trans, i) { -- enum bch_validate_flags invalid_flags = 0; -+ struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; - -- if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) -- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; -- -- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), -- i->bkey_type, invalid_flags); -- if (unlikely(ret)){ -- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", -- trans->fn, (void *) i->ip_allocated); -- goto fatal_err; -- } -- btree_insert_entry_checks(trans, i); -- } -+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) -+ validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { -- enum bch_validate_flags invalid_flags = 0; -- -- if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) -- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; -- - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, -- CPU_BIG_ENDIAN, invalid_flags); -+ CPU_BIG_ENDIAN, validate_context); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); -@@ -773,6 +730,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - } - } - -+ trans_for_each_update(trans, i) { -+ validate_context.level = i->level; -+ validate_context.btree = i->btree_id; -+ -+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); -+ if (unlikely(ret)){ -+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", -+ trans->fn, (void *) i->ip_allocated); -+ goto fatal_err; -+ } -+ btree_insert_entry_checks(trans, i); -+ } -+ - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - struct journal *j = &c->journal; - struct jset_entry *entry; -@@ -833,13 +803,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - entry2 != entry; - entry2 = vstruct_next(entry2)) - if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && -- entry2->start->k.type == KEY_TYPE_accounting) { -- struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); -- -- bch2_accounting_neg(a); -- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); -- bch2_accounting_neg(a); -- } -+ entry2->start->k.type == KEY_TYPE_accounting) -+ bch2_accounting_trans_commit_revert(trans, -+ bkey_i_to_accounting(entry2->start), flags); - percpu_up_read(&c->mark_lock); - return ret; - } -@@ -902,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags - if (!ret && unlikely(trans->journal_replay_not_finished)) - bch2_drop_overwrites_from_journal(trans); - -- bch2_trans_unlock_write(trans); -+ bch2_trans_unlock_updates_write(trans); - - if (!ret && trans->journal_pin) - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, -@@ -994,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - return ret; - } - --static noinline int --bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) --{ -- struct bch_fs *c = trans->c; -- int ret; -- -- if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || -- test_bit(BCH_FS_started, &c->flags)) -- return -BCH_ERR_erofs_trans_commit; -- -- ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); -- if (ret) -- return ret; -- -- bch2_write_ref_get(c, BCH_WRITE_REF_trans); -- return 0; --} -- - /* - * This is for updates done in the early part of fsck - btree_gc - before we've - * gone RW. we only add the new key to the list of keys for journal replay to -@@ -1022,6 +970,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) - { - struct bch_fs *c = trans->c; - -+ BUG_ON(current != c->recovery_task); -+ - trans_for_each_update(trans, i) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) -@@ -1047,8 +997,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - struct bch_fs *c = trans->c; - int ret = 0; - -- bch2_trans_verify_not_unlocked(trans); -- bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); -+ -+ ret = trans_maybe_inject_restart(trans, _RET_IP_); -+ if (unlikely(ret)) -+ goto out_reset; - - if (!trans->nr_updates && - !trans->journal_entries_u64s) -@@ -1058,16 +1011,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - if (ret) - goto out_reset; - -- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { -- ret = do_bch2_trans_commit_to_journal_replay(trans); -- goto out_reset; -- } -- - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { -- ret = bch2_trans_commit_get_rw_cold(trans, flags); -- if (ret) -- goto out_reset; -+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) -+ ret = do_bch2_trans_commit_to_journal_replay(trans); -+ else -+ ret = -BCH_ERR_erofs_trans_commit; -+ goto out_reset; - } - - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); -@@ -1112,8 +1062,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - } - retry: - errored_at = NULL; -- bch2_trans_verify_not_unlocked(trans); -- bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); -diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h -index 4568a41fefaf..a09cbe9cd94f 100644 ---- a/fs/bcachefs/btree_types.h -+++ b/fs/bcachefs/btree_types.h -@@ -509,10 +509,16 @@ struct btree_trans { - bool notrace_relock_fail:1; - enum bch_errcode restarted:16; - u32 restart_count; -+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS -+ u32 restart_count_this_trans; -+#endif - - u64 last_begin_time; - unsigned long last_begin_ip; - unsigned long last_restarted_ip; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ bch_stacktrace last_restarted_trace; -+#endif - unsigned long last_unlock_ip; - unsigned long srcu_lock_time; - -@@ -787,53 +793,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; - } - --static inline bool btree_node_type_is_extents(enum btree_node_type type) -+static inline bool btree_id_is_extents(enum btree_id btree) - { - const u64 mask = 0 --#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) - BCH_BTREE_IDS() - #undef x - ; - -- return BIT_ULL(type) & mask; -+ return BIT_ULL(btree) & mask; - } - --static inline bool btree_id_is_extents(enum btree_id btree) -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); -+} -+ -+static inline bool btree_type_has_snapshots(enum btree_id btree) - { -- return btree_node_type_is_extents(__btree_node_type(0, btree)); -+ const u64 mask = 0 -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) -+ BCH_BTREE_IDS() -+#undef x -+ ; -+ -+ return BIT_ULL(btree) & mask; - } - --static inline bool btree_type_has_snapshots(enum btree_id id) -+static inline bool btree_type_has_snapshot_field(enum btree_id btree) - { - const u64 mask = 0 --#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) -+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) - BCH_BTREE_IDS() - #undef x - ; - -- return BIT_ULL(id) & mask; -+ return BIT_ULL(btree) & mask; - } - --static inline bool btree_type_has_snapshot_field(enum btree_id id) -+static inline bool btree_type_has_ptrs(enum btree_id btree) - { - const u64 mask = 0 --#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) - BCH_BTREE_IDS() - #undef x - ; - -- return BIT_ULL(id) & mask; -+ return BIT_ULL(btree) & mask; - } - --static inline bool btree_type_has_ptrs(enum btree_id id) -+static inline bool btree_type_uses_write_buffer(enum btree_id btree) - { - const u64 mask = 0 --#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr) - BCH_BTREE_IDS() - #undef x - ; - -- return BIT_ULL(id) & mask; -+ return BIT_ULL(btree) & mask; - } - - struct btree_root { -diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c -index 5d809e8bd170..13d794f201a5 100644 ---- a/fs/bcachefs/btree_update.c -+++ b/fs/bcachefs/btree_update.c -@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = -- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; -+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) -@@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, - BTREE_ITER_intent| - BTREE_ITER_with_updates| - BTREE_ITER_not_extents); -- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) -@@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, - goto out; - next: - bch2_btree_iter_advance(&iter); -- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) -@@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi - int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) - { -- struct bkey_s_c k; -- int ret = 0; -- -- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); -- k = bch2_btree_iter_prev(iter); -- ret = bkey_err(k); -+ bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); -+ struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); -+ int ret = bkey_err(k); - if (ret) - goto err; - -@@ -672,27 +669,19 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - bch2_btree_insert_trans(trans, id, k, iter_flags)); - } - --int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, -- unsigned len, unsigned update_flags) -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned update_flags) - { -- struct bkey_i *k; -- -- k = bch2_trans_kmalloc(trans, sizeof(*k)); -- if (IS_ERR(k)) -- return PTR_ERR(k); -+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ int ret = PTR_ERR_OR_ZERO(k); -+ if (ret) -+ return ret; - - bkey_init(&k->k); - k->k.p = iter->pos; -- bch2_key_resize(&k->k, len); - return bch2_trans_update(trans, iter, k, update_flags); - } - --int bch2_btree_delete_at(struct btree_trans *trans, -- struct btree_iter *iter, unsigned update_flags) --{ -- return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); --} -- - int bch2_btree_delete(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - unsigned update_flags) -@@ -721,7 +710,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); -- while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { -+ while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; -@@ -794,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - return ret; - } - --int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, -- struct bpos pos, bool set) -+int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) - { - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); -@@ -804,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; -- k->k.p = pos; -+ k->k.p = iter->pos; -+ if (iter->flags & BTREE_ITER_is_extents) -+ bch2_key_resize(&k->k, 1); - -+ return bch2_trans_update(trans, iter, k, 0); -+} -+ -+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, -+ struct bpos pos, bool set) -+{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - -- ret = bch2_btree_iter_traverse(&iter) ?: -- bch2_trans_update(trans, &iter, k, 0); -+ int ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_bit_mod_iter(trans, &iter, set); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -827,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, - return bch2_trans_update_buffered(trans, btree, &k); - } - --static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) -+int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) - { -+ unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); -+ prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); -+ -+ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; -+ if (ret) -+ return ret; -+ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); -- int ret = PTR_ERR_OR_ZERO(e); -+ ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - -@@ -865,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - memcpy(l->d, buf.buf, buf.pos); - c->journal.early_journal_entries.nr += jset_u64s(u64s); - } else { -- ret = bch2_trans_commit_do(c, NULL, NULL, -- BCH_TRANS_COMMIT_lazy_rw|commit_flags, -- __bch2_trans_log_msg(trans, &buf, u64s)); -+ ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, -+ bch2_trans_log_msg(trans, &buf)); - } - err: - printbuf_exit(&buf); -diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h -index 70b3c989fac2..47d8690f01bf 100644 ---- a/fs/bcachefs/btree_update.h -+++ b/fs/bcachefs/btree_update.h -@@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, - #define BCH_TRANS_COMMIT_FLAGS() \ - x(no_enospc, "don't check for enospc") \ - x(no_check_rw, "don't attempt to take a ref on c->writes") \ -- x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ - x(no_journal_res, "don't take a journal reservation, instead " \ - "pin journal entry referred to by trans->journal_res.seq") \ - x(journal_reclaim, "operation required for journal reclaim; may return error" \ -@@ -47,8 +46,6 @@ enum bch_trans_commit_flags { - - void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); - --int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, -- unsigned, unsigned); - int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); - int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); - -@@ -66,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); - -+int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); - int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); - int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); - -@@ -128,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) - - int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); - -+int bch2_btree_write_buffer_insert_err(struct btree_trans *, -+ enum btree_id, struct bkey_i *); -+ - static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) - { -+ if (unlikely(!btree_type_uses_write_buffer(btree))) { -+ int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); -+ dump_stack(); -+ return ret; -+ } - /* - * Most updates skip the btree write buffer until journal replay is - * finished because synchronization with journal replay relies on having -@@ -161,6 +167,7 @@ void bch2_trans_commit_hook(struct btree_trans *, - struct btree_trans_commit_hook *); - int __bch2_trans_commit(struct btree_trans *, unsigned); - -+int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); - __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); - __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); - -@@ -244,7 +251,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - - static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, -- struct bkey_s_c *k, unsigned flags, -+ struct bkey_s_c *k, -+ enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) - { - struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); -@@ -261,8 +269,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str - return mut; - } - --static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, -- struct bkey_s_c *k, unsigned flags) -+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, -+ struct btree_iter *iter, struct bkey_s_c *k, -+ enum btree_iter_update_trigger_flags flags) - { - return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); - } -@@ -274,7 +283,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc - static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -- unsigned flags, unsigned type, unsigned min_bytes) -+ enum btree_iter_update_trigger_flags flags, -+ unsigned type, unsigned min_bytes) - { - struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type); -@@ -289,7 +299,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr - static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -- unsigned flags) -+ enum btree_iter_update_trigger_flags flags) - { - return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); - } -@@ -297,7 +307,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran - static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -- unsigned flags, unsigned type, unsigned min_bytes) -+ enum btree_iter_update_trigger_flags flags, -+ unsigned type, unsigned min_bytes) - { - struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); -@@ -318,7 +329,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, - static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -- unsigned flags, unsigned min_bytes) -+ enum btree_iter_update_trigger_flags flags, -+ unsigned min_bytes) - { - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); - } -@@ -326,7 +338,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans - static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -- unsigned flags) -+ enum btree_iter_update_trigger_flags flags) - { - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); - } -@@ -337,7 +349,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - - static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, -- unsigned flags, unsigned type, unsigned val_size) -+ enum btree_iter_update_trigger_flags flags, -+ unsigned type, unsigned val_size) - { - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); - int ret; -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -index d596ef93239f..e4e7c804625e 100644 ---- a/fs/bcachefs/btree_update_interior.c -+++ b/fs/bcachefs/btree_update_interior.c -@@ -58,11 +58,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - -+ bch2_bkey_buf_init(&prev); -+ bkey_init(&prev.k->k); -+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); -+ - if (b == btree_node_root(c, b)) { - if (!bpos_eq(b->data->min_key, POS_MIN)) { - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->min_key); -- need_fsck_err(trans, btree_root_bad_min_key, -+ log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); - goto topology_repair; - } -@@ -70,18 +74,14 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->max_key); -- need_fsck_err(trans, btree_root_bad_max_key, -+ log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); - goto topology_repair; - } - } - - if (!b->c.level) -- return 0; -- -- bch2_bkey_buf_init(&prev); -- bkey_init(&prev.k->k); -- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); -+ goto out; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (k.k->type != KEY_TYPE_btree_ptr_v2) -@@ -97,16 +97,16 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - bch2_topology_error(c); - - printbuf_reset(&buf); -- prt_str(&buf, "end of prev node doesn't match start of next node\n"), -- prt_printf(&buf, " in btree %s level %u node ", -- bch2_btree_id_str(b->c.btree_id), b->c.level); -+ prt_str(&buf, "end of prev node doesn't match start of next node\n in "); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n prev "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_str(&buf, "\n next "); - bch2_bkey_val_to_text(&buf, c, k); - -- need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); -+ log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto topology_repair; - } - -@@ -118,25 +118,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - bch2_topology_error(c); - - printbuf_reset(&buf); -- prt_str(&buf, "empty interior node\n"); -- prt_printf(&buf, " in btree %s level %u node ", -- bch2_btree_id_str(b->c.btree_id), b->c.level); -+ prt_str(&buf, "empty interior node\n in "); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - -- need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); -+ log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - goto topology_repair; - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - bch2_topology_error(c); - - printbuf_reset(&buf); -- prt_str(&buf, "last child node doesn't end at end of parent node\n"); -- prt_printf(&buf, " in btree %s level %u node ", -- bch2_btree_id_str(b->c.btree_id), b->c.level); -+ prt_str(&buf, "last child node doesn't end at end of parent node\n in "); -+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -+ prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n last key "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - -- need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); -+ log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); - goto topology_repair; - } - out: -@@ -146,13 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - printbuf_exit(&buf); - return ret; - topology_repair: -- if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && -- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { -- bch2_inconsistent_error(c); -- ret = -BCH_ERR_btree_need_topology_repair; -- } else { -- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); -- } -+ ret = bch2_topology_error(c); - goto out; - } - -@@ -244,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, - struct btree *b) - { - struct bch_fs *c = trans->c; -- unsigned i, level = b->c.level; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - -@@ -255,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); -- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); -+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - -- trans_for_each_path(trans, path, i) -- if (path->l[level].b == b) { -- btree_node_unlock(trans, path, level); -- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -- } -+ bch2_trans_node_drop(trans, b); - } - - static void bch2_btree_node_free_never_used(struct btree_update *as, -@@ -270,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, - { - struct bch_fs *c = as->c; - struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; -- struct btree_path *path; -- unsigned i, level = b->c.level; - - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); -@@ -293,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, - - six_unlock_intent(&b->c.lock); - -- trans_for_each_path(trans, path, i) -- if (path->l[level].b == b) { -- btree_node_unlock(trans, path, level); -- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -- } -+ bch2_trans_node_drop(trans, b); - } - - static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, -@@ -698,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as) - - b = as->old_nodes[i]; - -+ bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); -+ bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, -@@ -809,7 +794,7 @@ static void btree_update_nodes_written(struct btree_update *as) - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - six_unlock_write(&b->c.lock); - -- btree_node_write_if_need(c, b, SIX_LOCK_intent); -+ btree_node_write_if_need(trans, b, SIX_LOCK_intent); - btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path_idx, true); - } -@@ -830,7 +815,7 @@ static void btree_update_nodes_written(struct btree_update *as) - b = as->new_nodes[i]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); -- btree_node_write_if_need(c, b, SIX_LOCK_read); -+ btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - } - -@@ -1366,9 +1351,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, - if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) - bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - -- if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), -- btree_node_type(b), BCH_VALIDATE_write) ?: -- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { -+ struct bkey_validate_context from = (struct bkey_validate_context) { -+ .from = BKEY_VALIDATE_btree_node, -+ .level = b->c.level, -+ .btree = b->c.btree_id, -+ .flags = BCH_VALIDATE_commit, -+ }; -+ if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: -+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { - bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); - dump_stack(); - } -@@ -1418,15 +1408,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as, - (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) - ; - -- while (!bch2_keylist_empty(keys)) { -- insert = bch2_keylist_front(keys); -+ for (; -+ insert != keys->top && bpos_le(insert->k.p, b->key.k.p); -+ insert = bkey_next(insert)) -+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - -- if (bpos_gt(insert->k.p, b->key.k.p)) -- break; -+ if (bch2_btree_node_check_topology(trans, b)) { -+ struct printbuf buf = PRINTBUF; - -- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); -- bch2_keylist_pop_front(keys); -+ for (struct bkey_i *k = keys->keys; -+ k != insert; -+ k = bkey_next(k)) { -+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); -+ prt_newline(&buf); -+ } -+ -+ panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); - } -+ -+ memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); -+ keys->top_p -= insert->_data - keys->keys_p; - } - - static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) -@@ -1575,8 +1576,6 @@ static void btree_split_insert_keys(struct btree_update *as, - bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - - bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); -- -- BUG_ON(bch2_btree_node_check_topology(trans, b)); - } - } - -@@ -1599,8 +1598,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, - if (ret) - return ret; - -- bch2_btree_interior_update_will_free_node(as, b); -- - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { - struct btree *n[2]; - -@@ -1699,16 +1696,18 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, - if (ret) - goto err; - -+ bch2_btree_interior_update_will_free_node(as, b); -+ - if (n3) { - bch2_btree_update_get_open_buckets(as, n3); -- bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); -+ bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); - } - if (n2) { - bch2_btree_update_get_open_buckets(as, n2); -- bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); -+ bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); - } - bch2_btree_update_get_open_buckets(as, n1); -- bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); -+ bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); - - /* - * The old node must be freed (in memory) _before_ unlocking the new -@@ -1827,8 +1826,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t - - btree_update_updated_node(as, b); - bch2_btree_node_unlock_write(trans, path, b); -- -- BUG_ON(bch2_btree_node_check_topology(trans, b)); - return 0; - split: - /* -@@ -1905,7 +1902,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * - BUG_ON(ret); - - bch2_btree_update_get_open_buckets(as, n); -- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - bch2_trans_node_add(trans, path, n); - six_unlock_intent(&n->c.lock); - -@@ -1953,8 +1950,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, - u64 start_time = local_clock(); - int ret = 0; - -- bch2_trans_verify_not_in_restart(trans); -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - BUG_ON(!trans->paths[path].should_be_locked); - BUG_ON(!btree_node_locked(&trans->paths[path], level)); - -@@ -2058,9 +2054,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, - - trace_and_count(c, btree_node_merge, trans, b); - -- bch2_btree_interior_update_will_free_node(as, b); -- bch2_btree_interior_update_will_free_node(as, m); -- - n = bch2_btree_node_alloc(as, trans, b->c.level); - - SET_BTREE_NODE_SEQ(n->data, -@@ -2096,10 +2089,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, - if (ret) - goto err_free_update; - -+ bch2_btree_interior_update_will_free_node(as, b); -+ bch2_btree_interior_update_will_free_node(as, m); -+ - bch2_trans_verify_paths(trans); - - bch2_btree_update_get_open_buckets(as, n); -- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - - bch2_btree_node_free_inmem(trans, trans->paths + path, b); - bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); -@@ -2150,8 +2146,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, - if (ret) - goto out; - -- bch2_btree_interior_update_will_free_node(as, b); -- - n = bch2_btree_node_alloc_replacement(as, trans, b); - - bch2_btree_build_aux_trees(n); -@@ -2175,8 +2169,10 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, - if (ret) - goto err; - -+ bch2_btree_interior_update_will_free_node(as, b); -+ - bch2_btree_update_get_open_buckets(as, n); -- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - - bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - -@@ -2201,42 +2197,50 @@ struct async_btree_rewrite { - struct list_head list; - enum btree_id btree_id; - unsigned level; -- struct bpos pos; -- __le64 seq; -+ struct bkey_buf key; - }; - - static int async_btree_node_rewrite_trans(struct btree_trans *trans, - struct async_btree_rewrite *a) - { -- struct bch_fs *c = trans->c; - struct btree_iter iter; -- struct btree *b; -- int ret; -- -- bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, -+ bch2_trans_node_iter_init(trans, &iter, -+ a->btree_id, a->key.k->k.p, - BTREE_MAX_DEPTH, a->level, 0); -- b = bch2_btree_iter_peek_node(&iter); -- ret = PTR_ERR_OR_ZERO(b); -+ struct btree *b = bch2_btree_iter_peek_node(&iter); -+ int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - -- if (!b || b->data->keys.seq != a->seq) { -+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); -+ ret = found -+ ? bch2_btree_node_rewrite(trans, &iter, b, 0) -+ : -ENOENT; -+ -+#if 0 -+ /* Tracepoint... */ -+ if (!ret || ret == -ENOENT) { -+ struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - -- if (b) -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -- else -- prt_str(&buf, "(null"); -- bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", -- __func__, a->seq, buf.buf); -+ if (!ret) { -+ prt_printf(&buf, "rewrite node:\n "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); -+ } else { -+ prt_printf(&buf, "node to rewrite not found:\n want: "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); -+ prt_printf(&buf, "\n got: "); -+ if (b) -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -+ else -+ prt_str(&buf, "(null)"); -+ } -+ bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); -- goto out; - } -- -- ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -+#endif - out: - bch2_trans_iter_exit(trans, &iter); -- - return ret; - } - -@@ -2247,81 +2251,97 @@ static void async_btree_node_rewrite_work(struct work_struct *work) - struct bch_fs *c = a->c; - - int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); -- bch_err_fn_ratelimited(c, ret); -+ if (ret != -ENOENT) -+ bch_err_fn_ratelimited(c, ret); -+ -+ spin_lock(&c->btree_node_rewrites_lock); -+ list_del(&a->list); -+ spin_unlock(&c->btree_node_rewrites_lock); -+ -+ closure_wake_up(&c->btree_node_rewrites_wait); -+ -+ bch2_bkey_buf_exit(&a->key, c); - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); - kfree(a); - } - - void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) - { -- struct async_btree_rewrite *a; -- int ret; -- -- a = kmalloc(sizeof(*a), GFP_NOFS); -- if (!a) { -- bch_err(c, "%s: error allocating memory", __func__); -+ struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); -+ if (!a) - return; -- } - - a->c = c; - a->btree_id = b->c.btree_id; - a->level = b->c.level; -- a->pos = b->key.k.p; -- a->seq = b->data->keys.seq; - INIT_WORK(&a->work, async_btree_node_rewrite_work); - -- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { -- mutex_lock(&c->pending_node_rewrites_lock); -- list_add(&a->list, &c->pending_node_rewrites); -- mutex_unlock(&c->pending_node_rewrites_lock); -- return; -- } -+ bch2_bkey_buf_init(&a->key); -+ bch2_bkey_buf_copy(&a->key, c, &b->key); - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { -- if (test_bit(BCH_FS_started, &c->flags)) { -- bch_err(c, "%s: error getting c->writes ref", __func__); -- kfree(a); -- return; -- } -+ bool now = false, pending = false; - -- ret = bch2_fs_read_write_early(c); -- bch_err_msg(c, ret, "going read-write"); -- if (ret) { -- kfree(a); -- return; -- } -+ spin_lock(&c->btree_node_rewrites_lock); -+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && -+ bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { -+ list_add(&a->list, &c->btree_node_rewrites); -+ now = true; -+ } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { -+ list_add(&a->list, &c->btree_node_rewrites_pending); -+ pending = true; -+ } -+ spin_unlock(&c->btree_node_rewrites_lock); - -- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); -+ if (now) { -+ queue_work(c->btree_node_rewrite_worker, &a->work); -+ } else if (pending) { -+ /* bch2_do_pending_node_rewrites will execute */ -+ } else { -+ bch2_bkey_buf_exit(&a->key, c); -+ kfree(a); - } -+} - -- queue_work(c->btree_node_rewrite_worker, &a->work); -+void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) -+{ -+ closure_wait_event(&c->btree_node_rewrites_wait, -+ list_empty(&c->btree_node_rewrites)); - } - - void bch2_do_pending_node_rewrites(struct bch_fs *c) - { -- struct async_btree_rewrite *a, *n; -- -- mutex_lock(&c->pending_node_rewrites_lock); -- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { -- list_del(&a->list); -+ while (1) { -+ spin_lock(&c->btree_node_rewrites_lock); -+ struct async_btree_rewrite *a = -+ list_pop_entry(&c->btree_node_rewrites_pending, -+ struct async_btree_rewrite, list); -+ if (a) -+ list_add(&a->list, &c->btree_node_rewrites); -+ spin_unlock(&c->btree_node_rewrites_lock); -+ -+ if (!a) -+ break; - - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_node_rewrite_worker, &a->work); - } -- mutex_unlock(&c->pending_node_rewrites_lock); - } - - void bch2_free_pending_node_rewrites(struct bch_fs *c) - { -- struct async_btree_rewrite *a, *n; -+ while (1) { -+ spin_lock(&c->btree_node_rewrites_lock); -+ struct async_btree_rewrite *a = -+ list_pop_entry(&c->btree_node_rewrites_pending, -+ struct async_btree_rewrite, list); -+ spin_unlock(&c->btree_node_rewrites_lock); - -- mutex_lock(&c->pending_node_rewrites_lock); -- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { -- list_del(&a->list); -+ if (!a) -+ break; - -+ bch2_bkey_buf_exit(&a->key, c); - kfree(a); - } -- mutex_unlock(&c->pending_node_rewrites_lock); - } - - static int __bch2_btree_node_update_key(struct btree_trans *trans, -@@ -2575,8 +2595,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update - prt_printf(out, "%ps: ", (void *) as->ip_started); - bch2_trans_commit_flags_to_text(out, as->flags); - -- prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", -- bch2_btree_id_str(as->btree_id), -+ prt_str(out, " "); -+ bch2_btree_id_to_text(out, as->btree_id); -+ prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - as->update_level_start, - as->update_level_end, - bch2_btree_update_modes[as->mode], -@@ -2677,6 +2698,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) - - void bch2_fs_btree_interior_update_exit(struct bch_fs *c) - { -+ WARN_ON(!list_empty(&c->btree_node_rewrites)); -+ WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); -+ - if (c->btree_node_rewrite_worker) - destroy_workqueue(c->btree_node_rewrite_worker); - if (c->btree_interior_update_worker) -@@ -2692,8 +2716,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) - mutex_init(&c->btree_interior_update_lock); - INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - -- INIT_LIST_HEAD(&c->pending_node_rewrites); -- mutex_init(&c->pending_node_rewrites_lock); -+ INIT_LIST_HEAD(&c->btree_node_rewrites); -+ INIT_LIST_HEAD(&c->btree_node_rewrites_pending); -+ spin_lock_init(&c->btree_node_rewrites_lock); - } - - int bch2_fs_btree_interior_update_init(struct bch_fs *c) -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -index 10f400957f21..26d646e1275c 100644 ---- a/fs/bcachefs/btree_update_interior.h -+++ b/fs/bcachefs/btree_update_interior.h -@@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - unsigned level, - unsigned flags) - { -- bch2_trans_verify_not_unlocked(trans); -+ bch2_trans_verify_not_unlocked_or_in_restart(trans); - - return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, - btree_prev_sib) ?: -@@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt - { - struct bset_tree *t = bset_tree_last(b); - struct btree_node_entry *bne = max(write_block(b), -- (void *) btree_bkey_last(b, bset_tree_last(b))); -+ (void *) btree_bkey_last(b, t)); - ssize_t remaining_space = - __bch2_btree_u64s_remaining(b, bne->keys.start); - - if (unlikely(bset_written(b, bset(b, t)))) { -- if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) -+ if (b->written + block_sectors(c) <= btree_sectors(c)) - return bne; - } else { - if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && -@@ -334,6 +334,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); - struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, - struct jset_entry *, unsigned long); - -+void bch2_async_btree_node_rewrites_flush(struct bch_fs *); - void bch2_do_pending_node_rewrites(struct bch_fs *); - void bch2_free_pending_node_rewrites(struct bch_fs *); - -diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c -index 1639c60dffa0..2c09d19dd621 100644 ---- a/fs/bcachefs/btree_write_buffer.c -+++ b/fs/bcachefs/btree_write_buffer.c -@@ -19,8 +19,6 @@ - static int bch2_btree_write_buffer_journal_flush(struct journal *, - struct journal_entry_pin *, u64); - --static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); -- - static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) - { - return (cmp_int(l->hi, r->hi) ?: -@@ -266,6 +264,22 @@ static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) - BUG_ON(wb->sorted.size < wb->flushing.keys.nr); - } - -+int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, -+ enum btree_id btree, struct bkey_i *k) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ -+ prt_printf(&buf, "attempting to do write buffer update on non wb btree="); -+ bch2_btree_id_to_text(&buf, btree); -+ prt_str(&buf, "\n"); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); -+ -+ bch2_fs_inconsistent(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ return -EROFS; -+} -+ - static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - { - struct bch_fs *c = trans->c; -@@ -314,6 +328,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - darray_for_each(wb->sorted, i) { - struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - -+ if (unlikely(!btree_type_uses_write_buffer(k->btree))) { -+ ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); -+ goto err; -+ } -+ - for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) - prefetch(&wb->flushing.keys.data[n->idx]); - -@@ -481,21 +500,55 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - return ret; - } - --static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) -+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -+{ -+ struct journal_keys_to_wb dst; -+ int ret = 0; -+ -+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); -+ -+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { -+ jset_entry_for_each_key(entry, k) { -+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); -+ if (ret) -+ goto out; -+ } -+ -+ entry->type = BCH_JSET_ENTRY_btree_keys; -+ } -+out: -+ ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; -+ return ret; -+} -+ -+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) - { - struct journal *j = &c->journal; - struct journal_buf *buf; -+ bool blocked; - int ret = 0; - -- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { -+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { - ret = bch2_journal_keys_to_write_buffer(c, buf); -+ -+ if (!blocked && !ret) { -+ spin_lock(&j->lock); -+ buf->need_flush_to_write_buffer = false; -+ spin_unlock(&j->lock); -+ } -+ - mutex_unlock(&j->buf_lock); -+ -+ if (blocked) { -+ bch2_journal_unblock(j); -+ break; -+ } - } - - return ret; - } - --static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, -+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, - bool *did_work) - { - struct bch_fs *c = trans->c; -@@ -505,7 +558,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, - do { - bch2_trans_unlock(trans); - -- fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); -+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); - - *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; - -@@ -518,8 +571,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, - mutex_unlock(&wb->flushing.lock); - } while (!ret && - (fetch_from_journal_err || -- (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || -- (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); -+ (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || -+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); - - return ret; - } -@@ -600,6 +653,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, - bch2_bkey_buf_init(&tmp); - - if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { -+ if (trace_write_buffer_maybe_flush_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, referring_k); -+ trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); -+ printbuf_exit(&buf); -+ } -+ - bch2_bkey_buf_reassemble(&tmp, c, referring_k); - - if (bkey_is_btree_ptr(referring_k.k)) { -@@ -771,31 +832,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ - return ret; - } - --static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) --{ -- struct journal_keys_to_wb dst; -- int ret = 0; -- -- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); -- -- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { -- jset_entry_for_each_key(entry, k) { -- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); -- if (ret) -- goto out; -- } -- -- entry->type = BCH_JSET_ENTRY_btree_keys; -- } -- -- spin_lock(&c->journal.lock); -- buf->need_flush_to_write_buffer = false; -- spin_unlock(&c->journal.lock); --out: -- ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; -- return ret; --} -- - static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) - { - if (wb->keys.size >= new_size) -diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c -index ec7d9a59bea9..345b117a4a4a 100644 ---- a/fs/bcachefs/buckets.c -+++ b/fs/bcachefs/buckets.c -@@ -18,7 +18,9 @@ - #include "error.h" - #include "inode.h" - #include "movinggc.h" -+#include "rebalance.h" - #include "recovery.h" -+#include "recovery_passes.h" - #include "reflink.h" - #include "replicas.h" - #include "subvolume.h" -@@ -260,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, - struct printbuf buf = PRINTBUF; - int ret = 0; - -- percpu_down_read(&c->mark_lock); -- - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); - if (ret) -@@ -362,7 +362,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, - bch_info(c, "new key %s", buf.buf); - } - -- percpu_up_read(&c->mark_lock); - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); -@@ -371,8 +370,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); -- percpu_down_read(&c->mark_lock); -- - if (ret) - goto err; - -@@ -380,7 +377,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); - } - err: -- percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; - } -@@ -401,8 +397,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - BUG_ON(!sectors); - - if (gen_after(ptr->gen, b_gen)) { -- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -- ptr_gen_newer_than_bucket_gen, -+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -+ log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, -@@ -415,8 +411,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - } - - if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { -- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -- ptr_too_stale, -+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -+ log_fsck_err(trans, ptr_too_stale, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, -@@ -435,8 +431,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - } - - if (b_gen != ptr->gen) { -- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -- stale_dirty_ptr, -+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -+ log_fsck_err(trans, stale_dirty_ptr, - "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, -@@ -451,8 +447,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - } - - if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { -- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -- ptr_bucket_data_type_mismatch, -+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -+ log_fsck_err(trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, -@@ -466,8 +462,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - } - - if ((u64) *bucket_sectors + sectors > U32_MAX) { -- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -- bucket_sector_count_overflow, -+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -+ log_fsck_err(trans, bucket_sector_count_overflow, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, -@@ -485,7 +481,9 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - printbuf_exit(&buf); - return ret; - err: -+fsck_err: - bch2_dump_trans_updates(trans); -+ bch2_inconsistent_error(c); - ret = -BCH_ERR_bucket_ref_update; - goto out; - } -@@ -543,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct extent_ptr_decoded *p, - s64 sectors, enum bch_data_type ptr_data_type, -- struct bch_alloc_v4 *a) -+ struct bch_alloc_v4 *a, -+ bool insert) - { - u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : - !p->ptr.cached ? &a->dirty_sectors : -@@ -553,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, - - if (ret) - return ret; -- -- alloc_data_type_set(a, ptr_data_type); -+ if (insert) -+ alloc_data_type_set(a, ptr_data_type); - return 0; - } - -@@ -570,8 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans, - struct printbuf buf = PRINTBUF; - int ret = 0; - -- u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); -- *sectors = insert ? abs_sectors : -abs_sectors; -+ struct bkey_i_backpointer bp; -+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); -+ -+ *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (unlikely(!ca)) { -@@ -580,41 +581,36 @@ static int bch2_trigger_pointer(struct btree_trans *trans, - goto err; - } - -- struct bpos bucket; -- struct bch_backpointer bp; -- __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); -+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - - if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: -- __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); -+ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); - if (ret) - goto err; - - if (!p.ptr.cached) { -- ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); -+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } - } - - if (flags & BTREE_TRIGGER_gc) { -- percpu_down_read(&c->mark_lock); - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", - p.ptr.dev, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_trigger_pointer; -- goto err_unlock; -+ goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; -- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); -+ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); - alloc_to_bucket(g, new); - bucket_unlock(g); --err_unlock: -- percpu_up_read(&c->mark_lock); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); -@@ -951,6 +947,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - enum bch_data_type type, - unsigned sectors) - { -+ struct bch_fs *c = trans->c; - struct btree_iter iter; - int ret = 0; - -@@ -960,8 +957,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - return PTR_ERR(a); - - if (a->v.data_type && type && a->v.data_type != type) { -- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -- bucket_metadata_type_mismatch, -+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -+ log_fsck_err(trans, bucket_metadata_type_mismatch, - "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - iter.pos.inode, iter.pos.offset, a->v.gen, -@@ -979,6 +976,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } - err: -+fsck_err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -990,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * - struct bch_fs *c = trans->c; - int ret = 0; - -- percpu_down_read(&c->mark_lock); - struct bucket *g = gc_bucket(ca, b); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", - ca->dev_idx, bch2_data_type_str(data_type))) -- goto err_unlock; -+ goto err; - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g); -@@ -1004,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) -- goto err; -+ goto err_unlock; - - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) -- goto err; -+ goto err_unlock; - - g->data_type = data_type; - g->dirty_sectors += sectors; - struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bucket_unlock(g); -- percpu_up_read(&c->mark_lock); - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - return ret; --err: -- bucket_unlock(g); - err_unlock: -- percpu_up_read(&c->mark_lock); -+ bucket_unlock(g); -+err: - return -BCH_ERR_metadata_bucket_inconsistency; - } - -@@ -1155,6 +1150,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); - } - -+bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ u64 b_offset = bucket_to_sector(ca, b); -+ u64 b_end = bucket_to_sector(ca, b + 1); -+ unsigned i; -+ -+ if (!b) -+ return true; -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ u64 end = offset + (1 << layout->sb_max_size_bits); -+ -+ if (!(offset >= b_end || end <= b_offset)) -+ return true; -+ } -+ -+ for (i = 0; i < ca->journal.nr; i++) -+ if (b == ca->journal.buckets[i]) -+ return true; -+ -+ return false; -+} -+ - /* Disk reservations: */ - - #define SECTORS_CACHE 1024 -@@ -1238,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) - for_each_member_device(c, ca) { - BUG_ON(ca->buckets_nouse); - -- ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * -+ ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets_nouse) { -@@ -1264,10 +1284,15 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - bool resize = ca->bucket_gens != NULL; - int ret; - -- BUG_ON(resize && ca->buckets_nouse); -+ if (resize) -+ lockdep_assert_held(&c->state_lock); -+ -+ if (resize && ca->buckets_nouse) -+ return -BCH_ERR_no_resize_with_buckets_nouse; - -- if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, -- GFP_KERNEL|__GFP_ZERO))) { -+ bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!bucket_gens) { - ret = -BCH_ERR_ENOMEM_bucket_gens; - goto err; - } -@@ -1277,19 +1302,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - bucket_gens->nbuckets_minus_first = - bucket_gens->nbuckets - bucket_gens->first_bucket; - -- if (resize) { -- down_write(&ca->bucket_lock); -- percpu_down_write(&c->mark_lock); -- } -- - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); - - if (resize) { -- size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); -- -+ bucket_gens->nbuckets = min(bucket_gens->nbuckets, -+ old_bucket_gens->nbuckets); -+ bucket_gens->nbuckets_minus_first = -+ bucket_gens->nbuckets - bucket_gens->first_bucket; - memcpy(bucket_gens->b, - old_bucket_gens->b, -- n); -+ bucket_gens->nbuckets); - } - - rcu_assign_pointer(ca->bucket_gens, bucket_gens); -@@ -1297,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - - nbuckets = ca->mi.nbuckets; - -- if (resize) { -- percpu_up_write(&c->mark_lock); -- up_write(&ca->bucket_lock); -- } -- - ret = 0; - err: - if (bucket_gens) -diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h -index ccc78bfe2fd4..a9acdd6c0c86 100644 ---- a/fs/bcachefs/buckets.h -+++ b/fs/bcachefs/buckets.h -@@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b) - - static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) - { -- return genradix_ptr(&ca->buckets_gc, b); -+ return bucket_valid(ca, b) -+ ? genradix_ptr(&ca->buckets_gc, b) -+ : NULL; - } - - static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) - { - return rcu_dereference_check(ca->bucket_gens, -- !ca->fs || -- percpu_rwsem_is_held(&ca->fs->mark_lock) || -- lockdep_is_held(&ca->fs->state_lock) || -- lockdep_is_held(&ca->bucket_lock)); -+ lockdep_is_held(&ca->fs->state_lock)); - } - - static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) -@@ -308,26 +307,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, - enum btree_iter_update_trigger_flags); - int bch2_trans_mark_dev_sbs(struct bch_fs *); - --static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) --{ -- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -- u64 b_offset = bucket_to_sector(ca, b); -- u64 b_end = bucket_to_sector(ca, b + 1); -- unsigned i; -- -- if (!b) -- return true; -- -- for (i = 0; i < layout->nr_superblocks; i++) { -- u64 offset = le64_to_cpu(layout->sb_offset[i]); -- u64 end = offset + (1 << layout->sb_max_size_bits); -- -- if (!(offset >= b_end || end <= b_offset)) -- return true; -- } -- -- return false; --} -+bool bch2_is_superblock_bucket(struct bch_dev *, u64); - - static inline const char *bch2_data_type_str(enum bch_data_type type) - { -diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h -index 28bd09a253c8..7174047b8e92 100644 ---- a/fs/bcachefs/buckets_types.h -+++ b/fs/bcachefs/buckets_types.h -@@ -24,7 +24,7 @@ struct bucket_gens { - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; -- u8 b[]; -+ u8 b[] __counted_by(nbuckets); - }; - - struct bch_dev_usage { -diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c -index f9fb150eda70..c8a488e6b7b8 100644 ---- a/fs/bcachefs/buckets_waiting_for_journal.c -+++ b/fs/bcachefs/buckets_waiting_for_journal.c -@@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ - memset(t->d, 0, sizeof(t->d[0]) << t->bits); - } - --bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, -- u64 flushed_seq, -- unsigned dev, u64 bucket) -+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, -+ unsigned dev, u64 bucket) - { - struct buckets_waiting_for_journal_table *t; - u64 dev_bucket = (u64) dev << 56 | bucket; -- bool ret = false; -- unsigned i; -+ u64 ret = 0; - - mutex_lock(&b->lock); - t = b->t; - -- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { -+ for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); - - if (h->dev_bucket == dev_bucket) { -- ret = h->journal_seq > flushed_seq; -+ ret = h->journal_seq; - break; - } - } -diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h -index d2ae19cbe18c..365619ca44c8 100644 ---- a/fs/bcachefs/buckets_waiting_for_journal.h -+++ b/fs/bcachefs/buckets_waiting_for_journal.h -@@ -4,8 +4,8 @@ - - #include "buckets_waiting_for_journal_types.h" - --bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, -- u64, unsigned, u64); -+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, -+ unsigned, u64); - int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64, u64); - -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -index 2182b555c112..46e9e32105a9 100644 ---- a/fs/bcachefs/chardev.c -+++ b/fs/bcachefs/chardev.c -@@ -6,11 +6,11 @@ - #include "buckets.h" - #include "chardev.h" - #include "disk_accounting.h" -+#include "fsck.h" - #include "journal.h" - #include "move.h" - #include "recovery_passes.h" - #include "replicas.h" --#include "super.h" - #include "super-io.h" - #include "thread_with_file.h" - -@@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg - } - #endif - --struct fsck_thread { -- struct thread_with_stdio thr; -- struct bch_fs *c; -- struct bch_opts opts; --}; -- --static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) --{ -- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); -- kfree(thr); --} -- --static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) --{ -- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); -- struct bch_fs *c = thr->c; -- -- int ret = PTR_ERR_OR_ZERO(c); -- if (ret) -- return ret; -- -- ret = bch2_fs_start(thr->c); -- if (ret) -- goto err; -- -- if (test_bit(BCH_FS_errors_fixed, &c->flags)) { -- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); -- ret |= 1; -- } -- if (test_bit(BCH_FS_error, &c->flags)) { -- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); -- ret |= 4; -- } --err: -- bch2_fs_stop(c); -- return ret; --} -- --static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { -- .exit = bch2_fsck_thread_exit, -- .fn = bch2_fsck_offline_thread_fn, --}; -- --static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) --{ -- struct bch_ioctl_fsck_offline arg; -- struct fsck_thread *thr = NULL; -- darray_str(devs) = {}; -- long ret = 0; -- -- if (copy_from_user(&arg, user_arg, sizeof(arg))) -- return -EFAULT; -- -- if (arg.flags) -- return -EINVAL; -- -- if (!capable(CAP_SYS_ADMIN)) -- return -EPERM; -- -- for (size_t i = 0; i < arg.nr_devs; i++) { -- u64 dev_u64; -- ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); -- if (ret) -- goto err; -- -- char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); -- ret = PTR_ERR_OR_ZERO(dev_str); -- if (ret) -- goto err; -- -- ret = darray_push(&devs, dev_str); -- if (ret) { -- kfree(dev_str); -- goto err; -- } -- } -- -- thr = kzalloc(sizeof(*thr), GFP_KERNEL); -- if (!thr) { -- ret = -ENOMEM; -- goto err; -- } -- -- thr->opts = bch2_opts_empty(); -- -- if (arg.opts) { -- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); -- ret = PTR_ERR_OR_ZERO(optstr) ?: -- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); -- if (!IS_ERR(optstr)) -- kfree(optstr); -- -- if (ret) -- goto err; -- } -- -- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); -- opt_set(thr->opts, read_only, 1); -- opt_set(thr->opts, ratelimit_errors, 0); -- -- /* We need request_key() to be called before we punt to kthread: */ -- opt_set(thr->opts, nostart, true); -- -- bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); -- -- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); -- -- if (!IS_ERR(thr->c) && -- thr->c->opts.errors == BCH_ON_ERROR_panic) -- thr->c->opts.errors = BCH_ON_ERROR_ro; -- -- ret = __bch2_run_thread_with_stdio(&thr->thr); --out: -- darray_for_each(devs, i) -- kfree(*i); -- darray_exit(&devs); -- return ret; --err: -- if (thr) -- bch2_fsck_thread_exit(&thr->thr); -- pr_err("ret %s", bch2_err_str(ret)); -- goto out; --} -- - static long bch2_global_ioctl(unsigned cmd, void __user *arg) - { - long ret; -@@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, - return ret; - } - --static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) --{ -- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); -- struct bch_fs *c = thr->c; -- -- c->stdio_filter = current; -- c->stdio = &thr->thr.stdio; -- -- /* -- * XXX: can we figure out a way to do this without mucking with c->opts? -- */ -- unsigned old_fix_errors = c->opts.fix_errors; -- if (opt_defined(thr->opts, fix_errors)) -- c->opts.fix_errors = thr->opts.fix_errors; -- else -- c->opts.fix_errors = FSCK_FIX_ask; -- -- c->opts.fsck = true; -- set_bit(BCH_FS_fsck_running, &c->flags); -- -- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; -- int ret = bch2_run_online_recovery_passes(c); -- -- clear_bit(BCH_FS_fsck_running, &c->flags); -- bch_err_fn(c, ret); -- -- c->stdio = NULL; -- c->stdio_filter = NULL; -- c->opts.fix_errors = old_fix_errors; -- -- up(&c->online_fsck_mutex); -- bch2_ro_ref_put(c); -- return ret; --} -- --static const struct thread_with_stdio_ops bch2_online_fsck_ops = { -- .exit = bch2_fsck_thread_exit, -- .fn = bch2_fsck_online_thread_fn, --}; -- --static long bch2_ioctl_fsck_online(struct bch_fs *c, -- struct bch_ioctl_fsck_online arg) --{ -- struct fsck_thread *thr = NULL; -- long ret = 0; -- -- if (arg.flags) -- return -EINVAL; -- -- if (!capable(CAP_SYS_ADMIN)) -- return -EPERM; -- -- if (!bch2_ro_ref_tryget(c)) -- return -EROFS; -- -- if (down_trylock(&c->online_fsck_mutex)) { -- bch2_ro_ref_put(c); -- return -EAGAIN; -- } -- -- thr = kzalloc(sizeof(*thr), GFP_KERNEL); -- if (!thr) { -- ret = -ENOMEM; -- goto err; -- } -- -- thr->c = c; -- thr->opts = bch2_opts_empty(); -- -- if (arg.opts) { -- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); -- -- ret = PTR_ERR_OR_ZERO(optstr) ?: -- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); -- if (!IS_ERR(optstr)) -- kfree(optstr); -- -- if (ret) -- goto err; -- } -- -- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); --err: -- if (ret < 0) { -- bch_err_fn(c, ret); -- if (thr) -- bch2_fsck_thread_exit(&thr->thr); -- up(&c->online_fsck_mutex); -- bch2_ro_ref_put(c); -- } -- return ret; --} -- - #define BCH_IOCTL(_name, _argtype) \ - do { \ - _argtype i; \ -diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c -index ce8fc677bef9..23a383577d4c 100644 ---- a/fs/bcachefs/checksum.c -+++ b/fs/bcachefs/checksum.c -@@ -2,6 +2,7 @@ - #include "bcachefs.h" - #include "checksum.h" - #include "errcode.h" -+#include "error.h" - #include "super.h" - #include "super-io.h" - -@@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, - if (!bch2_csum_type_is_encryption(type)) - return 0; - -+ if (bch2_fs_inconsistent_on(!c->chacha20, -+ c, "attempting to encrypt without encryption key")) -+ return -BCH_ERR_no_encryption_key; -+ - return do_encrypt(c->chacha20, nonce, data, len); - } - -@@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, - size_t sgl_len = 0; - int ret = 0; - -- if (!bch2_csum_type_is_encryption(type)) -- return 0; -+ if (bch2_fs_inconsistent_on(!c->chacha20, -+ c, "attempting to encrypt without encryption key")) -+ return -BCH_ERR_no_encryption_key; - - darray_init(&sgl); - -diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h -index e40499fde9a4..43b9d71f2f2b 100644 ---- a/fs/bcachefs/checksum.h -+++ b/fs/bcachefs/checksum.h -@@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool); - void bch2_fs_encryption_exit(struct bch_fs *); - int bch2_fs_encryption_init(struct bch_fs *); - --static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, -+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, - bool data) - { - switch (type) { -diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -index 1410365a8891..114bf2f3879f 100644 ---- a/fs/bcachefs/compress.c -+++ b/fs/bcachefs/compress.c -@@ -2,13 +2,34 @@ - #include "bcachefs.h" - #include "checksum.h" - #include "compress.h" -+#include "error.h" - #include "extents.h" -+#include "io_write.h" -+#include "opts.h" - #include "super-io.h" - - #include - #include - #include - -+static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) -+{ -+ switch (type) { -+ case BCH_COMPRESSION_TYPE_none: -+ case BCH_COMPRESSION_TYPE_incompressible: -+ return BCH_COMPRESSION_OPT_none; -+ case BCH_COMPRESSION_TYPE_lz4_old: -+ case BCH_COMPRESSION_TYPE_lz4: -+ return BCH_COMPRESSION_OPT_lz4; -+ case BCH_COMPRESSION_TYPE_gzip: -+ return BCH_COMPRESSION_OPT_gzip; -+ case BCH_COMPRESSION_TYPE_zstd: -+ return BCH_COMPRESSION_OPT_zstd; -+ default: -+ BUG(); -+ } -+} -+ - /* Bounce buffer: */ - struct bbuf { - void *b; -@@ -158,6 +179,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *workspace; - int ret; - -+ enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); -+ mempool_t *workspace_pool = &c->compress_workspace[opt]; -+ if (unlikely(!mempool_initialized(workspace_pool))) { -+ if (fsck_err(c, compression_type_not_marked_in_sb, -+ "compression type %s set but not marked in superblock", -+ __bch2_compression_types[crc.compression_type])) -+ ret = bch2_check_set_has_compressed_data(c, opt); -+ else -+ ret = -BCH_ERR_compression_workspace_not_initialized; -+ if (ret) -+ goto out; -+ } -+ - src_data = bio_map_or_bounce(c, src, READ); - - switch (crc.compression_type) { -@@ -176,13 +210,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - .avail_out = dst_len, - }; - -- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); -+ workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - zlib_set_workspace(&strm, workspace); - zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); - -- mempool_free(workspace, &c->decompress_workspace); -+ mempool_free(workspace, workspace_pool); - - if (ret != Z_STREAM_END) - goto err; -@@ -195,14 +229,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - if (real_src_len > src_len - 4) - goto err; - -- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); -+ workspace = mempool_alloc(workspace_pool, GFP_NOFS); - ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - - ret = zstd_decompress_dctx(ctx, - dst_data, dst_len, - src_data.b + 4, real_src_len); - -- mempool_free(workspace, &c->decompress_workspace); -+ mempool_free(workspace, workspace_pool); - - if (ret != dst_len) - goto err; -@@ -212,6 +246,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - BUG(); - } - ret = 0; -+fsck_err: - out: - bio_unmap_or_unbounce(c, src_data); - return ret; -@@ -220,11 +255,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - goto out; - } - --int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, -- struct bch_extent_crc_unpacked *crc) -+int bch2_bio_uncompress_inplace(struct bch_write_op *op, -+ struct bio *bio) - { -+ struct bch_fs *c = op->c; -+ struct bch_extent_crc_unpacked *crc = &op->crc; - struct bbuf data = { NULL }; - size_t dst_len = crc->uncompressed_size << 9; -+ int ret = 0; - - /* bio must own its pages: */ - BUG_ON(!bio->bi_vcnt); -@@ -232,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { -- bch_err(c, "error rewriting existing data: extent too big"); -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error(&buf, op); -+ prt_printf(&buf, "error rewriting existing data: extent too big"); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); - return -EIO; - } - - data = __bounce_alloc(c, dst_len, WRITE); - - if (__bio_uncompress(c, bio, data.b, *crc)) { -- if (!c->opts.no_data_io) -- bch_err(c, "error rewriting existing data: decompression error"); -- bio_unmap_or_unbounce(c, data); -- return -EIO; -+ if (!c->opts.no_data_io) { -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error(&buf, op); -+ prt_printf(&buf, "error rewriting existing data: decompression error"); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ ret = -EIO; -+ goto err; - } - - /* -@@ -259,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - crc->uncompressed_size = crc->live_size; - crc->offset = 0; - crc->csum = (struct bch_csum) { 0, 0 }; -- -+err: - bio_unmap_or_unbounce(c, data); -- return 0; -+ return ret; - } - - int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, -@@ -394,8 +441,21 @@ static unsigned __bio_compress(struct bch_fs *c, - unsigned pad; - int ret = 0; - -- BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); -- BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); -+ /* bch2_compression_decode catches unknown compression types: */ -+ BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); -+ -+ mempool_t *workspace_pool = &c->compress_workspace[compression.type]; -+ if (unlikely(!mempool_initialized(workspace_pool))) { -+ if (fsck_err(c, compression_opt_not_marked_in_sb, -+ "compression opt %s set but not marked in superblock", -+ bch2_compression_opts[compression.type])) { -+ ret = bch2_check_set_has_compressed_data(c, compression.type); -+ if (ret) /* memory allocation failure, don't compress */ -+ return 0; -+ } else { -+ return 0; -+ } -+ } - - /* If it's only one block, don't bother trying to compress: */ - if (src->bi_iter.bi_size <= c->opts.block_size) -@@ -404,7 +464,7 @@ static unsigned __bio_compress(struct bch_fs *c, - dst_data = bio_map_or_bounce(c, dst, WRITE); - src_data = bio_map_or_bounce(c, src, READ); - -- workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); -+ workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - *src_len = src->bi_iter.bi_size; - *dst_len = dst->bi_iter.bi_size; -@@ -447,7 +507,7 @@ static unsigned __bio_compress(struct bch_fs *c, - *src_len = round_down(*src_len, block_bytes(c)); - } - -- mempool_free(workspace, &c->compress_workspace[compression_type]); -+ mempool_free(workspace, workspace_pool); - - if (ret) - goto err; -@@ -477,6 +537,9 @@ static unsigned __bio_compress(struct bch_fs *c, - err: - ret = BCH_COMPRESSION_TYPE_incompressible; - goto out; -+fsck_err: -+ ret = 0; -+ goto out; - } - - unsigned bch2_bio_compress(struct bch_fs *c, -@@ -559,7 +622,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) - { - unsigned i; - -- mempool_exit(&c->decompress_workspace); - for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) - mempool_exit(&c->compress_workspace[i]); - mempool_exit(&c->compression_bounce[WRITE]); -@@ -568,7 +630,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) - - static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) - { -- size_t decompress_workspace_size = 0; - ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), - c->opts.encoded_extent_max); - -@@ -576,19 +637,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) - - struct { - unsigned feature; -- enum bch_compression_type type; -+ enum bch_compression_opts type; - size_t compress_workspace; -- size_t decompress_workspace; - } compression_types[] = { -- { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, -- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), -- 0 }, -- { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, -- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), -- zlib_inflate_workspacesize(), }, -- { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, -- c->zstd_workspace_size, -- zstd_dctx_workspace_bound() }, -+ { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, -+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, -+ { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, -+ max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), -+ zlib_inflate_workspacesize()) }, -+ { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, -+ max(c->zstd_workspace_size, -+ zstd_dctx_workspace_bound()) }, - }, *i; - bool have_compressed = false; - -@@ -613,9 +672,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) { -- decompress_workspace_size = -- max(decompress_workspace_size, i->decompress_workspace); -- - if (!(features & (1 << i->feature))) - continue; - -@@ -628,11 +684,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) - return -BCH_ERR_ENOMEM_compression_workspace_init; - } - -- if (!mempool_initialized(&c->decompress_workspace) && -- mempool_init_kvmalloc_pool(&c->decompress_workspace, -- 1, decompress_workspace_size)) -- return -BCH_ERR_ENOMEM_decompression_workspace_init; -- - return 0; - } - -diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h -index 607fd5e232c9..bec2f05bfd52 100644 ---- a/fs/bcachefs/compress.h -+++ b/fs/bcachefs/compress.h -@@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) - return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; - } - --int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, -- struct bch_extent_crc_unpacked *); -+struct bch_write_op; -+int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); - int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc_unpacked); - unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, -diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h -index 8f4c3f0665c4..c6151495985f 100644 ---- a/fs/bcachefs/darray.h -+++ b/fs/bcachefs/darray.h -@@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); - for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) - - #define darray_for_each_reverse(_d, _i) \ -- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) -+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) - - #define darray_init(_d) \ - do { \ -diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c -index 8e75a852b358..642fbc60ecab 100644 ---- a/fs/bcachefs/data_update.c -+++ b/fs/bcachefs/data_update.c -@@ -91,15 +91,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc - return true; - } - --static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) -+static noinline void trace_move_extent_finish2(struct data_update *u, -+ struct bkey_i *new, -+ struct bkey_i *insert) - { -- if (trace_move_extent_finish_enabled()) { -- struct printbuf buf = PRINTBUF; -+ struct bch_fs *c = u->op.c; -+ struct printbuf buf = PRINTBUF; - -- bch2_bkey_val_to_text(&buf, c, k); -- trace_move_extent_finish(c, buf.buf); -- printbuf_exit(&buf); -- } -+ prt_newline(&buf); -+ -+ bch2_data_update_to_text(&buf, u); -+ prt_newline(&buf); -+ -+ prt_str_indented(&buf, "new replicas:\t"); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); -+ prt_newline(&buf); -+ -+ prt_str_indented(&buf, "insert:\t"); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); -+ prt_newline(&buf); -+ -+ trace_move_extent_finish(c, buf.buf); -+ printbuf_exit(&buf); - } - - static void trace_move_extent_fail2(struct data_update *m, -@@ -110,11 +123,8 @@ static void trace_move_extent_fail2(struct data_update *m, - { - struct bch_fs *c = m->op.c; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); -- const union bch_extent_entry *entry; -- struct bch_extent_ptr *ptr; -- struct extent_ptr_decoded p; - struct printbuf buf = PRINTBUF; -- unsigned i, rewrites_found = 0; -+ unsigned rewrites_found = 0; - - if (!trace_move_extent_fail_enabled()) - return; -@@ -122,27 +132,25 @@ static void trace_move_extent_fail2(struct data_update *m, - prt_str(&buf, msg); - - if (insert) { -- i = 0; -+ const union bch_extent_entry *entry; -+ struct bch_extent_ptr *ptr; -+ struct extent_ptr_decoded p; -+ -+ unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { -- if (((1U << i) & m->data_opts.rewrite_ptrs) && -+ if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) -- rewrites_found |= 1U << i; -- i++; -+ rewrites_found |= ptr_bit; -+ ptr_bit <<= 1; - } - } - -- prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", -- (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, -- (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, -- (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, -- (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); -+ prt_str(&buf, "rewrites found:\t"); -+ bch2_prt_u64_base2(&buf, rewrites_found); -+ prt_newline(&buf); - -- prt_printf(&buf, "\nrewrites found: %u%u%u%u", -- (rewrites_found & (1 << 0)) != 0, -- (rewrites_found & (1 << 1)) != 0, -- (rewrites_found & (1 << 2)) != 0, -- (rewrites_found & (1 << 3)) != 0); -+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); -@@ -194,7 +202,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - struct bpos next_pos; - bool should_check_enospc; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; -- unsigned rewrites_found = 0, durability, i; -+ unsigned rewrites_found = 0, durability, ptr_bit; - - bch2_trans_begin(trans); - -@@ -231,16 +239,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - * - * Fist, drop rewrite_ptrs from @new: - */ -- i = 0; -+ ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { -- if (((1U << i) & m->data_opts.rewrite_ptrs) && -+ if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) { - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), ptr); -- rewrites_found |= 1U << i; -+ rewrites_found |= ptr_bit; - } -- i++; -+ ptr_bit <<= 1; - } - - if (m->data_opts.rewrite_ptrs && -@@ -323,12 +331,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - * it's been hard to reproduce, so this should give us some more - * information when it does occur: - */ -- int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), -- BCH_VALIDATE_commit); -+ int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), -+ (struct bkey_validate_context) { -+ .btree = m->btree_id, -+ .flags = BCH_VALIDATE_commit, -+ }); - if (invalid) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "about to insert invalid key in data update path"); -+ prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); -@@ -362,7 +374,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - k.k->p, bkey_start_pos(&insert->k)) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: -- bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: -+ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: - bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, &op->res, -@@ -374,7 +386,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - bch2_btree_iter_set_pos(&iter, next_pos); - - this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); -- trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); -+ if (trace_move_extent_finish_enabled()) -+ trace_move_extent_finish2(m, &new->k_i, insert); - } - err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -527,34 +540,38 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct data_update_opts *data_opts) - { - printbuf_tabstop_push(out, 20); -- prt_str(out, "rewrite ptrs:\t"); -+ -+ prt_str_indented(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - -- prt_str(out, "kill ptrs:\t"); -+ prt_str_indented(out, "kill ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - -- prt_str(out, "target:\t"); -+ prt_str_indented(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - -- prt_str(out, "compression:\t"); -- bch2_compression_opt_to_text(out, background_compression(*io_opts)); -+ prt_str_indented(out, "compression:\t"); -+ bch2_compression_opt_to_text(out, io_opts->background_compression); - prt_newline(out); - -- prt_str(out, "opts.replicas:\t"); -+ prt_str_indented(out, "opts.replicas:\t"); - prt_u64(out, io_opts->data_replicas); -+ prt_newline(out); - -- prt_str(out, "extra replicas:\t"); -+ prt_str_indented(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); - } - - void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) - { -- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); -- prt_newline(out); - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); -+ prt_newline(out); -+ -+ prt_str_indented(out, "old key:\t"); -+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - } - - int bch2_extent_drop_ptrs(struct btree_trans *trans, -@@ -614,7 +631,7 @@ int bch2_data_update_init(struct btree_trans *trans, - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; -- unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; -+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; - int ret = 0; - - /* -@@ -622,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans, - * and we have to check for this because we go rw before repairing the - * snapshots table - just skip it, we can move it later. - */ -- if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) -+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; - - if (!bkey_get_dev_refs(c, k)) -@@ -652,22 +669,22 @@ int bch2_data_update_init(struct btree_trans *trans, - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| - m->data_opts.write_flags; -- m->op.compression_opt = background_compression(io_opts); -+ m->op.compression_opt = io_opts.background_compression; - m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - - unsigned durability_have = 0, durability_removing = 0; - -- i = 0; -+ unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (!p.ptr.cached) { - rcu_read_lock(); -- if (BIT(i) & m->data_opts.rewrite_ptrs) { -+ if (ptr_bit & m->data_opts.rewrite_ptrs) { - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); -- } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { -+ } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); - } -@@ -687,7 +704,7 @@ int bch2_data_update_init(struct btree_trans *trans, - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - m->op.incompressible = true; - -- i++; -+ ptr_bit <<= 1; - } - - unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); -@@ -750,14 +767,14 @@ int bch2_data_update_init(struct btree_trans *trans, - void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- unsigned i = 0; -+ unsigned ptr_bit = 1; - - bkey_for_each_ptr(ptrs, ptr) { -- if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { -- opts->kill_ptrs |= 1U << i; -- opts->rewrite_ptrs ^= 1U << i; -+ if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { -+ opts->kill_ptrs |= ptr_bit; -+ opts->rewrite_ptrs ^= ptr_bit; - } - -- i++; -+ ptr_bit <<= 1; - } - } -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -index 45aec1afdb0e..55333e82d1fe 100644 ---- a/fs/bcachefs/debug.c -+++ b/fs/bcachefs/debug.c -@@ -20,6 +20,7 @@ - #include "extents.h" - #include "fsck.h" - #include "inode.h" -+#include "journal_reclaim.h" - #include "super.h" - - #include -@@ -472,7 +473,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - -- prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); -+ prt_printf(out, "%px ", b); -+ bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); -+ prt_printf(out, "\n"); - - printbuf_indent_add(out, 2); - -diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c -index faffc98d5605..600eee936f13 100644 ---- a/fs/bcachefs/dirent.c -+++ b/fs/bcachefs/dirent.c -@@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { - }; - - int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr d_name = bch2_dirent_get_name(d); -@@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - * Check new keys don't exceed the max length - * (older keys may be larger.) - */ -- bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, -+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, - c, dirent_name_too_long, - "dirent name too big (%u > %u)", - d_name.len, BCH_NAME_MAX); -@@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - } else { - target->subvol = le32_to_cpu(d.v->d_child_subvol); - -- ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); -+ ret = bch2_subvolume_get(trans, target->subvol, true, &s); - - target->inum = le64_to_cpu(s.inode); - } -@@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 - struct bkey_s_c k; - int ret; - -- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, -+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir, 0, snapshot), - POS(dir, U64_MAX), 0, k, ret) - if (k.k->type == KEY_TYPE_dirent) { -@@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) - bch2_bkey_buf_init(&sk); - - int ret = bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, -+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, - POS(inum.inum, ctx->pos), - POS(inum.inum, U64_MAX), - inum.subvol, 0, k, ({ -diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -index 53ad99666022..362b3b2f2f2e 100644 ---- a/fs/bcachefs/dirent.h -+++ b/fs/bcachefs/dirent.h -@@ -4,10 +4,10 @@ - - #include "str_hash.h" - --enum bch_validate_flags; - extern const struct bch_hash_desc bch2_dirent_hash_desc; - --int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ -diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c -index 07eb8fa1b026..b32e91ba8be8 100644 ---- a/fs/bcachefs/disk_accounting.c -+++ b/fs/bcachefs/disk_accounting.c -@@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ - memcpy_u64s_small(acc->v.d, d, nr); - } - -+static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); -+ - int bch2_disk_accounting_mod(struct btree_trans *trans, - struct disk_accounting_pos *k, - s64 *d, unsigned nr, bool gc) -@@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, - - accounting_key_init(&k_i.k, k, d, nr); - -- return likely(!gc) -- ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) -- : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); -+ if (unlikely(gc)) { -+ int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); -+ if (ret == -BCH_ERR_btree_insert_need_mark_replicas) -+ ret = drop_locks_do(trans, -+ bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: -+ bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); -+ return ret; -+ } else { -+ return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); -+ } - } - - int bch2_mod_dev_cached_sectors(struct btree_trans *trans, -@@ -127,14 +136,15 @@ static inline bool is_zero(char *start, char *end) - #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) - - int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - void *end = &acc_k + 1; - int ret = 0; - -- bkey_fsck_err_on(bversion_zero(k.k->bversion), -+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && -+ bversion_zero(k.k->bversion), - c, accounting_key_version_0, - "accounting key with version=0"); - -@@ -217,7 +227,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po - prt_printf(out, "id=%u", k->snapshot.id); - break; - case BCH_DISK_ACCOUNTING_btree: -- prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); -+ prt_str(out, "btree="); -+ bch2_btree_id_to_text(out, k->btree.id); - break; - } - } -@@ -243,10 +254,10 @@ void bch2_accounting_swab(struct bkey_s k) - } - - static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, -- struct disk_accounting_pos acc) -+ struct disk_accounting_pos *acc) - { -- unsafe_memcpy(r, &acc.replicas, -- replicas_entry_bytes(&acc.replicas), -+ unsafe_memcpy(r, &acc->replicas, -+ replicas_entry_bytes(&acc->replicas), - "variable length struct"); - } - -@@ -257,7 +268,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_replicas: -- __accounting_to_replicas(r, acc_k); -+ __accounting_to_replicas(r, &acc_k); - return true; - default: - return false; -@@ -322,6 +333,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); -+ -+ if (trace_accounting_mem_insert_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_accounting_to_text(&buf, c, a.s_c); -+ trace_accounting_mem_insert(c, buf.buf); -+ printbuf_exit(&buf); -+ } - return 0; - err: - free_percpu(n.v[1]); -@@ -461,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc - return ret; - } - --void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) --{ -- struct bch_accounting_mem *acc = &c->accounting; -- -- percpu_down_read(&c->mark_lock); -- out->atomic++; -- -- eytzinger0_for_each(i, acc->k.nr) { -- struct disk_accounting_pos acc_k; -- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); -- -- bch2_accounting_key_to_text(out, &acc_k); -- -- u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; -- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); -- -- prt_str(out, ":"); -- for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) -- prt_printf(out, " %llu", v[j]); -- prt_newline(out); -- } -- -- --out->atomic; -- percpu_up_read(&c->mark_lock); --} -- - static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) - { - darray_for_each(acc->k, e) { -@@ -625,7 +618,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - switch (acc.type) { - case BCH_DISK_ACCOUNTING_replicas: { - struct bch_replicas_padded r; -- __accounting_to_replicas(&r.e, acc); -+ __accounting_to_replicas(&r.e, &acc); - - for (unsigned i = 0; i < r.e.nr_devs; i++) - if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && -@@ -699,11 +692,45 @@ int bch2_accounting_read(struct bch_fs *c) - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - -- int ret = for_each_btree_key(trans, iter, -- BTREE_ID_accounting, POS_MIN, -+ /* -+ * We might run more than once if we rewind to start topology repair or -+ * btree node scan - and those might cause us to get different results, -+ * so we can't just skip if we've already run. -+ * -+ * Instead, zero out any accounting we have: -+ */ -+ percpu_down_write(&c->mark_lock); -+ darray_for_each(acc->k, e) -+ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); -+ for_each_member_device(c, ca) -+ percpu_memset(ca->usage, 0, sizeof(*ca->usage)); -+ percpu_memset(c->usage, 0, sizeof(*c->usage)); -+ percpu_up_write(&c->mark_lock); -+ -+ struct btree_iter iter; -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, -+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); -+ iter.flags &= ~BTREE_ITER_with_journal; -+ int ret = for_each_btree_key_continue(trans, iter, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); -+ -+ if (k.k->type != KEY_TYPE_accounting) -+ continue; -+ -+ struct disk_accounting_pos acc_k; -+ bpos_to_disk_accounting_pos(&acc_k, k.k->p); -+ -+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) -+ break; -+ -+ if (!bch2_accounting_is_mem(acc_k)) { -+ struct disk_accounting_pos next = { .type = acc_k.type + 1 }; -+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); -+ continue; -+ } -+ - accounting_read_key(trans, k); - })); - if (ret) -@@ -715,6 +742,12 @@ int bch2_accounting_read(struct bch_fs *c) - - darray_for_each(*keys, i) { - if (i->k->k.type == KEY_TYPE_accounting) { -+ struct disk_accounting_pos acc_k; -+ bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); -+ -+ if (!bch2_accounting_is_mem(acc_k)) -+ continue; -+ - struct bkey_s_c k = bkey_i_to_s_c(i->k); - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, - sizeof(acc->k.data[0]), -@@ -748,15 +781,16 @@ int bch2_accounting_read(struct bch_fs *c) - keys->gap = keys->nr = dst - keys->data; - - percpu_down_write(&c->mark_lock); -- unsigned i = 0; -- while (i < acc->k.nr) { -- unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); - -+ darray_for_each_reverse(acc->k, i) { - struct disk_accounting_pos acc_k; -- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); -+ bpos_to_disk_accounting_pos(&acc_k, i->pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; -- bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); -+ memset(v, 0, sizeof(v)); -+ -+ for (unsigned j = 0; j < i->nr_counters; j++) -+ v[j] = percpu_u64_get(i->v[0] + j); - - /* - * If the entry counters are zeroed, it should be treated as -@@ -765,26 +799,25 @@ int bch2_accounting_read(struct bch_fs *c) - * Remove it, so that if it's re-added it gets re-marked in the - * superblock: - */ -- ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) -+ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry -- : bch2_disk_accounting_validate_late(trans, acc_k, -- v, acc->k.data[idx].nr_counters); -+ : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); - - if (ret == -BCH_ERR_remove_disk_accounting_entry) { -- free_percpu(acc->k.data[idx].v[0]); -- free_percpu(acc->k.data[idx].v[1]); -- darray_remove_item(&acc->k, &acc->k.data[idx]); -- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), -- accounting_pos_cmp, NULL); -+ free_percpu(i->v[0]); -+ free_percpu(i->v[1]); -+ darray_remove_item(&acc->k, i); - ret = 0; - continue; - } - - if (ret) - goto fsck_err; -- i++; - } - -+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), -+ accounting_pos_cmp, NULL); -+ - preempt_disable(); - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); - -@@ -804,7 +837,7 @@ int bch2_accounting_read(struct bch_fs *c) - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); -- struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); - if (ca) { - struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; - percpu_u64_set(&d->buckets, v[0]); -@@ -881,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c) - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) -- continue; -+ break; - -- if (acc_k.type == BCH_DISK_ACCOUNTING_inum) -+ if (!bch2_accounting_is_mem(acc_k)) { -+ struct disk_accounting_pos next = { .type = acc_k.type + 1 }; -+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); - continue; -+ } - - bch2_accounting_mem_read(c, k.k->p, v, nr); - -@@ -910,7 +946,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - rcu_read_lock(); -- struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) { - rcu_read_unlock(); - continue; -diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h -index 4ea6c8a092bc..f4372cafea2e 100644 ---- a/fs/bcachefs/disk_accounting.h -+++ b/fs/bcachefs/disk_accounting.h -@@ -2,6 +2,7 @@ - #ifndef _BCACHEFS_DISK_ACCOUNTING_H - #define _BCACHEFS_DISK_ACCOUNTING_H - -+#include "btree_update.h" - #include "eytzinger.h" - #include "sb-members.h" - -@@ -62,27 +63,32 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage - - static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) - { -- acc->_pad = p; -+ BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); -+ - #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -- bch2_bpos_swab(&acc->_pad); -+ acc->_pad = p; -+#else -+ memcpy_swab(acc, &p, sizeof(p)); - #endif - } - --static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) -+static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) - { -- struct bpos ret = k->_pad; -- -+ struct bpos p; - #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -- bch2_bpos_swab(&ret); -+ p = acc->_pad; -+#else -+ memcpy_swab(&p, acc, sizeof(p)); - #endif -- return ret; -+ return p; - } - - int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, - s64 *, unsigned, bool); - int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); - --int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); - void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - void bch2_accounting_swab(struct bkey_s); -@@ -112,6 +118,12 @@ enum bch_accounting_mode { - int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); - void bch2_accounting_mem_gc(struct bch_fs *); - -+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) -+{ -+ return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && -+ acc.type != BCH_DISK_ACCOUNTING_inum; -+} -+ - /* - * Update in memory counters so they match the btree update we're doing; called - * from transaction commit path -@@ -126,9 +138,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - bool gc = mode == BCH_ACCOUNTING_gc; - -- EBUG_ON(gc && !acc->gc_running); -+ if (gc && !acc->gc_running) -+ return 0; - -- if (acc_k.type == BCH_DISK_ACCOUNTING_inum) -+ if (!bch2_accounting_is_mem(acc_k)) - return 0; - - if (mode == BCH_ACCOUNTING_normal) { -@@ -141,7 +154,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); -- struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); -@@ -197,16 +210,54 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * - static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, - u64 *v, unsigned nr) - { -+ percpu_down_read(&c->mark_lock); - struct bch_accounting_mem *acc = &c->accounting; - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p); - - bch2_accounting_mem_read_counters(acc, idx, v, nr, false); -+ percpu_up_read(&c->mark_lock); -+} -+ -+static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -+{ -+ EBUG_ON(!res->ref); -+ -+ return (struct bversion) { -+ .hi = res->seq >> 32, -+ .lo = (res->seq << 32) | (res->offset + offset), -+ }; -+} -+ -+static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, -+ struct bkey_i_accounting *a, -+ unsigned commit_flags) -+{ -+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, -+ (u64 *) a - (u64 *) trans->journal_entries); -+ -+ EBUG_ON(bversion_zero(a->k.bversion)); -+ -+ return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) -+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) -+ : 0; -+} -+ -+static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, -+ struct bkey_i_accounting *a_i, -+ unsigned commit_flags) -+{ -+ if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { -+ struct bkey_s_accounting a = accounting_i_to_s(a_i); -+ -+ bch2_accounting_neg(a); -+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); -+ bch2_accounting_neg(a); -+ } - } - - int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); - int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); --void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); - - int bch2_gc_accounting_start(struct bch_fs *); - int bch2_gc_accounting_done(struct bch_fs *); -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -index 749dcf368841..b211e90ac54e 100644 ---- a/fs/bcachefs/ec.c -+++ b/fs/bcachefs/ec.c -@@ -26,6 +26,7 @@ - #include "util.h" - - #include -+#include - - #ifdef __KERNEL__ - -@@ -109,7 +110,7 @@ struct ec_bio { - /* Stripes btree keys: */ - - int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - int ret = 0; -@@ -129,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - "invalid csum granularity (%u >= 64)", - s->csum_granularity_bits); - -- ret = bch2_bkey_ptrs_validate(c, k, flags); -+ ret = bch2_bkey_ptrs_validate(c, k, from); - fsck_err: - return ret; - } -@@ -304,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, - } - - if (flags & BTREE_TRIGGER_gc) { -- percpu_down_read(&c->mark_lock); - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", - ptr->dev, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; -- goto err_unlock; -+ goto err; - } - - bucket_lock(g); -@@ -318,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, - ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); - alloc_to_bucket(g, new); - bucket_unlock(g); --err_unlock: -- percpu_up_read(&c->mark_lock); -+ - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -@@ -732,7 +731,7 @@ static void ec_block_endio(struct bio *bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "erasure coding %s error: %s", -- bio_data_dir(bio) ? "write" : "read", -+ str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) - clear_bit(ec_bio->idx, ec_bio->buf->valid); - -@@ -909,7 +908,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - bch2_bkey_val_to_text(&msgbuf, c, orig_k); - bch_err_ratelimited(c, - "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); -- printbuf_exit(&msgbuf);; -+ printbuf_exit(&msgbuf); - ret = -BCH_ERR_stripe_reconstruct; - goto out; - } -@@ -1275,11 +1274,11 @@ static int ec_stripe_update_extent(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, u8 gen, - struct ec_stripe_buf *s, -- struct bpos *bp_pos) -+ struct bkey_s_c_backpointer bp, -+ struct bkey_buf *last_flushed) - { - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_fs *c = trans->c; -- struct bch_backpointer bp; - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_extent_ptr *ptr_c; -@@ -1288,33 +1287,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans, - struct bkey_i *n; - int ret, dev, block; - -- ret = bch2_get_next_backpointer(trans, ca, bucket, gen, -- bp_pos, &bp, BTREE_ITER_cached); -- if (ret) -- return ret; -- if (bpos_eq(*bp_pos, SPOS_MAX)) -- return 0; -- -- if (bp.level) { -+ if (bp.v->level) { - struct printbuf buf = PRINTBUF; - struct btree_iter node_iter; - struct btree *b; - -- b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); -+ b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); - bch2_trans_iter_exit(trans, &node_iter); - - if (!b) - return 0; - - prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); -- bch2_backpointer_to_text(&buf, &bp); -+ bch2_bkey_val_to_text(&buf, c, bp.s_c); - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EIO; - } - -- k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); -+ k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); - ret = bkey_err(k); - if (ret) - return ret; -@@ -1373,7 +1365,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b - struct bch_fs *c = trans->c; - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr ptr = v->ptrs[block]; -- struct bpos bp_pos = POS_MIN; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); -@@ -1382,19 +1373,27 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b - - struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - -- while (1) { -- ret = commit_do(trans, NULL, NULL, -- BCH_TRANS_COMMIT_no_check_rw| -- BCH_TRANS_COMMIT_no_enospc, -- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); -- if (ret) -- break; -- if (bkey_eq(bp_pos, POS_MAX)) -+ struct bkey_buf last_flushed; -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ -+ ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, -+ bucket_pos_to_bp_start(ca, bucket_pos), -+ bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, -+ NULL, NULL, -+ BCH_TRANS_COMMIT_no_check_rw| -+ BCH_TRANS_COMMIT_no_enospc, ({ -+ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) - break; - -- bp_pos = bpos_nosnap_successor(bp_pos); -- } -+ if (bp_k.k->type != KEY_TYPE_backpointer) -+ continue; - -+ ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, -+ bkey_s_c_to_backpointer(bp_k), &last_flushed); -+ })); -+ -+ bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); - return ret; - } -@@ -1716,7 +1715,7 @@ static void ec_stripe_key_init(struct bch_fs *c, - set_bkey_val_u64s(&s->k, u64s); - } - --static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -+static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) - { - struct ec_stripe_new *s; - -@@ -1724,7 +1723,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) -- return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; -+ return NULL; - - mutex_init(&s->lock); - closure_init(&s->iodone, NULL); -@@ -1739,10 +1738,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) - ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, - h->blocksize, h->disk_label); -- -- h->s = s; -- h->nr_created++; -- return 0; -+ return s; - } - - static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) -@@ -1887,25 +1883,26 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, - return h; - } - --static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, -+static int new_stripe_alloc_buckets(struct btree_trans *trans, -+ struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) - { - struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; - struct open_bucket *ob; - struct open_buckets buckets; -- struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; -+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; - int ret = 0; - -- BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); -- BUG_ON(v->nr_redundant != h->s->nr_parity); -+ BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); -+ BUG_ON(v->nr_redundant != s->nr_parity); - - /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - -- for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { -+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { - /* - * Note: we don't yet repair invalid blocks (failed/removed - * devices) when reusing stripes - we still need a codepath to -@@ -1915,21 +1912,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ - if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); - -- if (i < h->s->nr_data) -+ if (i < s->nr_data) - nr_have_data++; - else - nr_have_parity++; - } - -- BUG_ON(nr_have_data > h->s->nr_data); -- BUG_ON(nr_have_parity > h->s->nr_parity); -+ BUG_ON(nr_have_data > s->nr_data); -+ BUG_ON(nr_have_parity > s->nr_parity); - - buckets.nr = 0; -- if (nr_have_parity < h->s->nr_parity) { -+ if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->parity_stripe, - &devs, -- h->s->nr_parity, -+ s->nr_parity, - &nr_have_parity, - &have_cache, 0, - BCH_DATA_parity, -@@ -1937,14 +1934,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ - cl); - - open_bucket_for_each(c, &buckets, ob, i) { -- j = find_next_zero_bit(h->s->blocks_gotten, -- h->s->nr_data + h->s->nr_parity, -- h->s->nr_data); -- BUG_ON(j >= h->s->nr_data + h->s->nr_parity); -+ j = find_next_zero_bit(s->blocks_gotten, -+ s->nr_data + s->nr_parity, -+ s->nr_data); -+ BUG_ON(j >= s->nr_data + s->nr_parity); - -- h->s->blocks[j] = buckets.v[i]; -+ s->blocks[j] = buckets.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); -- __set_bit(j, h->s->blocks_gotten); -+ __set_bit(j, s->blocks_gotten); - } - - if (ret) -@@ -1952,11 +1949,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ - } - - buckets.nr = 0; -- if (nr_have_data < h->s->nr_data) { -+ if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->block_stripe, - &devs, -- h->s->nr_data, -+ s->nr_data, - &nr_have_data, - &have_cache, 0, - BCH_DATA_user, -@@ -1964,13 +1961,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ - cl); - - open_bucket_for_each(c, &buckets, ob, i) { -- j = find_next_zero_bit(h->s->blocks_gotten, -- h->s->nr_data, 0); -- BUG_ON(j >= h->s->nr_data); -+ j = find_next_zero_bit(s->blocks_gotten, -+ s->nr_data, 0); -+ BUG_ON(j >= s->nr_data); - -- h->s->blocks[j] = buckets.v[i]; -+ s->blocks[j] = buckets.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); -- __set_bit(j, h->s->blocks_gotten); -+ __set_bit(j, s->blocks_gotten); - } - - if (ret) -@@ -2016,73 +2013,78 @@ static s64 get_existing_stripe(struct bch_fs *c, - return ret; - } - --static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) -+static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) - { -- struct bch_fs *c = trans->c; -- struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; -- struct bch_stripe *existing_v; -+ struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; -+ struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; - unsigned i; -- s64 idx; -- int ret; -- -- /* -- * If we can't allocate a new stripe, and there's no stripes with empty -- * blocks for us to reuse, that means we have to wait on copygc: -- */ -- idx = get_existing_stripe(c, h); -- if (idx < 0) -- return -BCH_ERR_stripe_alloc_blocked; -- -- ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); -- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, -- "reading stripe key: %s", bch2_err_str(ret)); -- if (ret) { -- bch2_stripe_close(c, h->s); -- return ret; -- } - -- existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; -- -- BUG_ON(existing_v->nr_redundant != h->s->nr_parity); -- h->s->nr_data = existing_v->nr_blocks - -+ BUG_ON(existing_v->nr_redundant != s->nr_parity); -+ s->nr_data = existing_v->nr_blocks - - existing_v->nr_redundant; - -- ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); -+ int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); - if (ret) { -- bch2_stripe_close(c, h->s); -+ bch2_stripe_close(c, s); - return ret; - } - -- BUG_ON(h->s->existing_stripe.size != h->blocksize); -- BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); -+ BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); - - /* - * Free buckets we initially allocated - they might conflict with - * blocks from the stripe we're reusing: - */ -- for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { -- bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); -- h->s->blocks[i] = 0; -+ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { -+ bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); -+ s->blocks[i] = 0; - } -- memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); -- memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); -+ memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); -+ memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); - -- for (i = 0; i < existing_v->nr_blocks; i++) { -+ for (unsigned i = 0; i < existing_v->nr_blocks; i++) { - if (stripe_blockcount_get(existing_v, i)) { -- __set_bit(i, h->s->blocks_gotten); -- __set_bit(i, h->s->blocks_allocated); -+ __set_bit(i, s->blocks_gotten); -+ __set_bit(i, s->blocks_allocated); - } - -- ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); -+ ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); - } - -- bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); -- h->s->have_existing_stripe = true; -+ bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); -+ s->have_existing_stripe = true; - - return 0; - } - --static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) -+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, -+ struct ec_stripe_new *s) -+{ -+ struct bch_fs *c = trans->c; -+ s64 idx; -+ int ret; -+ -+ /* -+ * If we can't allocate a new stripe, and there's no stripes with empty -+ * blocks for us to reuse, that means we have to wait on copygc: -+ */ -+ idx = get_existing_stripe(c, h); -+ if (idx < 0) -+ return -BCH_ERR_stripe_alloc_blocked; -+ -+ ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); -+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, -+ "reading stripe key: %s", bch2_err_str(ret)); -+ if (ret) { -+ bch2_stripe_close(c, s); -+ return ret; -+ } -+ -+ return init_new_stripe_from_existing(c, s); -+} -+ -+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, -+ struct ec_stripe_new *s) - { - struct bch_fs *c = trans->c; - struct btree_iter iter; -@@ -2091,15 +2093,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st - struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); - int ret; - -- if (!h->s->res.sectors) { -- ret = bch2_disk_reservation_get(c, &h->s->res, -+ if (!s->res.sectors) { -+ ret = bch2_disk_reservation_get(c, &s->res, - h->blocksize, -- h->s->nr_parity, -+ s->nr_parity, - BCH_DISK_RESERVATION_NOFAIL); - if (ret) - return ret; - } - -+ /* -+ * Allocate stripe slot -+ * XXX: we're going to need a bitrange btree of free stripes -+ */ - for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { - if (bkey_gt(k.k->p, POS(0, U32_MAX))) { -@@ -2114,7 +2120,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st - } - - if (bkey_deleted(k.k) && -- bch2_try_open_stripe(c, h->s, k.k->p.offset)) -+ bch2_try_open_stripe(c, s, k.k->p.offset)) - break; - } - -@@ -2125,16 +2131,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st - - ret = ec_stripe_mem_alloc(trans, &iter); - if (ret) { -- bch2_stripe_close(c, h->s); -+ bch2_stripe_close(c, s); - goto err; - } - -- h->s->new_stripe.key.k.p = iter.pos; -+ s->new_stripe.key.k.p = iter.pos; - out: - bch2_trans_iter_exit(trans, &iter); - return ret; - err: -- bch2_disk_reservation_put(c, &h->s->res); -+ bch2_disk_reservation_put(c, &s->res); - goto out; - } - -@@ -2165,22 +2171,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - return h; - - if (!h->s) { -- ret = ec_new_stripe_alloc(c, h); -- if (ret) { -+ h->s = ec_new_stripe_alloc(c, h); -+ if (!h->s) { -+ ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; - bch_err(c, "failed to allocate new stripe"); - goto err; - } -+ -+ h->nr_created++; - } - -- if (h->s->allocated) -+ struct ec_stripe_new *s = h->s; -+ -+ if (s->allocated) - goto allocated; - -- if (h->s->have_existing_stripe) -+ if (s->have_existing_stripe) - goto alloc_existing; - - /* First, try to allocate a full stripe: */ -- ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: -- __bch2_ec_stripe_head_reserve(trans, h); -+ ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: -+ __bch2_ec_stripe_head_reserve(trans, h, s); - if (!ret) - goto allocate_buf; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -@@ -2192,15 +2203,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - * existing stripe: - */ - while (1) { -- ret = __bch2_ec_stripe_head_reuse(trans, h); -+ ret = __bch2_ec_stripe_head_reuse(trans, h, s); - if (!ret) - break; - if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) - goto err; - - if (watermark == BCH_WATERMARK_copygc) { -- ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: -- __bch2_ec_stripe_head_reserve(trans, h); -+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: -+ __bch2_ec_stripe_head_reserve(trans, h, s); - if (ret) - goto err; - goto allocate_buf; -@@ -2218,19 +2229,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - * Retry allocating buckets, with the watermark for this - * particular write: - */ -- ret = new_stripe_alloc_buckets(trans, h, watermark, cl); -+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); - if (ret) - goto err; - - allocate_buf: -- ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); -+ ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); - if (ret) - goto err; - -- h->s->allocated = true; -+ s->allocated = true; - allocated: -- BUG_ON(!h->s->idx); -- BUG_ON(!h->s->new_stripe.data[0]); -+ BUG_ON(!s->idx); -+ BUG_ON(!s->new_stripe.data[0]); - BUG_ON(trans->restarted); - return h; - err: -@@ -2295,7 +2306,7 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ - int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) - { - return bch2_trans_run(c, -- for_each_btree_key_upto_commit(trans, iter, -+ for_each_btree_key_max_commit(trans, iter, - BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ -@@ -2458,11 +2469,9 @@ void bch2_fs_ec_exit(struct bch_fs *c) - - while (1) { - mutex_lock(&c->ec_stripe_head_lock); -- h = list_first_entry_or_null(&c->ec_stripe_head_list, -- struct ec_stripe_head, list); -- if (h) -- list_del(&h->list); -+ h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); - mutex_unlock(&c->ec_stripe_head_lock); -+ - if (!h) - break; - -diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h -index 43326370b410..583ca6a226da 100644 ---- a/fs/bcachefs/ec.h -+++ b/fs/bcachefs/ec.h -@@ -6,9 +6,8 @@ - #include "buckets_types.h" - #include "extents_types.h" - --enum bch_validate_flags; -- --int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, -diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h -index 64ef52e00078..b9770f24f213 100644 ---- a/fs/bcachefs/ec_format.h -+++ b/fs/bcachefs/ec_format.h -@@ -20,6 +20,23 @@ struct bch_stripe { - */ - __u8 disk_label; - -+ /* -+ * Variable length sections: -+ * - Pointers -+ * - Checksums -+ * 2D array of [stripe block/device][csum block], with checksum block -+ * size given by csum_granularity_bits -+ * - Block sector counts: per-block array of u16s -+ * -+ * XXX: -+ * Either checksums should have come last, or we should have included a -+ * checksum_size field (the size in bytes of the checksum itself, not -+ * the blocksize the checksum covers). -+ * -+ * Currently we aren't able to access the block sector counts if the -+ * checksum type is unknown. -+ */ -+ - struct bch_extent_ptr ptrs[]; - } __packed __aligned(8); - -diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h -index 9c4fe5cdbfb7..4590cd0c7c90 100644 ---- a/fs/bcachefs/errcode.h -+++ b/fs/bcachefs/errcode.h -@@ -54,7 +54,8 @@ - x(ENOMEM, ENOMEM_compression_bounce_read_init) \ - x(ENOMEM, ENOMEM_compression_bounce_write_init) \ - x(ENOMEM, ENOMEM_compression_workspace_init) \ -- x(ENOMEM, ENOMEM_decompression_workspace_init) \ -+ x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ -+ x(EIO, compression_workspace_not_initialized) \ - x(ENOMEM, ENOMEM_bucket_gens) \ - x(ENOMEM, ENOMEM_buckets_nouse) \ - x(ENOMEM, ENOMEM_usage_init) \ -@@ -116,6 +117,8 @@ - x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ - x(ENOENT, ENOENT_dev_not_found) \ - x(ENOENT, ENOENT_dev_idx_not_found) \ -+ x(ENOENT, ENOENT_inode_no_backpointer) \ -+ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ - x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ - x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ - x(EEXIST, EEXIST_str_hash_set) \ -@@ -148,6 +151,7 @@ - x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ - x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ - x(BCH_ERR_transaction_restart, transaction_restart_nested) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_commit) \ - x(0, no_btree_node) \ - x(BCH_ERR_no_btree_node, no_btree_node_relock) \ - x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ -@@ -164,7 +168,6 @@ - x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(0, backpointer_to_overwritten_btree_node) \ -- x(0, lock_fail_root_changed) \ - x(0, journal_reclaim_would_deadlock) \ - x(EINVAL, fsck) \ - x(BCH_ERR_fsck, fsck_fix) \ -@@ -173,7 +176,9 @@ - x(BCH_ERR_fsck, fsck_errors_not_fixed) \ - x(BCH_ERR_fsck, fsck_repair_unimplemented) \ - x(BCH_ERR_fsck, fsck_repair_impossible) \ -- x(0, restart_recovery) \ -+ x(EINVAL, restart_recovery) \ -+ x(EINVAL, not_in_recovery) \ -+ x(EINVAL, cannot_rewind_recovery) \ - x(0, data_update_done) \ - x(EINVAL, device_state_not_allowed) \ - x(EINVAL, member_info_missing) \ -@@ -192,7 +197,9 @@ - x(EINVAL, opt_parse_error) \ - x(EINVAL, remove_with_metadata_missing_unimplemented)\ - x(EINVAL, remove_would_lose_data) \ -- x(EINVAL, btree_iter_with_journal_not_supported) \ -+ x(EINVAL, no_resize_with_buckets_nouse) \ -+ x(EINVAL, inode_unpack_error) \ -+ x(EINVAL, varint_decode_error) \ - x(EROFS, erofs_trans_commit) \ - x(EROFS, erofs_no_writes) \ - x(EROFS, erofs_journal_err) \ -@@ -241,7 +248,10 @@ - x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ - x(BCH_ERR_invalid, invalid_bkey) \ - x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ -+ x(EIO, journal_shutdown) \ -+ x(EIO, journal_flush_err) \ - x(EIO, btree_node_read_err) \ -+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ - x(EIO, sb_not_downgraded) \ - x(EIO, btree_node_write_all_failed) \ - x(EIO, btree_node_read_error) \ -@@ -257,6 +267,8 @@ - x(EIO, no_device_to_read_from) \ - x(EIO, missing_indirect_extent) \ - x(EIO, invalidate_stripe_to_dev) \ -+ x(EIO, no_encryption_key) \ -+ x(EIO, insufficient_journal_devices) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ -@@ -305,6 +317,7 @@ static inline long bch2_err_class(long err) - - #define BLK_STS_REMOVED ((__force blk_status_t)128) - -+#include - const char *bch2_blk_status_to_str(blk_status_t); - - #endif /* _BCACHFES_ERRCODE_H */ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -index b679def8fb98..038da6a61f6b 100644 ---- a/fs/bcachefs/error.c -+++ b/fs/bcachefs/error.c -@@ -1,7 +1,9 @@ - // SPDX-License-Identifier: GPL-2.0 - #include "bcachefs.h" -+#include "btree_cache.h" - #include "btree_iter.h" - #include "error.h" -+#include "fs-common.h" - #include "journal.h" - #include "recovery_passes.h" - #include "super.h" -@@ -33,7 +35,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) - int bch2_topology_error(struct bch_fs *c) - { - set_bit(BCH_FS_topology_error, &c->flags); -- if (!test_bit(BCH_FS_fsck_running, &c->flags)) { -+ if (!test_bit(BCH_FS_recovery_running, &c->flags)) { - bch2_inconsistent_error(c); - return -BCH_ERR_btree_need_topology_repair; - } else { -@@ -218,6 +220,30 @@ static const u8 fsck_flags_extra[] = { - #undef x - }; - -+static int do_fsck_ask_yn(struct bch_fs *c, -+ struct btree_trans *trans, -+ struct printbuf *question, -+ const char *action) -+{ -+ prt_str(question, ", "); -+ prt_str(question, action); -+ -+ if (bch2_fs_stdio_redirect(c)) -+ bch2_print(c, "%s", question->buf); -+ else -+ bch2_print_string_as_lines(KERN_ERR, question->buf); -+ -+ int ask = bch2_fsck_ask_yn(c, trans); -+ -+ if (trans) { -+ int ret = bch2_trans_relock(trans); -+ if (ret) -+ return ret; -+ } -+ -+ return ask; -+} -+ - int __bch2_fsck_err(struct bch_fs *c, - struct btree_trans *trans, - enum bch_fsck_flags flags, -@@ -226,7 +252,7 @@ int __bch2_fsck_err(struct bch_fs *c, - { - struct fsck_err_state *s = NULL; - va_list args; -- bool print = true, suppressing = false, inconsistent = false; -+ bool print = true, suppressing = false, inconsistent = false, exiting = false; - struct printbuf buf = PRINTBUF, *out = &buf; - int ret = -BCH_ERR_fsck_ignore; - const char *action_orig = "fix?", *action = action_orig; -@@ -256,9 +282,10 @@ int __bch2_fsck_err(struct bch_fs *c, - !trans && - bch2_current_has_btree_trans(c)); - -- if ((flags & FSCK_CAN_FIX) && -- test_bit(err, c->sb.errors_silent)) -- return -BCH_ERR_fsck_fix; -+ if (test_bit(err, c->sb.errors_silent)) -+ return flags & FSCK_CAN_FIX -+ ? -BCH_ERR_fsck_fix -+ : -BCH_ERR_fsck_ignore; - - bch2_sb_error_count(c, err); - -@@ -289,16 +316,14 @@ int __bch2_fsck_err(struct bch_fs *c, - */ - if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { - ret = s->ret; -- mutex_unlock(&c->fsck_error_msgs_lock); -- goto err; -+ goto err_unlock; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(buf.buf, GFP_KERNEL); - if (!s->last_msg) { -- mutex_unlock(&c->fsck_error_msgs_lock); - ret = -ENOMEM; -- goto err; -+ goto err_unlock; - } - - if (c->opts.ratelimit_errors && -@@ -318,13 +343,19 @@ int __bch2_fsck_err(struct bch_fs *c, - prt_printf(out, bch2_log_msg(c, "")); - #endif - -- if ((flags & FSCK_CAN_FIX) && -- (flags & FSCK_AUTOFIX) && -+ if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) { - prt_str(out, ", "); -- prt_actioning(out, action); -- ret = -BCH_ERR_fsck_fix; -+ if (flags & FSCK_CAN_FIX) { -+ prt_actioning(out, action); -+ ret = -BCH_ERR_fsck_fix; -+ } else { -+ prt_str(out, ", continuing"); -+ ret = -BCH_ERR_fsck_ignore; -+ } -+ -+ goto print; - } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { - if (c->opts.errors != BCH_ON_ERROR_continue || - !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { -@@ -348,31 +379,18 @@ int __bch2_fsck_err(struct bch_fs *c, - : c->opts.fix_errors; - - if (fix == FSCK_FIX_ask) { -- prt_str(out, ", "); -- prt_str(out, action); -- -- if (bch2_fs_stdio_redirect(c)) -- bch2_print(c, "%s", out->buf); -- else -- bch2_print_string_as_lines(KERN_ERR, out->buf); - print = false; - -- int ask = bch2_fsck_ask_yn(c, trans); -- -- if (trans) { -- ret = bch2_trans_relock(trans); -- if (ret) { -- mutex_unlock(&c->fsck_error_msgs_lock); -- goto err; -- } -- } -+ ret = do_fsck_ask_yn(c, trans, out, action); -+ if (ret < 0) -+ goto err_unlock; - -- if (ask >= YN_ALLNO && s) -- s->fix = ask == YN_ALLNO -+ if (ret >= YN_ALLNO && s) -+ s->fix = ret == YN_ALLNO - ? FSCK_FIX_no - : FSCK_FIX_yes; - -- ret = ask & 1 -+ ret = ret & 1 - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; - } else if (fix == FSCK_FIX_yes || -@@ -385,9 +403,7 @@ int __bch2_fsck_err(struct bch_fs *c, - prt_str(out, ", not "); - prt_actioning(out, action); - } -- } else if (flags & FSCK_NEED_FSCK) { -- prt_str(out, " (run fsck to correct)"); -- } else { -+ } else if (!(flags & FSCK_CAN_IGNORE)) { - prt_str(out, " (repair unimplemented)"); - } - -@@ -396,14 +412,13 @@ int __bch2_fsck_err(struct bch_fs *c, - !(flags & FSCK_CAN_IGNORE))) - ret = -BCH_ERR_fsck_errors_not_fixed; - -- bool exiting = -- test_bit(BCH_FS_fsck_running, &c->flags) && -- (ret != -BCH_ERR_fsck_fix && -- ret != -BCH_ERR_fsck_ignore); -- -- if (exiting) -+ if (test_bit(BCH_FS_fsck_running, &c->flags) && -+ (ret != -BCH_ERR_fsck_fix && -+ ret != -BCH_ERR_fsck_ignore)) { -+ exiting = true; - print = true; -- -+ } -+print: - if (print) { - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s\n", out->buf); -@@ -419,17 +434,24 @@ int __bch2_fsck_err(struct bch_fs *c, - if (s) - s->ret = ret; - -- mutex_unlock(&c->fsck_error_msgs_lock); -- - if (inconsistent) - bch2_inconsistent_error(c); - -- if (ret == -BCH_ERR_fsck_fix) { -- set_bit(BCH_FS_errors_fixed, &c->flags); -- } else { -- set_bit(BCH_FS_errors_not_fixed, &c->flags); -- set_bit(BCH_FS_error, &c->flags); -+ /* -+ * We don't yet track whether the filesystem currently has errors, for -+ * log_fsck_err()s: that would require us to track for every error type -+ * which recovery pass corrects it, to get the fsck exit status correct: -+ */ -+ if (flags & FSCK_CAN_FIX) { -+ if (ret == -BCH_ERR_fsck_fix) { -+ set_bit(BCH_FS_errors_fixed, &c->flags); -+ } else { -+ set_bit(BCH_FS_errors_not_fixed, &c->flags); -+ set_bit(BCH_FS_error, &c->flags); -+ } - } -+err_unlock: -+ mutex_unlock(&c->fsck_error_msgs_lock); - err: - if (action != action_orig) - kfree(action); -@@ -437,28 +459,52 @@ int __bch2_fsck_err(struct bch_fs *c, - return ret; - } - -+static const char * const bch2_bkey_validate_contexts[] = { -+#define x(n) #n, -+ BKEY_VALIDATE_CONTEXTS() -+#undef x -+ NULL -+}; -+ - int __bch2_bkey_fsck_err(struct bch_fs *c, - struct bkey_s_c k, -- enum bch_validate_flags validate_flags, -+ struct bkey_validate_context from, - enum bch_sb_error_id err, - const char *fmt, ...) - { -- if (validate_flags & BCH_VALIDATE_silent) -+ if (from.flags & BCH_VALIDATE_silent) - return -BCH_ERR_fsck_delete_bkey; - - unsigned fsck_flags = 0; -- if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) -+ if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { -+ if (test_bit(err, c->sb.errors_silent)) -+ return -BCH_ERR_fsck_delete_bkey; -+ - fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; -+ } -+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) -+ fsck_flags |= fsck_flags_extra[err]; - - struct printbuf buf = PRINTBUF; -- va_list args; -+ prt_printf(&buf, "invalid bkey in %s", -+ bch2_bkey_validate_contexts[from.from]); -+ -+ if (from.from == BKEY_VALIDATE_journal) -+ prt_printf(&buf, " journal seq=%llu offset=%u", -+ from.journal_seq, from.journal_offset); -+ -+ prt_str(&buf, " btree="); -+ bch2_btree_id_to_text(&buf, from.btree); -+ prt_printf(&buf, " level=%u: ", from.level); - -- prt_str(&buf, "invalid bkey "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\n "); -+ -+ va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); -+ - prt_str(&buf, ": delete?"); - - int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); -@@ -483,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c) - - mutex_unlock(&c->fsck_error_msgs_lock); - } -+ -+int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) -+{ -+ u32 restart_count = trans->restart_count; -+ int ret = 0; -+ -+ /* XXX: we don't yet attempt to print paths when we don't know the subvol */ -+ if (inum.subvol) -+ ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); -+ if (!inum.subvol || ret) -+ prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); -+ -+ return trans_was_restarted(trans, restart_count); -+} -+ -+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -+ subvol_inum inum, u64 offset) -+{ -+ int ret = bch2_inum_err_msg_trans(trans, out, inum); -+ prt_printf(out, " offset %llu: ", offset); -+ return ret; -+} -+ -+void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) -+{ -+ bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); -+} -+ -+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, -+ subvol_inum inum, u64 offset) -+{ -+ bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); -+} -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -index 6551ada926b6..7acf2a27ca28 100644 ---- a/fs/bcachefs/error.h -+++ b/fs/bcachefs/error.h -@@ -45,32 +45,11 @@ int bch2_topology_error(struct bch_fs *); - bch2_inconsistent_error(c); \ - }) - --#define bch2_fs_inconsistent_on(cond, c, ...) \ -+#define bch2_fs_inconsistent_on(cond, ...) \ - ({ \ - bool _ret = unlikely(!!(cond)); \ -- \ -- if (_ret) \ -- bch2_fs_inconsistent(c, __VA_ARGS__); \ -- _ret; \ --}) -- --/* -- * Later we might want to mark only the particular device inconsistent, not the -- * entire filesystem: -- */ -- --#define bch2_dev_inconsistent(ca, ...) \ --do { \ -- bch_err(ca, __VA_ARGS__); \ -- bch2_inconsistent_error((ca)->fs); \ --} while (0) -- --#define bch2_dev_inconsistent_on(cond, ca, ...) \ --({ \ -- bool _ret = unlikely(!!(cond)); \ -- \ - if (_ret) \ -- bch2_dev_inconsistent(ca, __VA_ARGS__); \ -+ bch2_fs_inconsistent(__VA_ARGS__); \ - _ret; \ - }) - -@@ -123,9 +102,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, - - void bch2_flush_fsck_errs(struct bch_fs *); - --#define __fsck_err(c, _flags, _err_type, ...) \ -+#define fsck_err_wrap(_do) \ - ({ \ -- int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ -+ int _ret = _do; \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) { \ - ret = _ret; \ -@@ -135,6 +114,8 @@ void bch2_flush_fsck_errs(struct bch_fs *); - _ret == -BCH_ERR_fsck_fix; \ - }) - -+#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) -+ - /* These macros return true if error should be fixed: */ - - /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -@@ -149,12 +130,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ - }) - --#define need_fsck_err_on(cond, c, _err_type, ...) \ -- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) -- --#define need_fsck_err(c, _err_type, ...) \ -- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) -- - #define mustfix_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -@@ -167,11 +142,22 @@ void bch2_flush_fsck_errs(struct bch_fs *); - #define fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -+#define log_fsck_err(c, _err_type, ...) \ -+ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -+ -+#define log_fsck_err_on(cond, ...) \ -+({ \ -+ bool _ret = unlikely(!!(cond)); \ -+ if (_ret) \ -+ log_fsck_err(__VA_ARGS__); \ -+ _ret; \ -+}) -+ - enum bch_validate_flags; - __printf(5, 6) - int __bch2_bkey_fsck_err(struct bch_fs *, - struct bkey_s_c, -- enum bch_validate_flags, -+ struct bkey_validate_context from, - enum bch_sb_error_id, - const char *, ...); - -@@ -181,7 +167,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *, - */ - #define bkey_fsck_err(c, _err_type, _err_msg, ...) \ - do { \ -- int _ret = __bch2_bkey_fsck_err(c, k, flags, \ -+ int _ret = __bch2_bkey_fsck_err(c, k, from, \ - BCH_FSCK_ERR_##_err_type, \ - _err_msg, ##__VA_ARGS__); \ - if (_ret != -BCH_ERR_fsck_fix && \ -@@ -252,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); - _ret; \ - }) - -+int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); -+int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); -+ -+void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); -+void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); -+ - #endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c -index 5f4fecb358da..6aac579a692a 100644 ---- a/fs/bcachefs/extent_update.c -+++ b/fs/bcachefs/extent_update.c -@@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans, - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -- u64 idx = le64_to_cpu(p.v->idx); -+ u64 idx = REFLINK_P_IDX(p.v); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter iter; -@@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, - - bch2_trans_copy_iter(©, iter); - -- for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { -+ for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { - unsigned offset = 0; - - if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -index 37e3d69bec06..2d8042f853dc 100644 ---- a/fs/bcachefs/extents.c -+++ b/fs/bcachefs/extents.c -@@ -21,6 +21,7 @@ - #include "extents.h" - #include "inode.h" - #include "journal.h" -+#include "rebalance.h" - #include "replicas.h" - #include "super.h" - #include "super-io.h" -@@ -88,9 +89,17 @@ static inline bool ptr_better(struct bch_fs *c, - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); - -+ /* -+ * Square the latencies, to bias more in favor of the faster -+ * device - we never want to stop issuing reads to the slower -+ * device altogether, so that we can update our latency numbers: -+ */ -+ l1 *= l1; -+ l2 *= l2; -+ - /* Pick at random, biased in favor of the faster device: */ - -- return bch2_rand_range(l1 + l2) > l1; -+ return bch2_get_random_u64_below(l1 + l2) > l1; - } - - if (bch2_force_reconstruct_read) -@@ -169,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - /* KEY_TYPE_btree_ptr: */ - - int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -177,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - c, btree_ptr_val_too_big, - "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - -- ret = bch2_bkey_ptrs_validate(c, k, flags); -+ ret = bch2_bkey_ptrs_validate(c, k, from); - fsck_err: - return ret; - } -@@ -189,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - } - - int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - int ret = 0; -@@ -203,12 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - c, btree_ptr_v2_min_key_bad, - "min_key > key"); - -- if (flags & BCH_VALIDATE_write) -+ if ((from.flags & BCH_VALIDATE_write) && -+ c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) - bkey_fsck_err_on(!bp.v->sectors_written, - c, btree_ptr_v2_written_0, - "sectors_written == 0"); - -- ret = bch2_bkey_ptrs_validate(c, k, flags); -+ ret = bch2_bkey_ptrs_validate(c, k, from); - fsck_err: - return ret; - } -@@ -395,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) - /* KEY_TYPE_reservation: */ - - int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - int ret = 0; -@@ -1120,6 +1130,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr - bch2_prt_compression_type(out, crc->compression_type); - } - -+static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct bch_extent_rebalance *r) -+{ -+ prt_str(out, "rebalance:"); -+ -+ prt_printf(out, " replicas=%u", r->data_replicas); -+ if (r->data_replicas_from_inode) -+ prt_str(out, " (inode)"); -+ -+ prt_str(out, " checksum="); -+ bch2_prt_csum_opt(out, r->data_checksum); -+ if (r->data_checksum_from_inode) -+ prt_str(out, " (inode)"); -+ -+ if (r->background_compression || r->background_compression_from_inode) { -+ prt_str(out, " background_compression="); -+ bch2_compression_opt_to_text(out, r->background_compression); -+ -+ if (r->background_compression_from_inode) -+ prt_str(out, " (inode)"); -+ } -+ -+ if (r->background_target || r->background_target_from_inode) { -+ prt_str(out, " background_target="); -+ if (c) -+ bch2_target_to_text(out, c, r->background_target); -+ else -+ prt_printf(out, "%u", r->background_target); -+ -+ if (r->background_target_from_inode) -+ prt_str(out, " (inode)"); -+ } -+ -+ if (r->promote_target || r->promote_target_from_inode) { -+ prt_str(out, " promote_target="); -+ if (c) -+ bch2_target_to_text(out, c, r->promote_target); -+ else -+ prt_printf(out, "%u", r->promote_target); -+ -+ if (r->promote_target_from_inode) -+ prt_str(out, " (inode)"); -+ } -+ -+ if (r->erasure_code || r->erasure_code_from_inode) { -+ prt_printf(out, " ec=%u", r->erasure_code); -+ if (r->erasure_code_from_inode) -+ prt_str(out, " (inode)"); -+ } -+} -+ - void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) - { -@@ -1155,18 +1216,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - (u64) ec->idx, ec->block); - break; - } -- case BCH_EXTENT_ENTRY_rebalance: { -- const struct bch_extent_rebalance *r = &entry->rebalance; -- -- prt_str(out, "rebalance: target "); -- if (c) -- bch2_target_to_text(out, c, r->target); -- else -- prt_printf(out, "%u", r->target); -- prt_str(out, " compression "); -- bch2_compression_opt_to_text(out, r->compression); -+ case BCH_EXTENT_ENTRY_rebalance: -+ bch2_extent_rebalance_to_text(out, c, &entry->rebalance); - break; -- } -+ - default: - prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; -@@ -1178,13 +1231,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - - static int extent_ptr_validate(struct bch_fs *c, - struct bkey_s_c k, -- enum bch_validate_flags flags, -+ struct bkey_validate_context from, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) - { - int ret = 0; - -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr(ptrs, ptr2) -+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, -+ c, ptr_to_duplicate_device, -+ "multiple pointers to same device (%u)", ptr->dev); -+ - /* bad pointers are repaired by check_fix_ptrs(): */ - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); -@@ -1199,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c, - unsigned bucket_size = ca->mi.bucket_size; - rcu_read_unlock(); - -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- bkey_for_each_ptr(ptrs, ptr2) -- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, -- c, ptr_to_duplicate_device, -- "multiple pointers to same device (%u)", ptr->dev); -- -- - bkey_fsck_err_on(bucket >= nbuckets, - c, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, nbuckets); -@@ -1221,7 +1273,7 @@ static int extent_ptr_validate(struct bch_fs *c, - } - - int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; -@@ -1248,7 +1300,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: -- ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); -+ ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); - if (ret) - return ret; - -@@ -1270,9 +1322,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - -- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, -- c, ptr_crc_uncompressed_size_too_small, -- "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), - c, ptr_crc_csum_type_unknown, - "invalid checksum type"); -@@ -1280,6 +1329,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - c, ptr_crc_compression_type_unknown, - "invalid compression type"); - -+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, -+ c, ptr_crc_uncompressed_size_too_small, -+ "checksum offset + key size > uncompressed size"); -+ bkey_fsck_err_on(crc_is_encoded(crc) && -+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && -+ (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), -+ c, ptr_crc_uncompressed_size_too_big, -+ "too large encoded extent"); -+ bkey_fsck_err_on(!crc_is_compressed(crc) && -+ crc.compressed_size != crc.uncompressed_size, -+ c, ptr_crc_uncompressed_size_mismatch, -+ "not compressed but compressed != uncompressed size"); -+ - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; -@@ -1293,12 +1355,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - "redundant crc entry"); - crc_since_last_ptr = true; - -- bkey_fsck_err_on(crc_is_encoded(crc) && -- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && -- (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), -- c, ptr_crc_uncompressed_size_too_big, -- "too large encoded extent"); -- - size_ondisk = crc.compressed_size; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: -@@ -1391,166 +1447,6 @@ void bch2_ptr_swab(struct bkey_s k) - } - } - --const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) --{ -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- const union bch_extent_entry *entry; -- -- bkey_extent_entry_for_each(ptrs, entry) -- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) -- return &entry->rebalance; -- -- return NULL; --} -- --unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, -- unsigned target, unsigned compression) --{ -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- unsigned rewrite_ptrs = 0; -- -- if (compression) { -- unsigned compression_type = bch2_compression_opt_to_type(compression); -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- unsigned i = 0; -- -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || -- p.ptr.unwritten) { -- rewrite_ptrs = 0; -- goto incompressible; -- } -- -- if (!p.ptr.cached && p.crc.compression_type != compression_type) -- rewrite_ptrs |= 1U << i; -- i++; -- } -- } --incompressible: -- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { -- unsigned i = 0; -- -- bkey_for_each_ptr(ptrs, ptr) { -- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) -- rewrite_ptrs |= 1U << i; -- i++; -- } -- } -- -- return rewrite_ptrs; --} -- --bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) --{ -- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); -- -- /* -- * If it's an indirect extent, we don't delete the rebalance entry when -- * done so that we know what options were applied - check if it still -- * needs work done: -- */ -- if (r && -- k.k->type == KEY_TYPE_reflink_v && -- !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) -- r = NULL; -- -- return r != NULL; --} -- --static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, -- unsigned target, unsigned compression) --{ -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- u64 sectors = 0; -- -- if (compression) { -- unsigned compression_type = bch2_compression_opt_to_type(compression); -- -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || -- p.ptr.unwritten) { -- sectors = 0; -- goto incompressible; -- } -- -- if (!p.ptr.cached && p.crc.compression_type != compression_type) -- sectors += p.crc.compressed_size; -- } -- } --incompressible: -- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) -- sectors += p.crc.compressed_size; -- } -- -- return sectors; --} -- --u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) --{ -- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); -- -- return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; --} -- --int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, -- struct bch_io_opts *opts) --{ -- struct bkey_s k = bkey_i_to_s(_k); -- struct bch_extent_rebalance *r; -- unsigned target = opts->background_target; -- unsigned compression = background_compression(*opts); -- bool needs_rebalance; -- -- if (!bkey_extent_is_direct_data(k.k)) -- return 0; -- -- /* get existing rebalance entry: */ -- r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); -- if (r) { -- if (k.k->type == KEY_TYPE_reflink_v) { -- /* -- * indirect extents: existing options take precedence, -- * so that we don't move extents back and forth if -- * they're referenced by different inodes with different -- * options: -- */ -- if (r->target) -- target = r->target; -- if (r->compression) -- compression = r->compression; -- } -- -- r->target = target; -- r->compression = compression; -- } -- -- needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); -- -- if (needs_rebalance && !r) { -- union bch_extent_entry *new = bkey_val_end(k); -- -- new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; -- new->rebalance.compression = compression; -- new->rebalance.target = target; -- new->rebalance.unused = 0; -- k.k->u64s += extent_entry_u64s(new); -- } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { -- /* -- * For indirect extents, don't delete the rebalance entry when -- * we're finished so that we know we specifically moved it or -- * compressed it to its current location/compression type -- */ -- extent_entry_drop(k, (union bch_extent_entry *) r); -- } -- -- return 0; --} -- - /* Generic extent code: */ - - int bch2_cut_front_s(struct bpos where, struct bkey_s k) -@@ -1610,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) - case KEY_TYPE_reflink_p: { - struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - -- le64_add_cpu(&p.v->idx, sub); -+ SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); - break; - } - case KEY_TYPE_inline_data: -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -index bcffcf60aaaf..204d765dd74c 100644 ---- a/fs/bcachefs/extents.h -+++ b/fs/bcachefs/extents.h -@@ -8,7 +8,6 @@ - - struct bch_fs; - struct btree_trans; --enum bch_validate_flags; - - /* extent entries: */ - -@@ -410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - /* KEY_TYPE_btree_ptr: */ - - int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - - int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, - int, struct bkey_s); -@@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - /* KEY_TYPE_reservation: */ - - int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -@@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct - void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - - static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, - struct bch_extent_ptr ptr2) -@@ -705,20 +704,11 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, - ptr1.unwritten == ptr2.unwritten && - ptr1.offset == ptr2.offset && - ptr1.dev == ptr2.dev && -- ptr1.dev == ptr2.dev); -+ ptr1.gen == ptr2.gen); - } - - void bch2_ptr_swab(struct bkey_s); - --const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); --unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, -- unsigned, unsigned); --bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); --u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -- --int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, -- struct bch_io_opts *); -- - /* Generic extent code: */ - - enum bch_extent_overlap { -diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h -index 3bd2fdbb0817..c198dfc376d6 100644 ---- a/fs/bcachefs/extents_format.h -+++ b/fs/bcachefs/extents_format.h -@@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr { - #endif - }; - --struct bch_extent_rebalance { --#if defined(__LITTLE_ENDIAN_BITFIELD) -- __u64 type:6, -- unused:34, -- compression:8, /* enum bch_compression_opt */ -- target:16; --#elif defined (__BIG_ENDIAN_BITFIELD) -- __u64 target:16, -- compression:8, -- unused:34, -- type:6; --#endif --}; -+/* bch_extent_rebalance: */ -+#include "rebalance_format.h" - - union bch_extent_entry { - #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 -diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c -index 7e10a9ddcfd9..2c3d46ac70c6 100644 ---- a/fs/bcachefs/fs-common.c -+++ b/fs/bcachefs/fs-common.c -@@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, - if (!snapshot_src.inum) { - /* Inode wasn't specified, just snapshot: */ - struct bch_subvolume s; -- -- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, -- BTREE_ITER_cached, &s); -+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); - if (ret) - goto err; - -@@ -172,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans, - new_inode->bi_dir_offset = dir_offset; - } - -+ if (S_ISDIR(mode) && -+ !new_inode->bi_subvol) -+ new_inode->bi_depth = dir_u->bi_depth + 1; -+ - inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(&inode_iter, snapshot); - -@@ -512,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans, - dst_dir_u->bi_nlink++; - } - -+ if (S_ISDIR(src_inode_u->bi_mode) && -+ !src_inode_u->bi_subvol) -+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ S_ISDIR(dst_inode_u->bi_mode) && -+ !dst_inode_u->bi_subvol) -+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; -+ - if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { - dst_dir_u->bi_nlink--; - src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; -@@ -548,3 +559,94 @@ int bch2_rename_trans(struct btree_trans *trans, - bch2_trans_iter_exit(trans, &src_dir_iter); - return ret; - } -+ -+static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) -+{ -+ bch2_printbuf_make_room(out, n); -+ -+ unsigned can_print = min(n, printbuf_remaining(out)); -+ -+ b += n; -+ -+ for (unsigned i = 0; i < can_print; i++) -+ out->buf[out->pos++] = *((char *) --b); -+ -+ printbuf_nul_terminate(out); -+} -+ -+static inline void prt_str_reversed(struct printbuf *out, const char *s) -+{ -+ prt_bytes_reversed(out, s, strlen(s)); -+} -+ -+static inline void reverse_bytes(void *b, size_t n) -+{ -+ char *e = b + n, *s = b; -+ -+ while (s < e) { -+ --e; -+ swap(*s, *e); -+ s++; -+ } -+} -+ -+/* XXX: we don't yet attempt to print paths when we don't know the subvol */ -+int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) -+{ -+ unsigned orig_pos = path->pos; -+ int ret = 0; -+ -+ while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && -+ inum.inum == BCACHEFS_ROOT_INO)) { -+ struct bch_inode_unpacked inode; -+ ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); -+ if (ret) -+ goto disconnected; -+ -+ if (!inode.bi_dir && !inode.bi_dir_offset) { -+ ret = -BCH_ERR_ENOENT_inode_no_backpointer; -+ goto disconnected; -+ } -+ -+ inum.subvol = inode.bi_parent_subvol ?: inum.subvol; -+ inum.inum = inode.bi_dir; -+ -+ u32 snapshot; -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ goto disconnected; -+ -+ struct btree_iter d_iter; -+ struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, -+ BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), -+ 0, dirent); -+ ret = bkey_err(d.s_c); -+ if (ret) -+ goto disconnected; -+ -+ struct qstr dirent_name = bch2_dirent_get_name(d); -+ prt_bytes_reversed(path, dirent_name.name, dirent_name.len); -+ -+ prt_char(path, '/'); -+ -+ bch2_trans_iter_exit(trans, &d_iter); -+ } -+ -+ if (orig_pos == path->pos) -+ prt_char(path, '/'); -+out: -+ ret = path->allocation_failure ? -ENOMEM : 0; -+ if (ret) -+ goto err; -+ -+ reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); -+ return 0; -+err: -+ return ret; -+disconnected: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto err; -+ -+ prt_str_reversed(path, "(disconnected)"); -+ goto out; -+} -diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h -index c934e807b380..2b59210bb5e8 100644 ---- a/fs/bcachefs/fs-common.h -+++ b/fs/bcachefs/fs-common.h -@@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *, - bool bch2_reinherit_attrs(struct bch_inode_unpacked *, - struct bch_inode_unpacked *); - -+int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -+ - #endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c -index 95972809e76d..d9a360782946 100644 ---- a/fs/bcachefs/fs-io-buffered.c -+++ b/fs/bcachefs/fs-io-buffered.c -@@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, - if (!get_more) - break; - -+ unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); -+ -+ if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) -+ break; -+ -+ unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); -+ -+ /* ensure proper alignment */ -+ order = min(order, __ffs(folio_offset|BIT(31))); -+ - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - -- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); -+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); - if (!folio) - break; - -@@ -164,7 +174,8 @@ static void bchfs_read(struct btree_trans *trans, - BTREE_ITER_slots); - while (1) { - struct bkey_s_c k; -- unsigned bytes, sectors, offset_into_extent; -+ unsigned bytes, sectors; -+ s64 offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); -@@ -197,7 +208,7 @@ static void bchfs_read(struct btree_trans *trans, - - k = bkey_i_to_s_c(sk.k); - -- sectors = min(sectors, k.k->size - offset_into_extent); -+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - if (readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, -@@ -230,10 +241,12 @@ static void bchfs_read(struct btree_trans *trans, - bch2_trans_iter_exit(trans, &iter); - - if (ret) { -- bch_err_inum_offset_ratelimited(c, -- iter.pos.inode, -- iter.pos.offset << 9, -- "read error %i from btree lookup", ret); -+ struct printbuf buf = PRINTBUF; -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); -+ prt_printf(&buf, "read error %i from btree lookup", ret); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } -@@ -248,6 +261,7 @@ void bch2_readahead(struct readahead_control *ractl) - struct bch_io_opts opts; - struct folio *folio; - struct readpages_iter readpages_iter; -+ struct blk_plug plug; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - -@@ -255,6 +269,16 @@ void bch2_readahead(struct readahead_control *ractl) - if (ret) - return; - -+ /* -+ * Besides being a general performance optimization, plugging helps with -+ * avoiding btree transaction srcu warnings - submitting a bio can -+ * block, and we don't want todo that with the transaction locked. -+ * -+ * However, plugged bios are submitted when we schedule; we ideally -+ * would have our own scheduler hook to call unlock_long() before -+ * scheduling. -+ */ -+ blk_start_plug(&plug); - bch2_pagecache_add_get(inode); - - struct btree_trans *trans = bch2_trans_get(c); -@@ -281,7 +305,7 @@ void bch2_readahead(struct readahead_control *ractl) - bch2_trans_put(trans); - - bch2_pagecache_add_put(inode); -- -+ blk_finish_plug(&plug); - darray_exit(&readpages_iter.folios); - } - -@@ -296,9 +320,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; -+ struct blk_plug plug; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - -+ BUG_ON(folio_test_uptodate(folio)); -+ BUG_ON(folio_test_dirty(folio)); -+ - if (!bch2_folio_create(folio, GFP_KERNEL)) - return -ENOMEM; - -@@ -313,7 +341,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - -+ blk_start_plug(&plug); - bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); -+ blk_finish_plug(&plug); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); -@@ -605,15 +635,6 @@ static int __bch2_writepage(struct folio *folio, - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - -- /* Check for writing past i_size: */ -- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > -- round_up(i_size, block_bytes(c)) && -- !test_bit(BCH_FS_emergency_ro, &c->flags), -- "writing past i_size: %llu > %llu (unrounded %llu)\n", -- bio_end_sector(&w->io->op.wbio.bio) << 9, -- round_up(i_size, block_bytes(c)), -- i_size); -- - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; -@@ -669,7 +690,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping)); -- if (IS_ERR_OR_NULL(folio)) -+ if (IS_ERR(folio)) - goto err_unlock; - - offset = pos - folio_pos(folio); -diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c -index 6d3a05ae5da8..2089c36b5866 100644 ---- a/fs/bcachefs/fs-io-direct.c -+++ b/fs/bcachefs/fs-io-direct.c -@@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; -+ struct blk_plug plug; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - size_t shorten; -@@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - */ - dio->should_dirty = iter_is_iovec(iter); - -+ blk_start_plug(&plug); -+ - goto start; - while (iter->count) { - bio = bio_alloc_bioset(NULL, -@@ -160,6 +163,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); - } - -+ blk_finish_plug(&plug); -+ - iter->count += shorten; - - if (sync) { -diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c -index 1d4910ea0f1d..e072900e6a5b 100644 ---- a/fs/bcachefs/fs-io-pagecache.c -+++ b/fs/bcachefs/fs-io-pagecache.c -@@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); -- if (IS_ERR_OR_NULL(f)) -+ if (IS_ERR(f)) - break; - - BUG_ON(fs->nr && folio_pos(f) != pos); -@@ -199,7 +199,7 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - unsigned folio_idx = 0; - - return bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, -+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inum.inum, offset), - POS(inum.inum, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c -index 2456c41b215e..717e7b94c66f 100644 ---- a/fs/bcachefs/fs-io.c -+++ b/fs/bcachefs/fs-io.c -@@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - - /* fsync: */ - -+static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, -+ u64 *seq) -+{ -+ struct printbuf buf = PRINTBUF; -+ struct bch_inode_unpacked u; -+ struct btree_iter iter; -+ int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); -+ if (ret) -+ return ret; -+ -+ u64 cur_seq = journal_cur_seq(&trans->c->journal); -+ *seq = min(cur_seq, u.bi_journal_seq); -+ -+ if (fsck_err_on(u.bi_journal_seq > cur_seq, -+ trans, inode_journal_seq_in_future, -+ "inode journal seq in future (currently at %llu)\n%s", -+ cur_seq, -+ (bch2_inode_unpacked_to_text(&buf, &u), -+ buf.buf))) { -+ u.bi_journal_seq = cur_seq; -+ ret = bch2_inode_write(trans, &iter, &u); -+ } -+fsck_err: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ - /* - * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an - * insert trigger: look up the btree inode instead -@@ -180,9 +208,10 @@ static int bch2_flush_inode(struct bch_fs *c, - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) - return -EROFS; - -- struct bch_inode_unpacked u; -- int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: -- bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: -+ u64 seq; -+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0, -+ bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: -+ bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: - bch2_inode_flush_nocow_writes(c, inode); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); - return ret; -@@ -222,7 +251,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, - struct bpos end) - { - return bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, -+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, - subvol, 0, k, ({ - bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); - }))); -@@ -256,7 +285,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, - - folio = __filemap_get_folio(mapping, index, - FGP_LOCK|FGP_CREAT, GFP_KERNEL); -- if (IS_ERR_OR_NULL(folio)) { -+ if (IS_ERR(folio)) { - ret = -ENOMEM; - goto out; - } -@@ -437,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, - ret = bch2_truncate_folio(inode, iattr->ia_size); - if (unlikely(ret < 0)) - goto err; -+ ret = 0; - - truncate_setsize(&inode->v, iattr->ia_size); - -@@ -806,7 +836,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, - u64 sectors = end - start; - - int ret = bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, -+ for_each_btree_key_in_subvolume_max(trans, iter, - BTREE_ID_extents, - POS(inode->v.i_ino, start), - POS(inode->v.i_ino, end - 1), -@@ -877,11 +907,18 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, - bch2_mark_pagecache_unallocated(src, pos_src >> 9, - (pos_src + aligned_len) >> 9); - -+ /* -+ * XXX: we'd like to be telling bch2_remap_range() if we have -+ * permission to write to the source file, and thus if io path option -+ * changes should be propagated through the copy, but we need mnt_idmap -+ * from the pathwalk, awkward -+ */ - ret = bch2_remap_range(c, - inode_inum(dst), pos_dst >> 9, - inode_inum(src), pos_src >> 9, - aligned_len >> 9, -- pos_dst + len, &i_sectors_delta); -+ pos_dst + len, &i_sectors_delta, -+ false); - if (ret < 0) - goto err; - -@@ -922,7 +959,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) - return -ENXIO; - - int ret = bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, -+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, 0, k, ({ -@@ -958,7 +995,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) - return -ENXIO; - - int ret = bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, -+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -index 405cf08bda34..15725b4ce393 100644 ---- a/fs/bcachefs/fs-ioctl.c -+++ b/fs/bcachefs/fs-ioctl.c -@@ -406,7 +406,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - sync_inodes_sb(c->vfs_sb); - up_read(&c->vfs_sb->s_umount); - } --retry: -+ - if (arg.src_ptr) { - error = user_path_at(arg.dirfd, - (const char __user *)(unsigned long)arg.src_ptr, -@@ -486,11 +486,6 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - err2: - if (arg.src_ptr) - path_put(&src_path); -- -- if (retry_estale(error, lookup_flags)) { -- lookup_flags |= LOOKUP_REVAL; -- goto retry; -- } - err1: - return error; - } -diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -index a41d0d8a2f7b..5d910f1c671c 100644 ---- a/fs/bcachefs/fs.c -+++ b/fs/bcachefs/fs.c -@@ -23,6 +23,7 @@ - #include "journal.h" - #include "keylist.h" - #include "quota.h" -+#include "rebalance.h" - #include "snapshot.h" - #include "super.h" - #include "xattr.h" -@@ -38,6 +39,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -65,6 +67,9 @@ void bch2_inode_update_after_write(struct btree_trans *trans, - i_gid_write(&inode->v, bi->bi_gid); - inode->v.i_mode = bi->bi_mode; - -+ if (fields & ATTR_SIZE) -+ i_size_write(&inode->v, bi->bi_size); -+ - if (fields & ATTR_ATIME) - inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); - if (fields & ATTR_MTIME) -@@ -89,10 +94,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, - retry: - bch2_trans_begin(trans); - -- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), -- BTREE_ITER_intent) ?: -- (set ? set(trans, inode, &inode_u, p) : 0) ?: -- bch2_inode_write(trans, &iter, &inode_u) ?: -+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); -+ if (ret) -+ goto err; -+ -+ struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); -+ -+ ret = (set ? set(trans, inode, &inode_u, p) : 0); -+ if (ret) -+ goto err; -+ -+ struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); -+ -+ if (memcmp(&old_r, &new_r, sizeof(new_r))) { -+ ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* -@@ -101,7 +121,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, - */ - if (!ret) - bch2_inode_update_after_write(trans, inode, &inode_u, fields); -- -+err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -160,8 +180,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) - static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) - { - const subvol_inum *inum = data; -+ siphash_key_t k = { .key[0] = seed }; - -- return jhash(&inum->inum, sizeof(inum->inum), seed); -+ return siphash_2u64(inum->subvol, inum->inum, &k); - } - - static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) -@@ -190,11 +211,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { - .automatic_shrinking = true, - }; - -+static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { -+ .head_offset = offsetof(struct bch_inode_info, by_inum_hash), -+ .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), -+ .key_len = sizeof(u64), -+ .automatic_shrinking = true, -+}; -+ - int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) - { - struct bch_fs *c = trans->c; -- struct rhashtable *ht = &c->vfs_inodes_table; -- subvol_inum inum = (subvol_inum) { .inum = p.offset }; -+ struct rhltable *ht = &c->vfs_inodes_by_inum_table; -+ u64 inum = p.offset; - DARRAY(u32) subvols; - int ret = 0; - -@@ -219,15 +247,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) - struct rhash_lock_head __rcu *const *bkt; - struct rhash_head *he; - unsigned int hash; -- struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); -+ struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); - restart: -- hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); -+ hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); - bkt = rht_bucket(tbl, hash); - do { - struct bch_inode_info *inode; - - rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { -- if (inode->ei_inum.inum == inum.inum) { -+ if (inode->ei_inum.inum == inum) { - ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, - GFP_NOWAIT|__GFP_NOWARN); - if (ret) { -@@ -248,7 +276,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) - /* Ensure we see any new tables. */ - smp_rmb(); - -- tbl = rht_dereference_rcu(tbl->future_tbl, ht); -+ tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); - if (unlikely(tbl)) - goto restart; - rcu_read_unlock(); -@@ -327,7 +355,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod - spin_unlock(&inode->v.i_lock); - - if (remove) { -- int ret = rhashtable_remove_fast(&c->vfs_inodes_table, -+ int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, -+ &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); -+ BUG_ON(ret); -+ -+ ret = rhashtable_remove_fast(&c->vfs_inodes_table, - &inode->hash, bch2_vfs_inodes_params); - BUG_ON(ret); - inode->v.i_hash.pprev = NULL; -@@ -372,6 +404,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, - discard_new_inode(&inode->v); - return old; - } else { -+ int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, -+ &inode->by_inum_hash, -+ bch2_vfs_inodes_by_inum_params); -+ BUG_ON(ret); -+ - inode_fake_hash(&inode->v); - - inode_sb_list_add(&inode->v); -@@ -465,7 +502,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - int ret = lockrestart_do(trans, -- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: -+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - bch2_trans_put(trans); -@@ -535,8 +572,7 @@ __bch2_create(struct mnt_idmap *idmap, - inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; - inum.inum = inode_u.bi_inum; - -- ret = bch2_subvolume_get(trans, inum.subvol, true, -- BTREE_ITER_with_updates, &subvol) ?: -+ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_trans_commit(trans, NULL, &journal_seq, 0); - if (unlikely(ret)) { - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, -@@ -549,7 +585,7 @@ __bch2_create(struct mnt_idmap *idmap, - - if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(trans, dir, &dir_u, -- ATTR_MTIME|ATTR_CTIME); -+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - mutex_unlock(&dir->ei_update_lock); - } - -@@ -617,7 +653,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - - struct bch_subvolume subvol; - struct bch_inode_unpacked inode_u; -- ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: -+ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - -@@ -628,7 +664,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - goto err; - - /* regular files may have hardlinks: */ -- if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && -+ if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", -@@ -706,7 +742,7 @@ static int __bch2_link(struct bch_fs *c, - - if (likely(!ret)) { - bch2_inode_update_after_write(trans, dir, &dir_u, -- ATTR_MTIME|ATTR_CTIME); -+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); - } - -@@ -759,7 +795,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - goto err; - - bch2_inode_update_after_write(trans, dir, &dir_u, -- ATTR_MTIME|ATTR_CTIME); -+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_MTIME); - -@@ -937,11 +973,11 @@ static int bch2_rename2(struct mnt_idmap *idmap, - dst_inode->v.i_ino != dst_inode_u.bi_inum); - - bch2_inode_update_after_write(trans, src_dir, &src_dir_u, -- ATTR_MTIME|ATTR_CTIME); -+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - if (src_dir != dst_dir) - bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, -- ATTR_MTIME|ATTR_CTIME); -+ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - bch2_inode_update_after_write(trans, src_inode, &src_inode_u, - ATTR_CTIME); -@@ -1245,7 +1281,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_buf cur, prev; -- unsigned offset_into_extent, sectors; - bool have_extent = false; - int ret = 0; - -@@ -1278,7 +1313,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - - bch2_btree_iter_set_snapshot(&iter, snapshot); - -- k = bch2_btree_iter_peek_upto(&iter, end); -+ k = bch2_btree_iter_peek_max(&iter, end); - ret = bkey_err(k); - if (ret) - continue; -@@ -1292,9 +1327,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - continue; - } - -- offset_into_extent = iter.pos.offset - -- bkey_start_offset(k.k); -- sectors = k.k->size - offset_into_extent; -+ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); -+ unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&cur, c, k); - -@@ -1306,7 +1340,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - k = bkey_i_to_s_c(cur.k); - bch2_bkey_buf_realloc(&prev, c, k.k->u64s); - -- sectors = min(sectors, k.k->size - offset_into_extent); -+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + -@@ -1736,7 +1770,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, - bch2_inode_update_after_write(trans, inode, bi, ~0); - - inode->v.i_blocks = bi->bi_sectors; -- inode->v.i_ino = bi->bi_inum; - inode->v.i_rdev = bi->bi_dev; - inode->v.i_generation = bi->bi_generation; - inode->v.i_size = bi->bi_size; -@@ -1769,7 +1802,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, - break; - } - -- mapping_set_large_folios(inode->v.i_mapping); -+ mapping_set_folio_min_order(inode->v.i_mapping, -+ get_order(trans->c->opts.block_size)); - } - - static void bch2_free_inode(struct inode *vinode) -@@ -2200,7 +2234,8 @@ static int bch2_fs_get_tree(struct fs_context *fc) - sb->s_time_gran = c->sb.nsec_per_time_unit; - sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; - sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); -- sb->s_uuid = c->sb.user_uuid; -+ super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); -+ super_set_sysfs_name_uuid(sb); - sb->s_shrink->seeks = 0; - c->vfs_sb = sb; - strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -@@ -2345,13 +2380,16 @@ static int bch2_init_fs_context(struct fs_context *fc) - - void bch2_fs_vfs_exit(struct bch_fs *c) - { -+ if (c->vfs_inodes_by_inum_table.ht.tbl) -+ rhltable_destroy(&c->vfs_inodes_by_inum_table); - if (c->vfs_inodes_table.tbl) - rhashtable_destroy(&c->vfs_inodes_table); - } - - int bch2_fs_vfs_init(struct bch_fs *c) - { -- return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); -+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: -+ rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); - } - - static struct file_system_type bcache_fs_type = { -diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h -index 59f9f7ae728d..dd2198541455 100644 ---- a/fs/bcachefs/fs.h -+++ b/fs/bcachefs/fs.h -@@ -14,6 +14,7 @@ - struct bch_inode_info { - struct inode v; - struct rhash_head hash; -+ struct rhlist_head by_inum_hash; - subvol_inum ei_inum; - - struct list_head ei_vfs_inode_list; -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -index 75c8a97a6954..451b42c83fa2 100644 ---- a/fs/bcachefs/fsck.c -+++ b/fs/bcachefs/fsck.c -@@ -1,6 +1,7 @@ - // SPDX-License-Identifier: GPL-2.0 - - #include "bcachefs.h" -+#include "bcachefs_ioctl.h" - #include "bkey_buf.h" - #include "btree_cache.h" - #include "btree_update.h" -@@ -16,6 +17,7 @@ - #include "recovery_passes.h" - #include "snapshot.h" - #include "super.h" -+#include "thread_with_file.h" - #include "xattr.h" - - #include -@@ -73,7 +75,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, - { - u64 sectors = 0; - -- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, -+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ -@@ -90,7 +92,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, - { - u64 subdirs = 0; - -- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, -+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ -@@ -107,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) - { - struct bch_subvolume s; -- int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); -+ int ret = bch2_subvolume_get(trans, subvol, false, &s); - - *snapshot = le32_to_cpu(s.snapshot); - *inum = le64_to_cpu(s.inode); -@@ -170,7 +172,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, - if (ret) - return ret; - -- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - *target = le64_to_cpu(d.v->d_inum); - *type = d.v->d_type; - bch2_trans_iter_exit(trans, &iter); -@@ -203,6 +205,36 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) - return ret; - } - -+/* -+ * Find any subvolume associated with a tree of snapshots -+ * We can't rely on master_subvol - it might have been deleted. -+ */ -+static int find_snapshot_tree_subvol(struct btree_trans *trans, -+ u32 tree_id, u32 *subvol) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; -+ -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); -+ if (le32_to_cpu(s.v->tree) != tree_id) -+ continue; -+ -+ if (s.v->subvol) { -+ *subvol = le32_to_cpu(s.v->subvol); -+ goto found; -+ } -+ } -+ ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; -+found: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ - /* Get lost+found, create if it doesn't exist: */ - static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound, -@@ -210,6 +242,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - { - struct bch_fs *c = trans->c; - struct qstr lostfound_str = QSTR("lost+found"); -+ struct btree_iter lostfound_iter = { NULL }; - u64 inum = 0; - unsigned d_type = 0; - int ret; -@@ -220,20 +253,24 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - if (ret) - return ret; - -- subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; -+ u32 subvolid; -+ ret = find_snapshot_tree_subvol(trans, -+ bch2_snapshot_tree(c, snapshot), &subvolid); -+ bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", -+ bch2_snapshot_tree(c, snapshot)); -+ if (ret) -+ return ret; - - struct bch_subvolume subvol; -- ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), -- false, 0, &subvol); -- bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", -- le32_to_cpu(st.master_subvol), snapshot); -+ ret = bch2_subvolume_get(trans, subvolid, false, &subvol); -+ bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); - if (ret) - return ret; - - if (!subvol.inode) { - struct btree_iter iter; - struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, -- BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), -+ BTREE_ID_subvolumes, POS(0, subvolid), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) -@@ -243,13 +280,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - bch2_trans_iter_exit(trans, &iter); - } - -- root_inum.inum = le64_to_cpu(subvol.inode); -+ subvol_inum root_inum = { -+ .subvol = subvolid, -+ .inum = le64_to_cpu(subvol.inode) -+ }; - - struct bch_inode_unpacked root_inode; - struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); - bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", -- root_inum.inum, le32_to_cpu(st.master_subvol)); -+ root_inum.inum, subvolid); - if (ret) - return ret; - -@@ -288,11 +328,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - * XXX: we could have a nicer log message here if we had a nice way to - * walk backpointers to print a path - */ -- bch_notice(c, "creating lost+found in subvol %llu snapshot %u", -- root_inum.subvol, le32_to_cpu(st.root_snapshot)); -+ struct printbuf path = PRINTBUF; -+ ret = bch2_inum_to_path(trans, root_inum, &path); -+ if (ret) -+ goto err; -+ -+ bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", -+ path.buf, root_inum.subvol, snapshot); -+ printbuf_exit(&path); - - u64 now = bch2_current_time(c); -- struct btree_iter lostfound_iter = { NULL }; - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, lostfound); -@@ -451,7 +496,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * - continue; - - struct bch_inode_unpacked child_inode; -- bch2_inode_unpack(k, &child_inode); -+ ret = bch2_inode_unpack(k, &child_inode); -+ if (ret) -+ break; - - if (!inode_should_reattach(&child_inode)) { - ret = maybe_delete_dirent(trans, -@@ -482,6 +529,13 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * - return ret; - } - -+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos pos) -+{ -+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -+} -+ - static int remove_backpointer(struct btree_trans *trans, - struct bch_inode_unpacked *inode) - { -@@ -490,13 +544,11 @@ static int remove_backpointer(struct btree_trans *trans, - - struct bch_fs *c = trans->c; - struct btree_iter iter; -- struct bkey_s_c_dirent d = -- bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, -- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, -- dirent); -- int ret = bkey_err(d) ?: -- dirent_points_to_inode(c, d, inode) ?: -- __remove_dirent(trans, d.k->p); -+ struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, -+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); -+ int ret = bkey_err(d) ?: -+ dirent_points_to_inode(c, d, inode) ?: -+ __remove_dirent(trans, d.k->p); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -613,7 +665,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); -- struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); -+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) -@@ -771,6 +823,7 @@ struct inode_walker_entry { - struct bch_inode_unpacked inode; - u32 snapshot; - u64 count; -+ u64 i_size; - }; - - struct inode_walker { -@@ -780,11 +833,13 @@ struct inode_walker { - struct bpos last_pos; - - DARRAY(struct inode_walker_entry) inodes; -+ snapshot_id_list deletes; - }; - - static void inode_walker_exit(struct inode_walker *w) - { - darray_exit(&w->inodes); -+ darray_exit(&w->deletes); - } - - static struct inode_walker inode_walker_init(void) -@@ -797,9 +852,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, - { - struct bch_inode_unpacked u; - -- BUG_ON(bch2_inode_unpack(inode, &u)); -- -- return darray_push(&w->inodes, ((struct inode_walker_entry) { -+ return bch2_inode_unpack(inode, &u) ?: -+ darray_push(&w->inodes, ((struct inode_walker_entry) { - .inode = u, - .snapshot = inode.k->p.snapshot, - })); -@@ -857,8 +911,9 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_ - if (k.k->p.snapshot != i->snapshot && !is_whiteout) { - struct inode_walker_entry new = *i; - -- new.snapshot = k.k->p.snapshot; -- new.count = 0; -+ new.snapshot = k.k->p.snapshot; -+ new.count = 0; -+ new.i_size = 0; - - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); -@@ -909,8 +964,9 @@ static int get_visible_inodes(struct btree_trans *trans, - int ret; - - w->inodes.nr = 0; -+ w->deletes.nr = 0; - -- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), -+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; -@@ -918,10 +974,13 @@ static int get_visible_inodes(struct btree_trans *trans, - if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) - continue; - -- if (bkey_is_inode(k.k)) -- add_inode(c, w, k); -+ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) -+ continue; - -- if (k.k->p.snapshot >= s->pos.snapshot) -+ ret = bkey_is_inode(k.k) -+ ? add_inode(c, w, k) -+ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); -+ if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); -@@ -929,69 +988,16 @@ static int get_visible_inodes(struct btree_trans *trans, - return ret; - } - --static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) --{ -- if (d.v->d_type == DT_SUBVOL) { -- u32 snap; -- u64 inum; -- int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); -- if (ret && !bch2_err_matches(ret, ENOENT)) -- return ret; -- return !ret; -- } else { -- struct btree_iter iter; -- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -- SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); -- int ret = bkey_err(k); -- if (ret) -- return ret; -- -- ret = bkey_is_inode(k.k); -- bch2_trans_iter_exit(trans, &iter); -- return ret; -- } --} -- - /* - * Prefer to delete the first one, since that will be the one at the wrong - * offset: - * return value: 0 -> delete k1, 1 -> delete k2 - */ --static int hash_pick_winner(struct btree_trans *trans, -- const struct bch_hash_desc desc, -- struct bch_hash_info *hash_info, -- struct bkey_s_c k1, -- struct bkey_s_c k2) --{ -- if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && -- !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) -- return 0; -- -- switch (desc.btree_id) { -- case BTREE_ID_dirents: { -- int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); -- if (ret < 0) -- return ret; -- if (!ret) -- return 0; -- -- ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); -- if (ret < 0) -- return ret; -- if (!ret) -- return 1; -- return 2; -- } -- default: -- return 0; -- } --} -- --static int fsck_update_backpointers(struct btree_trans *trans, -- struct snapshots_seen *s, -- const struct bch_hash_desc desc, -- struct bch_hash_info *hash_info, -- struct bkey_i *new) -+int bch2_fsck_update_backpointers(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct bkey_i *new) - { - if (new->k.type != KEY_TYPE_dirent) - return 0; -@@ -1019,160 +1025,6 @@ static int fsck_update_backpointers(struct btree_trans *trans, - return ret; - } - --static int fsck_rename_dirent(struct btree_trans *trans, -- struct snapshots_seen *s, -- const struct bch_hash_desc desc, -- struct bch_hash_info *hash_info, -- struct bkey_s_c_dirent old) --{ -- struct qstr old_name = bch2_dirent_get_name(old); -- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); -- int ret = PTR_ERR_OR_ZERO(new); -- if (ret) -- return ret; -- -- bkey_dirent_init(&new->k_i); -- dirent_copy_target(new, old); -- new->k.p = old.k->p; -- -- for (unsigned i = 0; i < 1000; i++) { -- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", -- old_name.len, old_name.name, i); -- unsigned u64s = BKEY_U64s + dirent_val_u64s(len); -- -- if (u64s > U8_MAX) -- return -EINVAL; -- -- new->k.u64s = u64s; -- -- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, -- (subvol_inum) { 0, old.k->p.inode }, -- old.k->p.snapshot, &new->k_i, -- BTREE_UPDATE_internal_snapshot_node); -- if (!bch2_err_matches(ret, EEXIST)) -- break; -- } -- -- if (ret) -- return ret; -- -- return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); --} -- --static int hash_check_key(struct btree_trans *trans, -- struct snapshots_seen *s, -- const struct bch_hash_desc desc, -- struct bch_hash_info *hash_info, -- struct btree_iter *k_iter, struct bkey_s_c hash_k) --{ -- struct bch_fs *c = trans->c; -- struct btree_iter iter = { NULL }; -- struct printbuf buf = PRINTBUF; -- struct bkey_s_c k; -- u64 hash; -- int ret = 0; -- -- if (hash_k.k->type != desc.key_type) -- return 0; -- -- hash = desc.hash_bkey(hash_info, hash_k); -- -- if (likely(hash == hash_k.k->p.offset)) -- return 0; -- -- if (hash_k.k->p.offset < hash) -- goto bad_hash; -- -- for_each_btree_key_norestart(trans, iter, desc.btree_id, -- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), -- BTREE_ITER_slots, k, ret) { -- if (bkey_eq(k.k->p, hash_k.k->p)) -- break; -- -- if (k.k->type == desc.key_type && -- !desc.cmp_bkey(k, hash_k)) -- goto duplicate_entries; -- -- if (bkey_deleted(k.k)) { -- bch2_trans_iter_exit(trans, &iter); -- goto bad_hash; -- } -- } --out: -- bch2_trans_iter_exit(trans, &iter); -- printbuf_exit(&buf); -- return ret; --bad_hash: -- if (fsck_err(trans, hash_table_key_wrong_offset, -- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", -- bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { -- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); -- if (IS_ERR(new)) -- return PTR_ERR(new); -- -- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, -- (subvol_inum) { 0, hash_k.k->p.inode }, -- hash_k.k->p.snapshot, new, -- STR_HASH_must_create| -- BTREE_ITER_with_updates| -- BTREE_UPDATE_internal_snapshot_node); -- ret = bkey_err(k); -- if (ret) -- goto out; -- if (k.k) -- goto duplicate_entries; -- -- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, -- BTREE_UPDATE_internal_snapshot_node) ?: -- fsck_update_backpointers(trans, s, desc, hash_info, new) ?: -- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -- -BCH_ERR_transaction_restart_nested; -- goto out; -- } --fsck_err: -- goto out; --duplicate_entries: -- ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); -- if (ret < 0) -- goto out; -- -- if (!fsck_err(trans, hash_table_key_duplicate, -- "duplicate hash table keys%s:\n%s", -- ret != 2 ? "" : ", both point to valid inodes", -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, hash_k), -- prt_newline(&buf), -- bch2_bkey_val_to_text(&buf, c, k), -- buf.buf))) -- goto out; -- -- switch (ret) { -- case 0: -- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); -- break; -- case 1: -- ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); -- break; -- case 2: -- ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: -- bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); -- goto out; -- } -- -- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: -- -BCH_ERR_transaction_restart_nested; -- goto out; --} -- --static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bpos pos) --{ -- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); --} -- - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, -@@ -1260,7 +1112,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans, - goto err; - BUG(); - found_root: -- BUG_ON(bch2_inode_unpack(k, root)); -+ ret = bch2_inode_unpack(k, root); - err: - bch2_trans_iter_exit(trans, &iter); - return ret; -@@ -1291,7 +1143,9 @@ static int check_inode(struct btree_trans *trans, - if (!bkey_is_inode(k.k)) - return 0; - -- BUG_ON(bch2_inode_unpack(k, &u)); -+ ret = bch2_inode_unpack(k, &u); -+ if (ret) -+ goto err; - - if (snapshot_root->bi_inum != u.bi_inum) { - ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); -@@ -1302,7 +1156,7 @@ static int check_inode(struct btree_trans *trans, - if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, -- "inodes in different snapshots don't match")) { -+ "inode hash info in different snapshots don't match")) { - u.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); - do_update = true; -@@ -1392,7 +1246,7 @@ static int check_inode(struct btree_trans *trans, - - if (fsck_err_on(!ret, - trans, inode_unlinked_and_not_open, -- "inode %llu%u unlinked and not open", -+ "inode %llu:%u unlinked and not open", - u.bi_inum, u.bi_snapshot)) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); -@@ -1415,7 +1269,7 @@ static int check_inode(struct btree_trans *trans, - if (u.bi_subvol) { - struct bch_subvolume s; - -- ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); -+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - -@@ -1441,6 +1295,17 @@ static int check_inode(struct btree_trans *trans, - do_update = true; - } - } -+ -+ if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), -+ trans, inode_journal_seq_in_future, -+ "inode journal seq in future (currently at %llu)\n%s", -+ journal_cur_seq(&c->journal), -+ (printbuf_reset(&buf), -+ bch2_inode_unpacked_to_text(&buf, &u), -+ buf.buf))) { -+ u.bi_journal_seq = journal_cur_seq(&c->journal); -+ do_update = true; -+ } - do_update: - if (do_update) { - ret = __bch2_fsck_write_inode(trans, &u); -@@ -1502,7 +1367,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, - break; - - struct bch_inode_unpacked parent_inode; -- bch2_inode_unpack(k, &parent_inode); -+ ret = bch2_inode_unpack(k, &parent_inode); -+ if (ret) -+ break; - - if (!inode_should_reattach(&parent_inode)) - break; -@@ -1525,7 +1392,9 @@ static int check_unreachable_inode(struct btree_trans *trans, - return 0; - - struct bch_inode_unpacked inode; -- BUG_ON(bch2_inode_unpack(k, &inode)); -+ ret = bch2_inode_unpack(k, &inode); -+ if (ret) -+ return ret; - - if (!inode_should_reattach(&inode)) - return 0; -@@ -1649,7 +1518,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); -- return -BCH_ERR_internal_fsck_err; -+ i->count = count2; - } - - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), -@@ -1753,7 +1622,7 @@ static int overlapping_extents_found(struct btree_trans *trans, - bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents); -- k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); -+ k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k1); - if (ret) - goto err; -@@ -1778,7 +1647,7 @@ static int overlapping_extents_found(struct btree_trans *trans, - while (1) { - bch2_btree_iter_advance(&iter2); - -- k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); -+ k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k2); - if (ret) - goto err; -@@ -2109,7 +1978,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ - return ret; - } - --static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) -+static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) - { - u32 restart_count = trans->restart_count; - return check_subdir_count_notnested(trans, w) ?: -@@ -2156,7 +2025,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, - return __bch2_fsck_write_inode(trans, target); - } - -- if (bch2_inode_should_have_bp(target) && -+ if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), -@@ -2459,7 +2328,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - goto out; - - if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { -- ret = check_subdir_count(trans, dir); -+ ret = check_subdir_dirents_count(trans, dir); - if (ret) - goto err; - } -@@ -2480,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - *hash_info = bch2_hash_info_init(c, &i->inode); - dir->first_this_inode = false; - -- ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); -+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); - if (ret < 0) - goto err; - if (ret) { -@@ -2519,15 +2388,41 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - if (ret) - goto err; - } -+ -+ darray_for_each(target->deletes, i) -+ if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), -+ trans, dirent_to_overwritten_inode, -+ "dirent points to inode overwritten in snapshot %u:\n%s", -+ *i, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), -+ buf.buf))) { -+ struct btree_iter delete_iter; -+ bch2_trans_iter_init(trans, &delete_iter, -+ BTREE_ID_dirents, -+ SPOS(k.k->p.inode, k.k->p.offset, *i), -+ BTREE_ITER_intent); -+ ret = bch2_btree_iter_traverse(&delete_iter) ?: -+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ hash_info, -+ &delete_iter, -+ BTREE_UPDATE_internal_snapshot_node); -+ bch2_trans_iter_exit(trans, &delete_iter); -+ if (ret) -+ goto err; -+ -+ } - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - -- if (d.v->d_type == DT_DIR) -- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) -+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { -+ if (d.v->d_type == DT_DIR) - i->count++; -+ i->i_size += bkey_bytes(d.k); -+ } - out: - err: - fsck_err: -@@ -2594,7 +2489,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, - *hash_info = bch2_hash_info_init(c, &i->inode); - inode->first_this_inode = false; - -- ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); -+ ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); - bch_err_fn(c, ret); - return ret; - } -@@ -2774,6 +2669,48 @@ struct pathbuf_entry { - - typedef DARRAY(struct pathbuf_entry) pathbuf; - -+static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, -+ u32 new_depth) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, p->inum, p->snapshot), 0); -+ -+ struct bch_inode_unpacked inode; -+ int ret = bkey_err(k) ?: -+ !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode -+ : bch2_inode_unpack(k, &inode); -+ if (ret) -+ goto err; -+ -+ if (inode.bi_depth != new_depth) { -+ inode.bi_depth = new_depth; -+ ret = __bch2_fsck_write_inode(trans, &inode) ?: -+ bch2_trans_commit(trans, NULL, NULL, 0); -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) -+{ -+ u32 restart_count = trans->restart_count; -+ int ret = 0; -+ -+ darray_for_each_reverse(*path, i) { -+ ret = nested_lockrestart_do(trans, -+ bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); -+ bch_err_fn(trans->c, ret); -+ if (ret) -+ break; -+ -+ new_bi_depth++; -+ } -+ -+ return ret ?: trans_was_restarted(trans, restart_count); -+} -+ - static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) - { - darray_for_each(*p, i) -@@ -2783,21 +2720,21 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) - return false; - } - --static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) -+static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) - { - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; -- struct bch_inode_unpacked inode; -+ pathbuf path = {}; - struct printbuf buf = PRINTBUF; - u32 snapshot = inode_k.k->p.snapshot; -+ bool redo_bi_depth = false; -+ u32 min_bi_depth = U32_MAX; - int ret = 0; - -- p->nr = 0; -- -- BUG_ON(bch2_inode_unpack(inode_k, &inode)); -- -- if (!S_ISDIR(inode.bi_mode)) -- return 0; -+ struct bch_inode_unpacked inode; -+ ret = bch2_inode_unpack(inode_k, &inode); -+ if (ret) -+ return ret; - - while (!inode.bi_subvol) { - struct btree_iter dirent_iter; -@@ -2807,7 +2744,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino - d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); - ret = bkey_err(d.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) -- break; -+ goto out; - - if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) - bch2_trans_iter_exit(trans, &dirent_iter); -@@ -2822,7 +2759,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino - - bch2_trans_iter_exit(trans, &dirent_iter); - -- ret = darray_push(p, ((struct pathbuf_entry) { -+ ret = darray_push(&path, ((struct pathbuf_entry) { - .inum = inode.bi_inum, - .snapshot = snapshot, - })); -@@ -2834,22 +2771,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino - bch2_trans_iter_exit(trans, &inode_iter); - inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inode.bi_dir, snapshot), 0); -+ -+ struct bch_inode_unpacked parent_inode; - ret = bkey_err(inode_k) ?: - !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode -- : bch2_inode_unpack(inode_k, &inode); -+ : bch2_inode_unpack(inode_k, &parent_inode); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err_msg(c, ret, "error looking up parent directory"); -- break; -+ goto out; - } - -+ min_bi_depth = parent_inode.bi_depth; -+ -+ if (parent_inode.bi_depth < inode.bi_depth && -+ min_bi_depth < U16_MAX) -+ break; -+ -+ inode = parent_inode; - snapshot = inode_k.k->p.snapshot; -+ redo_bi_depth = true; - -- if (path_is_dup(p, inode.bi_inum, snapshot)) { -+ if (path_is_dup(&path, inode.bi_inum, snapshot)) { - /* XXX print path */ - bch_err(c, "directory structure loop"); - -- darray_for_each(*p, i) -+ darray_for_each(path, i) - pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode.bi_inum, snapshot); - -@@ -2862,12 +2809,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - } -- break; -+ -+ goto out; - } - } -+ -+ if (inode.bi_subvol) -+ min_bi_depth = 0; -+ -+ if (redo_bi_depth) -+ ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); - out: - fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); -+ darray_exit(&path); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -@@ -2879,24 +2834,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino - */ - int bch2_check_directory_structure(struct bch_fs *c) - { -- pathbuf path = { 0, }; -- int ret; -- -- ret = bch2_trans_run(c, -+ int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -- if (!bkey_is_inode(k.k)) -+ if (!S_ISDIR(bkey_inode_mode(k))) - continue; - - if (bch2_inode_flags(k) & BCH_INODE_unlinked) - continue; - -- check_path(trans, &path, k); -+ check_path_loop(trans, k); - }))); -- darray_exit(&path); - - bch_err_fn(c, ret); - return ret; -@@ -2994,7 +2945,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, - - /* Should never fail, checked by bch2_inode_invalid: */ - struct bch_inode_unpacked u; -- BUG_ON(bch2_inode_unpack(k, &u)); -+ _ret3 = bch2_inode_unpack(k, &u); -+ if (_ret3) -+ break; - - /* - * Backpointer and directory structure checks are sufficient for -@@ -3072,7 +3025,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite - if (!bkey_is_inode(k.k)) - return 0; - -- BUG_ON(bch2_inode_unpack(k, &u)); -+ ret = bch2_inode_unpack(k, &u); -+ if (ret) -+ return ret; - - if (S_ISDIR(u.bi_mode)) - return 0; -@@ -3194,3 +3149,223 @@ int bch2_fix_reflink_p(struct bch_fs *c) - bch_err_fn(c, ret); - return ret; - } -+ -+#ifndef NO_BCACHEFS_CHARDEV -+ -+struct fsck_thread { -+ struct thread_with_stdio thr; -+ struct bch_fs *c; -+ struct bch_opts opts; -+}; -+ -+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -+{ -+ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); -+ kfree(thr); -+} -+ -+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -+{ -+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); -+ struct bch_fs *c = thr->c; -+ -+ int ret = PTR_ERR_OR_ZERO(c); -+ if (ret) -+ return ret; -+ -+ ret = bch2_fs_start(thr->c); -+ if (ret) -+ goto err; -+ -+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) { -+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); -+ ret |= 1; -+ } -+ if (test_bit(BCH_FS_error, &c->flags)) { -+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); -+ ret |= 4; -+ } -+err: -+ bch2_fs_stop(c); -+ return ret; -+} -+ -+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { -+ .exit = bch2_fsck_thread_exit, -+ .fn = bch2_fsck_offline_thread_fn, -+}; -+ -+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -+{ -+ struct bch_ioctl_fsck_offline arg; -+ struct fsck_thread *thr = NULL; -+ darray_str(devs) = {}; -+ long ret = 0; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags) -+ return -EINVAL; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ for (size_t i = 0; i < arg.nr_devs; i++) { -+ u64 dev_u64; -+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); -+ if (ret) -+ goto err; -+ -+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); -+ ret = PTR_ERR_OR_ZERO(dev_str); -+ if (ret) -+ goto err; -+ -+ ret = darray_push(&devs, dev_str); -+ if (ret) { -+ kfree(dev_str); -+ goto err; -+ } -+ } -+ -+ thr = kzalloc(sizeof(*thr), GFP_KERNEL); -+ if (!thr) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ thr->opts = bch2_opts_empty(); -+ -+ if (arg.opts) { -+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); -+ ret = PTR_ERR_OR_ZERO(optstr) ?: -+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); -+ if (!IS_ERR(optstr)) -+ kfree(optstr); -+ -+ if (ret) -+ goto err; -+ } -+ -+ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); -+ opt_set(thr->opts, read_only, 1); -+ opt_set(thr->opts, ratelimit_errors, 0); -+ -+ /* We need request_key() to be called before we punt to kthread: */ -+ opt_set(thr->opts, nostart, true); -+ -+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); -+ -+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); -+ -+ if (!IS_ERR(thr->c) && -+ thr->c->opts.errors == BCH_ON_ERROR_panic) -+ thr->c->opts.errors = BCH_ON_ERROR_ro; -+ -+ ret = __bch2_run_thread_with_stdio(&thr->thr); -+out: -+ darray_for_each(devs, i) -+ kfree(*i); -+ darray_exit(&devs); -+ return ret; -+err: -+ if (thr) -+ bch2_fsck_thread_exit(&thr->thr); -+ pr_err("ret %s", bch2_err_str(ret)); -+ goto out; -+} -+ -+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -+{ -+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); -+ struct bch_fs *c = thr->c; -+ -+ c->stdio_filter = current; -+ c->stdio = &thr->thr.stdio; -+ -+ /* -+ * XXX: can we figure out a way to do this without mucking with c->opts? -+ */ -+ unsigned old_fix_errors = c->opts.fix_errors; -+ if (opt_defined(thr->opts, fix_errors)) -+ c->opts.fix_errors = thr->opts.fix_errors; -+ else -+ c->opts.fix_errors = FSCK_FIX_ask; -+ -+ c->opts.fsck = true; -+ set_bit(BCH_FS_fsck_running, &c->flags); -+ -+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; -+ int ret = bch2_run_online_recovery_passes(c); -+ -+ clear_bit(BCH_FS_fsck_running, &c->flags); -+ bch_err_fn(c, ret); -+ -+ c->stdio = NULL; -+ c->stdio_filter = NULL; -+ c->opts.fix_errors = old_fix_errors; -+ -+ up(&c->online_fsck_mutex); -+ bch2_ro_ref_put(c); -+ return ret; -+} -+ -+static const struct thread_with_stdio_ops bch2_online_fsck_ops = { -+ .exit = bch2_fsck_thread_exit, -+ .fn = bch2_fsck_online_thread_fn, -+}; -+ -+long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) -+{ -+ struct fsck_thread *thr = NULL; -+ long ret = 0; -+ -+ if (arg.flags) -+ return -EINVAL; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (!bch2_ro_ref_tryget(c)) -+ return -EROFS; -+ -+ if (down_trylock(&c->online_fsck_mutex)) { -+ bch2_ro_ref_put(c); -+ return -EAGAIN; -+ } -+ -+ thr = kzalloc(sizeof(*thr), GFP_KERNEL); -+ if (!thr) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ thr->c = c; -+ thr->opts = bch2_opts_empty(); -+ -+ if (arg.opts) { -+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); -+ -+ ret = PTR_ERR_OR_ZERO(optstr) ?: -+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); -+ if (!IS_ERR(optstr)) -+ kfree(optstr); -+ -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -+err: -+ if (ret < 0) { -+ bch_err_fn(c, ret); -+ if (thr) -+ bch2_fsck_thread_exit(&thr->thr); -+ up(&c->online_fsck_mutex); -+ bch2_ro_ref_put(c); -+ } -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_CHARDEV */ -diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h -index 1cca31011530..574948278cd4 100644 ---- a/fs/bcachefs/fsck.h -+++ b/fs/bcachefs/fsck.h -@@ -2,6 +2,14 @@ - #ifndef _BCACHEFS_FSCK_H - #define _BCACHEFS_FSCK_H - -+#include "str_hash.h" -+ -+int bch2_fsck_update_backpointers(struct btree_trans *, -+ struct snapshots_seen *, -+ const struct bch_hash_desc, -+ struct bch_hash_info *, -+ struct bkey_i *); -+ - int bch2_check_inodes(struct bch_fs *); - int bch2_check_extents(struct bch_fs *); - int bch2_check_indirect_extents(struct bch_fs *); -@@ -14,4 +22,7 @@ int bch2_check_directory_structure(struct bch_fs *); - int bch2_check_nlinks(struct bch_fs *); - int bch2_fix_reflink_p(struct bch_fs *); - -+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); -+long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); -+ - #endif /* _BCACHEFS_FSCK_H */ -diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c -index 039cb7a22244..339b80770f1d 100644 ---- a/fs/bcachefs/inode.c -+++ b/fs/bcachefs/inode.c -@@ -14,6 +14,7 @@ - #include "extent_update.h" - #include "fs.h" - #include "inode.h" -+#include "opts.h" - #include "str_hash.h" - #include "snapshot.h" - #include "subvolume.h" -@@ -47,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, - u8 *p; - - if (in >= end) -- return -1; -+ return -BCH_ERR_inode_unpack_error; - - if (!*in) -- return -1; -+ return -BCH_ERR_inode_unpack_error; - - /* - * position of highest set bit indicates number of bytes: -@@ -60,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, - bytes = byte_table[shift - 1]; - - if (in + bytes > end) -- return -1; -+ return -BCH_ERR_inode_unpack_error; - - p = (u8 *) be + 16 - bytes; - memcpy(p, in, bytes); -@@ -176,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, - return ret; \ - \ - if (field_bits > sizeof(unpacked->_name) * 8) \ -- return -1; \ -+ return -BCH_ERR_inode_unpack_error; \ - \ - unpacked->_name = field[1]; \ - in += ret; -@@ -217,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ -- return -1; \ -+ return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v2() -@@ -268,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ -- return -1; \ -+ return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v3() -@@ -428,7 +429,7 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) - } - - static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bch_inode_unpacked unpacked; - int ret = 0; -@@ -468,7 +469,7 @@ static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - } - - int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - int ret = 0; -@@ -478,13 +479,13 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - "invalid str hash type (%llu >= %u)", - INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - -- ret = __bch2_inode_validate(c, k, flags); -+ ret = __bch2_inode_validate(c, k, from); - fsck_err: - return ret; - } - - int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - int ret = 0; -@@ -494,13 +495,13 @@ int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - "invalid str hash type (%llu >= %u)", - INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - -- ret = __bch2_inode_validate(c, k, flags); -+ ret = __bch2_inode_validate(c, k, from); - fsck_err: - return ret; - } - - int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - int ret = 0; -@@ -518,7 +519,7 @@ int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - "invalid str hash type (%llu >= %u)", - INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - -- ret = __bch2_inode_validate(c, k, flags); -+ ret = __bch2_inode_validate(c, k, from); - fsck_err: - return ret; - } -@@ -617,7 +618,7 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter - struct bkey_s_c k; - int ret = 0; - -- for_each_btree_key_upto_norestart(trans, *iter, btree, -+ for_each_btree_key_max_norestart(trans, *iter, btree, - bpos_successor(pos), - SPOS(pos.inode, pos.offset, U32_MAX), - flags|BTREE_ITER_all_snapshots, k, ret) -@@ -652,7 +653,7 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) - struct bkey_s_c k; - int ret = 0; - -- for_each_btree_key_upto_norestart(trans, iter, -+ for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_with_updates, k, ret) -@@ -779,7 +780,7 @@ int bch2_trigger_inode(struct btree_trans *trans, - } - - int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -798,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, - prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); - } - -+int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, -+ struct bkey_validate_context from) -+{ -+ int ret = 0; -+ -+ bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, -+ c, inode_alloc_cursor_inode_bad, -+ "k.p.inode bad"); -+fsck_err: -+ return ret; -+} -+ -+void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); -+ -+ prt_printf(out, "idx %llu generation %llu", -+ le64_to_cpu(i.v->idx), -+ le64_to_cpu(i.v->gen)); -+} -+ - void bch2_inode_init_early(struct bch_fs *c, - struct bch_inode_unpacked *inode_u) - { -@@ -858,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k) - } - } - --/* -- * This just finds an empty slot: -- */ --int bch2_inode_create(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bch_inode_unpacked *inode_u, -- u32 snapshot, u64 cpu) -+static struct bkey_i_inode_alloc_cursor * -+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) - { - struct bch_fs *c = trans->c; -- struct bkey_s_c k; -- u64 min, max, start, pos, *hint; -- int ret = 0; -- unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - -- if (c->opts.shard_inode_numbers) { -- bits -= c->inode_shard_bits; -+ u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; - -- min = (cpu << bits); -- max = (cpu << bits) | ~(ULLONG_MAX << bits); -+ cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - -- min = max_t(u64, min, BLOCKDEV_INODE_MAX); -- hint = c->unused_inode_hints + cpu; -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, -+ BTREE_ID_logged_ops, -+ POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), -+ BTREE_ITER_cached); -+ int ret = bkey_err(k); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ struct bkey_i_inode_alloc_cursor *cursor = -+ k.k->type == KEY_TYPE_inode_alloc_cursor -+ ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) -+ : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); -+ ret = PTR_ERR_OR_ZERO(cursor); -+ if (ret) -+ goto err; -+ -+ if (c->opts.inodes_32bit) { -+ *min = BLOCKDEV_INODE_MAX; -+ *max = INT_MAX; - } else { -- min = BLOCKDEV_INODE_MAX; -- max = ~(ULLONG_MAX << bits); -- hint = c->unused_inode_hints; -+ cursor->v.bits = c->opts.shard_inode_numbers_bits; -+ -+ unsigned bits = 63 - c->opts.shard_inode_numbers_bits; -+ -+ *min = max(cpu << bits, (u64) INT_MAX + 1); -+ *max = (cpu << bits) | ~(ULLONG_MAX << bits); - } - -- start = READ_ONCE(*hint); -+ if (le64_to_cpu(cursor->v.idx) < *min) -+ cursor->v.idx = cpu_to_le64(*min); - -- if (start >= max || start < min) -- start = min; -+ if (le64_to_cpu(cursor->v.idx) >= *max) { -+ cursor->v.idx = cpu_to_le64(*min); -+ le32_add_cpu(&cursor->v.gen, 1); -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret ? ERR_PTR(ret) : cursor; -+} -+ -+/* -+ * This just finds an empty slot: -+ */ -+int bch2_inode_create(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode_u, -+ u32 snapshot, u64 cpu) -+{ -+ u64 min, max; -+ struct bkey_i_inode_alloc_cursor *cursor = -+ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); -+ int ret = PTR_ERR_OR_ZERO(cursor); -+ if (ret) -+ return ret; -+ -+ u64 start = le64_to_cpu(cursor->v.idx); -+ u64 pos = start; - -- pos = start; - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); -+ struct bkey_s_c k; - again: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k)) && -@@ -924,6 +982,7 @@ int bch2_inode_create(struct btree_trans *trans, - /* Retry from start */ - pos = start = min; - bch2_btree_iter_set_pos(iter, POS(0, pos)); -+ le32_add_cpu(&cursor->v.gen, 1); - goto again; - found_slot: - bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); -@@ -934,9 +993,9 @@ int bch2_inode_create(struct btree_trans *trans, - return ret; - } - -- *hint = k.k->p.offset; - inode_u->bi_inum = k.k->p.offset; -- inode_u->bi_generation = bkey_generation(k); -+ inode_u->bi_generation = le64_to_cpu(cursor->v.gen); -+ cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); - return 0; - } - -@@ -966,7 +1025,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, - - bch2_btree_iter_set_snapshot(&iter, snapshot); - -- k = bch2_btree_iter_peek_upto(&iter, end); -+ k = bch2_btree_iter_peek_max(&iter, end); - ret = bkey_err(k); - if (ret) - goto err; -@@ -998,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) - { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; -- struct bkey_i_inode_generation delete; -- struct bch_inode_unpacked inode_u; - struct bkey_s_c k; - u32 snapshot; - int ret; -@@ -1039,13 +1096,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) - goto err; - } - -- bch2_inode_unpack(k, &inode_u); -- -- bkey_inode_generation_init(&delete.k_i); -- delete.k.p = iter.pos; -- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); -- -- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: -+ ret = bch2_btree_delete_at(trans, &iter, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - err: -@@ -1141,12 +1192,18 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) - void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, - struct bch_inode_unpacked *inode) - { --#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); -+#define x(_name, _bits) \ -+ if ((inode)->bi_##_name) { \ -+ opts->_name = inode->bi_##_name - 1; \ -+ opts->_name##_from_inode = true; \ -+ } else { \ -+ opts->_name = c->opts._name; \ -+ opts->_name##_from_inode = false; \ -+ } - BCH_INODE_OPTS() - #undef x - -- if (opts->nocow) -- opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; -+ bch2_io_opts_fixups(opts); - } - - int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) -@@ -1380,7 +1437,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); - if (ret > 0) { -- bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); -+ bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", -+ k.k->p.offset, k.k->p.snapshot); - - ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - /* -diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h -index eab82b5eb897..428b9be6af34 100644 ---- a/fs/bcachefs/inode.h -+++ b/fs/bcachefs/inode.h -@@ -7,15 +7,14 @@ - #include "opts.h" - #include "snapshot.h" - --enum bch_validate_flags; - extern const char * const bch2_inode_opts[]; - - int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); -@@ -60,7 +59,7 @@ static inline bool bkey_is_inode(const struct bkey *k) - } - - int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ -@@ -69,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk - .min_val_size = 8, \ - }) - -+int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); -+void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ -+ .key_validate = bch2_inode_alloc_cursor_validate, \ -+ .val_to_text = bch2_inode_alloc_cursor_to_text, \ -+ .min_val_size = 16, \ -+}) -+ - #if 0 - typedef struct { - u64 lo; -@@ -220,6 +229,20 @@ static inline u32 bch2_inode_flags(struct bkey_s_c k) - } - } - -+static inline unsigned bkey_inode_mode(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); -+ case KEY_TYPE_inode_v2: -+ return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); -+ case KEY_TYPE_inode_v3: -+ return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); -+ default: -+ return 0; -+ } -+} -+ - /* i_nlink: */ - - static inline unsigned nlink_bias(umode_t mode) -@@ -249,7 +272,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, - int bch2_inode_nlink_inc(struct bch_inode_unpacked *); - void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); - --static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) -+static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) - { - bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; - -@@ -262,6 +285,16 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); - int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); - -+#include "rebalance.h" -+ -+static inline struct bch_extent_rebalance -+bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts io_opts; -+ bch2_inode_opts_get(&io_opts, c, inode); -+ return io_opts_to_rebalance_opts(c, &io_opts); -+} -+ - int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); - int bch2_delete_dead_inodes(struct bch_fs *); - -diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h -index 7928d0c6954f..b99a5bf1a75e 100644 ---- a/fs/bcachefs/inode_format.h -+++ b/fs/bcachefs/inode_format.h -@@ -101,7 +101,9 @@ struct bch_inode_generation { - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ -- x(bi_nocow, 8) -+ x(bi_nocow, 8) \ -+ x(bi_depth, 32) \ -+ x(bi_inodes_32bit, 8) - - /* subset of BCH_INODE_FIELDS */ - #define BCH_INODE_OPTS() \ -@@ -114,7 +116,8 @@ struct bch_inode_generation { - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ -- x(nocow, 8) -+ x(nocow, 8) \ -+ x(inodes_32bit, 8) - - enum inode_opt_id { - #define x(name, ...) \ -@@ -164,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); - LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -+struct bch_inode_alloc_cursor { -+ struct bch_val v; -+ __u8 bits; -+ __u8 pad; -+ __le32 gen; -+ __le64 idx; -+}; -+ - #endif /* _BCACHEFS_INODE_FORMAT_H */ -diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c -index f283051758d6..5353979117b0 100644 ---- a/fs/bcachefs/io_misc.c -+++ b/fs/bcachefs/io_misc.c -@@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans, - err: - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); -- if (should_print_err(ret)) -- bch_err_inum_offset_ratelimited(c, -- inum.inum, -- iter->pos.offset << 9, -- "%s(): error: %s", __func__, bch2_err_str(ret)); -+ if (should_print_err(ret)) { -+ struct printbuf buf = PRINTBUF; -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); -+ prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } - err_noprint: - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); -@@ -164,9 +166,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - bch2_btree_iter_set_snapshot(iter, snapshot); - - /* -- * peek_upto() doesn't have ideal semantics for extents: -+ * peek_max() doesn't have ideal semantics for extents: - */ -- k = bch2_btree_iter_peek_upto(iter, end_pos); -+ k = bch2_btree_iter_peek_max(iter, end_pos); - if (!k.k) - break; - -@@ -426,8 +428,8 @@ case LOGGED_OP_FINSERT_shift_extents: - bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); - - k = insert -- ? bch2_btree_iter_peek_prev(&iter) -- : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); -+ ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) -+ : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); - if ((ret = bkey_err(k))) - goto btree_err; - -@@ -461,7 +463,7 @@ case LOGGED_OP_FINSERT_shift_extents: - - op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - -- ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: -+ ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: - bch2_logged_op_update(trans, &op->k_i) ?: -diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c -index b3b934a87c6d..aa91fcf51eec 100644 ---- a/fs/bcachefs/io_read.c -+++ b/fs/bcachefs/io_read.c -@@ -21,6 +21,7 @@ - #include "io_read.h" - #include "io_misc.h" - #include "io_write.h" -+#include "reflink.h" - #include "subvolume.h" - #include "trace.h" - -@@ -58,7 +59,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) - } - rcu_read_unlock(); - -- return bch2_rand_range(nr * CONGESTED_MAX) < total; -+ return get_random_u32_below(nr * CONGESTED_MAX) < total; - } - - #else -@@ -90,13 +91,18 @@ static const struct rhashtable_params bch_promote_params = { - .automatic_shrinking = true, - }; - -+static inline bool have_io_error(struct bch_io_failures *failed) -+{ -+ return failed && failed->nr; -+} -+ - static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags, - struct bch_io_failures *failed) - { -- if (!failed) { -+ if (!have_io_error(failed)) { - BUG_ON(!opts.promote_target); - - if (!(flags & BCH_READ_MAY_PROMOTE)) -@@ -223,7 +229,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, - - struct data_update_opts update_opts = {}; - -- if (!failed) { -+ if (!have_io_error(failed)) { - update_opts.target = opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; -@@ -231,11 +237,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, - update_opts.target = opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- unsigned i = 0; -+ unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) -- update_opts.rewrite_ptrs |= BIT(i); -- i++; -+ update_opts.rewrite_ptrs |= ptr_bit; -+ ptr_bit <<= 1; - } - } - -@@ -285,7 +291,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - * if failed != NULL we're not actually doing a promote, we're - * recovering from an io/checksum error - */ -- bool promote_full = (failed || -+ bool promote_full = (have_io_error(failed) || - *read_full || - READ_ONCE(c->opts.promote_whole_extents)); - /* data might have to be decompressed in the write path: */ -@@ -321,6 +327,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - - /* Read */ - -+static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -+ struct bch_read_bio *rbio, struct bpos read_pos) -+{ -+ return bch2_inum_offset_err_msg_trans(trans, out, -+ (subvol_inum) { rbio->subvol, read_pos.inode }, -+ read_pos.offset << 9); -+} -+ -+static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, -+ struct bch_read_bio *rbio, struct bpos read_pos) -+{ -+ bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); -+} -+ - #define READ_RETRY_AVOID 1 - #define READ_RETRY 2 - #define READ_ERR 3 -@@ -499,6 +519,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - } - } - -+static void bch2_read_io_err(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bio *bio = &rbio->bio; -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); -+ prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); -+ -+ if (ca) { -+ bch2_io_error(ca, BCH_MEMBER_ERROR_read); -+ bch_err_ratelimited(ca, "%s", buf.buf); -+ } else { -+ bch_err_ratelimited(c, "%s", buf.buf); -+ } -+ -+ printbuf_exit(&buf); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+} -+ - static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) - { -@@ -562,6 +605,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) - __bch2_rbio_narrow_crcs(trans, rbio)); - } - -+static void bch2_read_csum_err(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bio *src = &rbio->bio; -+ struct bch_extent_crc_unpacked crc = rbio->pick.crc; -+ struct nonce nonce = extent_nonce(rbio->version, crc); -+ struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); -+ prt_str(&buf, "data "); -+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); -+ -+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -+ if (ca) { -+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -+ bch_err_ratelimited(ca, "%s", buf.buf); -+ } else { -+ bch_err_ratelimited(c, "%s", buf.buf); -+ } -+ -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ printbuf_exit(&buf); -+} -+ -+static void bch2_read_decompress_err(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); -+ prt_str(&buf, "decompression error"); -+ -+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -+ if (ca) -+ bch_err_ratelimited(ca, "%s", buf.buf); -+ else -+ bch_err_ratelimited(c, "%s", buf.buf); -+ -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ printbuf_exit(&buf); -+} -+ -+static void bch2_read_decrypt_err(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); -+ prt_str(&buf, "decrypt error"); -+ -+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -+ if (ca) -+ bch_err_ratelimited(ca, "%s", buf.buf); -+ else -+ bch_err_ratelimited(c, "%s", buf.buf); -+ -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ printbuf_exit(&buf); -+} -+ - /* Inner part that may run in process context */ - static void __bch2_read_endio(struct work_struct *work) - { -@@ -668,33 +778,13 @@ static void __bch2_read_endio(struct work_struct *work) - goto out; - } - -- struct printbuf buf = PRINTBUF; -- buf.atomic++; -- prt_str(&buf, "data "); -- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); -- -- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -- if (ca) { -- bch_err_inum_offset_ratelimited(ca, -- rbio->read_pos.inode, -- rbio->read_pos.offset << 9, -- "data %s", buf.buf); -- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -- } -- printbuf_exit(&buf); -- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; - decompression_err: -- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, -- rbio->read_pos.offset << 9, -- "decompression error"); -- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; - decrypt_err: -- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, -- rbio->read_pos.offset << 9, -- "decrypt error"); -- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; - } - -@@ -715,16 +805,8 @@ static void bch2_read_endio(struct bio *bio) - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - -- if (bio->bi_status) { -- if (ca) { -- bch_err_inum_offset_ratelimited(ca, -- rbio->read_pos.inode, -- rbio->read_pos.offset, -- "data read error: %s", -- bch2_blk_status_to_str(bio->bi_status)); -- bch2_io_error(ca, BCH_MEMBER_ERROR_read); -- } -- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+ if (unlikely(bio->bi_status)) { -+ bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - return; - } - -@@ -750,45 +832,6 @@ static void bch2_read_endio(struct bio *bio) - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); - } - --int __bch2_read_indirect_extent(struct btree_trans *trans, -- unsigned *offset_into_extent, -- struct bkey_buf *orig_k) --{ -- struct btree_iter iter; -- struct bkey_s_c k; -- u64 reflink_offset; -- int ret; -- -- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + -- *offset_into_extent; -- -- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, -- POS(0, reflink_offset), 0); -- ret = bkey_err(k); -- if (ret) -- goto err; -- -- if (k.k->type != KEY_TYPE_reflink_v && -- k.k->type != KEY_TYPE_indirect_inline_data) { -- bch_err_inum_offset_ratelimited(trans->c, -- orig_k->k->k.p.inode, -- orig_k->k->k.p.offset << 9, -- "%llu len %u points to nonexistent indirect extent %llu", -- orig_k->k->k.p.offset, -- orig_k->k->k.size, -- reflink_offset); -- bch2_inconsistent_error(trans->c); -- ret = -BCH_ERR_missing_indirect_extent; -- goto err; -- } -- -- *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); -- bch2_bkey_buf_reassemble(orig_k, trans->c, k); --err: -- bch2_trans_iter_exit(trans, &iter); -- return ret; --} -- - static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c k, -@@ -868,15 +911,24 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - if (!pick_ret) - goto hole; - -- if (pick_ret < 0) { -+ if (unlikely(pick_ret < 0)) { - struct printbuf buf = PRINTBUF; -+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos); -+ prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); - bch2_bkey_val_to_text(&buf, c, k); - -- bch_err_inum_offset_ratelimited(c, -- read_pos.inode, read_pos.offset << 9, -- "no device to read from: %s\n %s", -- bch2_err_str(pick_ret), -- buf.buf); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ goto err; -+ } -+ -+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { -+ struct printbuf buf = PRINTBUF; -+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos); -+ prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ -+ bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } -@@ -899,12 +951,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - goto retry_pick; - } - -- /* -- * Unlock the iterator while the btree node's lock is still in -- * cache, before doing the IO: -- */ -- bch2_trans_unlock(trans); -- - if (flags & BCH_READ_NODECODE) { - /* - * can happen if we retry, and the extent we were going to read -@@ -942,7 +988,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - bounce = true; - } - -- if (orig->opts.promote_target)// || failed) -+ if (orig->opts.promote_target || have_io_error(failed)) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full, failed); - -@@ -1061,12 +1107,25 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - trace_and_count(c, read_split, &orig->bio); - } - -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ if (!(flags & BCH_READ_IN_RETRY)) -+ bch2_trans_unlock(trans); -+ else -+ bch2_trans_unlock_long(trans); -+ - if (!rbio->pick.idx) { -- if (!rbio->have_ioref) { -- bch_err_inum_offset_ratelimited(c, -- read_pos.inode, -- read_pos.offset << 9, -- "no device to read from"); -+ if (unlikely(!rbio->have_ioref)) { -+ struct printbuf buf = PRINTBUF; -+ bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); -+ prt_printf(&buf, "no device to read from:\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } -@@ -1104,6 +1163,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - if (likely(!(flags & BCH_READ_IN_RETRY))) { - return 0; - } else { -+ bch2_trans_unlock(trans); -+ - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; -@@ -1164,7 +1225,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - BTREE_ITER_slots); - - while (1) { -- unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); -@@ -1184,9 +1244,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - if (ret) - goto err; - -- offset_into_extent = iter.pos.offset - -+ s64 offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); -- sectors = k.k->size - offset_into_extent; -+ unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - -@@ -1201,9 +1261,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ -- sectors = min(sectors, k.k->size - offset_into_extent); -+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - -- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; -+ unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) -@@ -1229,16 +1289,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - } - - bch2_trans_iter_exit(trans, &iter); -- bch2_trans_put(trans); -- bch2_bkey_buf_exit(&sk, c); - - if (ret) { -- bch_err_inum_offset_ratelimited(c, inum.inum, -- bvec_iter.bi_sector << 9, -- "read error %i from btree lookup", ret); -+ struct printbuf buf = PRINTBUF; -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); -+ prt_printf(&buf, "read error %i from btree lookup", ret); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); - } -+ -+ bch2_trans_put(trans); -+ bch2_bkey_buf_exit(&sk, c); - } - - void bch2_fs_io_read_exit(struct bch_fs *c) -diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h -index d9c18bb7d403..a82e8a94ccb6 100644 ---- a/fs/bcachefs/io_read.h -+++ b/fs/bcachefs/io_read.h -@@ -3,6 +3,7 @@ - #define _BCACHEFS_IO_READ_H - - #include "bkey_buf.h" -+#include "reflink.h" - - struct bch_read_bio { - struct bch_fs *c; -@@ -79,19 +80,32 @@ struct bch_devs_mask; - struct cache_promote_op; - struct extent_ptr_decoded; - --int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, -- struct bkey_buf *); -- - static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, -- unsigned *offset_into_extent, -- struct bkey_buf *k) -+ s64 *offset_into_extent, -+ struct bkey_buf *extent) - { -- if (k->k->k.type != KEY_TYPE_reflink_p) -+ if (extent->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; -- return __bch2_read_indirect_extent(trans, offset_into_extent, k); -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, -+ offset_into_extent, -+ bkey_i_to_s_c_reflink_p(extent->k), -+ true, 0); -+ int ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (bkey_deleted(k.k)) { -+ bch2_trans_iter_exit(trans, &iter); -+ return -BCH_ERR_missing_indirect_extent; -+ } -+ -+ bch2_bkey_buf_reassemble(extent, trans->c, k); -+ bch2_trans_iter_exit(trans, &iter); -+ return 0; - } - - enum bch_read_flags { -diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c -index 96720adcfee0..03892388832b 100644 ---- a/fs/bcachefs/io_write.c -+++ b/fs/bcachefs/io_write.c -@@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, - - bch2_trans_copy_iter(&iter, extent_iter); - -- for_each_btree_key_upto_continue_norestart(iter, -+ for_each_btree_key_max_continue_norestart(iter, - new->k.p, BTREE_ITER_slots, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), -@@ -216,6 +216,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), -+ BTREE_ITER_intent| - BTREE_ITER_cached); - int ret = bkey_err(k); - if (unlikely(ret)) -@@ -369,7 +370,7 @@ static int bch2_write_index_default(struct bch_write_op *op) - bkey_start_pos(&sk.k->k), - BTREE_ITER_slots|BTREE_ITER_intent); - -- ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: -+ ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: - bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, -@@ -395,6 +396,31 @@ static int bch2_write_index_default(struct bch_write_op *op) - - /* Writes */ - -+static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, -+ u64 offset) -+{ -+ bch2_inum_offset_err_msg(op->c, out, -+ (subvol_inum) { op->subvol, op->pos.inode, }, -+ offset << 9); -+ prt_printf(out, "write error%s: ", -+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); -+} -+ -+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -+{ -+ __bch2_write_op_error(out, op, op->pos.offset); -+} -+ -+static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, -+ struct bch_write_op *op, u64 offset) -+{ -+ bch2_inum_offset_err_msg_trans(trans, out, -+ (subvol_inum) { op->subvol, op->pos.inode, }, -+ offset << 9); -+ prt_printf(out, "write error%s: ", -+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); -+} -+ - void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - enum bch_data_type type, - const struct bkey_i *k, -@@ -531,14 +557,14 @@ static void __bch2_write_index(struct bch_write_op *op) - - op->written += sectors_start - keylist_sectors(keys); - -- if (ret && !bch2_err_matches(ret, EROFS)) { -+ if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - -- bch_err_inum_offset_ratelimited(c, -- insert->k.p.inode, insert->k.p.offset << 9, -- "%s write error while doing btree update: %s", -- op->flags & BCH_WRITE_MOVE ? "move" : "user", -- bch2_err_str(ret)); -+ struct printbuf buf = PRINTBUF; -+ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); -+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); - } - - if (ret) -@@ -621,9 +647,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) - - while (1) { - spin_lock_irq(&wp->writes_lock); -- op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); -- if (op) -- list_del(&op->wp_list); -+ op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); - wp_update_state(wp, op != NULL); - spin_unlock_irq(&wp->writes_lock); - -@@ -859,7 +883,7 @@ static enum prep_encoded_ret { - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; - -- if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) -+ if (bch2_bio_uncompress_inplace(op, bio)) - return PREP_ENCODED_ERR; - } - -@@ -1080,11 +1104,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - *_dst = dst; - return more; - csum_err: -- bch_err_inum_offset_ratelimited(c, -- op->pos.inode, -- op->pos.offset << 9, -- "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", -- op->flags & BCH_WRITE_MOVE ? "move" : "user"); -+ { -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error(&buf, op); -+ prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ - ret = -EIO; - err: - if (to_wbio(dst)->bounce) -@@ -1165,7 +1192,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) - struct btree_trans *trans = bch2_trans_get(c); - - for_each_keylist_key(&op->insert_keys, orig) { -- int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, -+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -@@ -1175,11 +1202,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - -- bch_err_inum_offset_ratelimited(c, -- insert->k.p.inode, insert->k.p.offset << 9, -- "%s write error while doing btree update: %s", -- op->flags & BCH_WRITE_MOVE ? "move" : "user", -- bch2_err_str(ret)); -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); -+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); - } - - if (ret) { -@@ -1339,17 +1366,19 @@ static void bch2_nocow_write(struct bch_write_op *op) - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - -+ bch2_trans_put(trans); -+ darray_exit(&buckets); -+ - if (ret) { -- bch_err_inum_offset_ratelimited(c, -- op->pos.inode, op->pos.offset << 9, -- "%s: btree lookup error %s", __func__, bch2_err_str(ret)); -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error(&buf, op); -+ prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); - op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; - } - -- bch2_trans_put(trans); -- darray_exit(&buckets); -- - /* fallback to cow write path? */ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { - closure_sync(&op->cl); -@@ -1462,14 +1491,14 @@ static void __bch2_write(struct bch_write_op *op) - if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; - -- if (ret < 0) { -- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) -- bch_err_inum_offset_ratelimited(c, -- op->pos.inode, -- op->pos.offset << 9, -- "%s(): %s error: %s", __func__, -- op->flags & BCH_WRITE_MOVE ? "move" : "user", -- bch2_err_str(ret)); -+ if (unlikely(ret < 0)) { -+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error(&buf, op); -+ prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } - op->error = ret; - break; - } -@@ -1595,12 +1624,11 @@ CLOSURE_CALLBACK(bch2_write) - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - -- if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { -- bch_err_inum_offset_ratelimited(c, -- op->pos.inode, -- op->pos.offset << 9, -- "%s write error: misaligned write", -- op->flags & BCH_WRITE_MOVE ? "move" : "user"); -+ if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { -+ struct printbuf buf = PRINTBUF; -+ bch2_write_op_error(&buf, op); -+ prt_printf(&buf, "misaligned write"); -+ printbuf_exit(&buf); - op->error = -EIO; - goto err; - } -diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h -index 5400ce94ee57..b4626013abc8 100644 ---- a/fs/bcachefs/io_write.h -+++ b/fs/bcachefs/io_write.h -@@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); -+ - #define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -index 2dc0d60c1745..05b1250619ec 100644 ---- a/fs/bcachefs/journal.c -+++ b/fs/bcachefs/journal.c -@@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) - - static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) - { -- unsigned i; -- -- for (i = 0; i < ARRAY_SIZE(p->list); i++) -- INIT_LIST_HEAD(&p->list[i]); -- INIT_LIST_HEAD(&p->flushed); -+ for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) -+ INIT_LIST_HEAD(&p->unflushed[i]); -+ for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) -+ INIT_LIST_HEAD(&p->flushed[i]); - atomic_set(&p->count, count); - p->devs.nr = 0; - } -@@ -217,6 +216,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) - if (__bch2_journal_pin_put(j, seq)) - bch2_journal_reclaim_fast(j); - bch2_journal_do_writes(j); -+ -+ /* -+ * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an -+ * open journal entry -+ */ -+ wake_up(&j->wait); - } - - /* -@@ -251,6 +256,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t - if (!__journal_entry_is_open(old)) - return; - -+ if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) -+ old.cur_entry_offset = j->cur_entry_offset_if_blocked; -+ - /* Close out old buffer: */ - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - -@@ -311,6 +319,16 @@ void bch2_journal_halt(struct journal *j) - spin_unlock(&j->lock); - } - -+void bch2_journal_halt_locked(struct journal *j) -+{ -+ lockdep_assert_held(&j->lock); -+ -+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); -+ if (!j->err_seq) -+ j->err_seq = journal_cur_seq(j); -+ journal_wake(j); -+} -+ - static bool journal_entry_want_write(struct journal *j) - { - bool ret = !journal_entry_is_open(j) || -@@ -373,6 +391,13 @@ static int journal_entry_open(struct journal *j) - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return JOURNAL_ERR_max_in_flight; - -+ if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { -+ bch_err(c, "cannot start: journal seq overflow"); -+ if (bch2_fs_emergency_read_only_locked(c)) -+ bch_err(c, "fatal error - emergency read only"); -+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */ -+ } -+ - BUG_ON(!j->cur_entry_sectors); - - buf->expires = -@@ -588,6 +613,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - : -BCH_ERR_journal_res_get_blocked; - } - -+static unsigned max_dev_latency(struct bch_fs *c) -+{ -+ u64 nsecs = 0; -+ -+ for_each_rw_member(c, ca) -+ nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); -+ -+ return nsecs_to_jiffies(nsecs); -+} -+ - /* - * Essentially the entry function to the journaling code. When bcachefs is doing - * a btree insert, it calls this function to get the current journal write. -@@ -599,17 +634,31 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - * btree node write locks. - */ - int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, -- unsigned flags) -+ unsigned flags, -+ struct btree_trans *trans) - { - int ret; - - if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || - (flags & JOURNAL_RES_GET_NONBLOCK), -- HZ * 10)) -+ HZ)) - return ret; - -+ if (trans) -+ bch2_trans_unlock_long(trans); -+ - struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); -+ -+ remaining_wait = max(0, remaining_wait - HZ); -+ -+ if (closure_wait_event_timeout(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || -+ (flags & JOURNAL_RES_GET_NONBLOCK), -+ remaining_wait)) -+ return ret; -+ - struct printbuf buf = PRINTBUF; - bch2_journal_debug_to_text(&buf, j); - bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", -@@ -664,7 +713,7 @@ void bch2_journal_entry_res_resize(struct journal *j, - * @seq: seq to flush - * @parent: closure object to wait with - * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, -- * -EIO if @seq will never be flushed -+ * -BCH_ERR_journal_flush_err if @seq will never be flushed - * - * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if - * necessary -@@ -687,7 +736,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - - /* Recheck under lock: */ - if (j->err_seq && seq >= j->err_seq) { -- ret = -EIO; -+ ret = -BCH_ERR_journal_flush_err; - goto out; - } - -@@ -714,7 +763,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - * livelock: - */ - sched_annotate_sleep(); -- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - -@@ -747,6 +796,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - } - - buf->must_flush = true; -+ j->flushing_seq = max(j->flushing_seq, seq); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); -@@ -794,10 +844,11 @@ int bch2_journal_flush(struct journal *j) - } - - /* -- * bch2_journal_noflush_seq - tell the journal not to issue any flushes before -+ * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the -+ * range [start, end) - * @seq - */ --bool bch2_journal_noflush_seq(struct journal *j, u64 seq) -+bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) - { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 unwritten_seq; -@@ -806,15 +857,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) - if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) - return false; - -- if (seq <= c->journal.flushed_seq_ondisk) -+ if (c->journal.flushed_seq_ondisk >= start) - return false; - - spin_lock(&j->lock); -- if (seq <= c->journal.flushed_seq_ondisk) -+ if (c->journal.flushed_seq_ondisk >= start) - goto out; - - for (unwritten_seq = journal_last_unwritten_seq(j); -- unwritten_seq < seq; -+ unwritten_seq < end; - unwritten_seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - -@@ -831,19 +882,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) - return ret; - } - --int bch2_journal_meta(struct journal *j) -+static int __bch2_journal_meta(struct journal *j) - { -- struct journal_buf *buf; -- struct journal_res res; -- int ret; -- -- memset(&res, 0, sizeof(res)); -- -- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ struct journal_res res = {}; -+ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - -- buf = j->buf + (res.seq & JOURNAL_BUF_MASK); -+ struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); - buf->must_flush = true; - - if (!buf->flush_time) { -@@ -856,27 +902,70 @@ int bch2_journal_meta(struct journal *j) - return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); - } - -+int bch2_journal_meta(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) -+ return -EROFS; -+ -+ int ret = __bch2_journal_meta(j); -+ bch2_write_ref_put(c, BCH_WRITE_REF_journal); -+ return ret; -+} -+ - /* block/unlock the journal: */ - - void bch2_journal_unblock(struct journal *j) - { - spin_lock(&j->lock); -- j->blocked--; -+ if (!--j->blocked && -+ j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && -+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { -+ union journal_res_state old, new; -+ -+ old.v = atomic64_read(&j->reservations.counter); -+ do { -+ new.v = old.v; -+ new.cur_entry_offset = j->cur_entry_offset_if_blocked; -+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); -+ } - spin_unlock(&j->lock); - - journal_wake(j); - } - -+static void __bch2_journal_block(struct journal *j) -+{ -+ if (!j->blocked++) { -+ union journal_res_state old, new; -+ -+ old.v = atomic64_read(&j->reservations.counter); -+ do { -+ j->cur_entry_offset_if_blocked = old.cur_entry_offset; -+ -+ if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) -+ break; -+ -+ new.v = old.v; -+ new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; -+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); -+ -+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ } -+} -+ - void bch2_journal_block(struct journal *j) - { - spin_lock(&j->lock); -- j->blocked++; -+ __bch2_journal_block(j); - spin_unlock(&j->lock); - - journal_quiesce(j); - } - --static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) -+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, -+ u64 max_seq, bool *blocked) - { - struct journal_buf *ret = NULL; - -@@ -893,13 +982,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou - struct journal_buf *buf = j->buf + idx; - - if (buf->need_flush_to_write_buffer) { -- if (seq == journal_cur_seq(j)) -- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -- - union journal_res_state s; - s.v = atomic64_read_acquire(&j->reservations.counter); - -- ret = journal_state_count(s, idx) -+ unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); -+ -+ if (open && !*blocked) { -+ __bch2_journal_block(j); -+ *blocked = true; -+ } -+ -+ ret = journal_state_count(s, idx) > open - ? ERR_PTR(-EAGAIN) - : buf; - break; -@@ -912,18 +1005,24 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou - return ret; - } - --struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) -+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, -+ u64 max_seq, bool *blocked) - { - struct journal_buf *ret; -+ *blocked = false; -+ -+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, -+ max_seq, blocked)) != ERR_PTR(-EAGAIN)); -+ if (IS_ERR_OR_NULL(ret) && *blocked) -+ bch2_journal_unblock(j); - -- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); - return ret; - } - - /* allocate journal on a device: */ - --static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, -- bool new_fs, struct closure *cl) -+static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, -+ bool new_fs, struct closure *cl) - { - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; -@@ -945,19 +1044,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - } - - for (nr_got = 0; nr_got < nr_want; nr_got++) { -- if (new_fs) { -- bu[nr_got] = bch2_bucket_alloc_new_fs(ca); -- if (bu[nr_got] < 0) { -- ret = -BCH_ERR_ENOSPC_bucket_alloc; -- break; -- } -- } else { -- ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, -- BCH_DATA_journal, cl); -- ret = PTR_ERR_OR_ZERO(ob[nr_got]); -- if (ret) -- break; -+ enum bch_watermark watermark = new_fs -+ ? BCH_WATERMARK_btree -+ : BCH_WATERMARK_normal; - -+ ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, -+ BCH_DATA_journal, cl); -+ ret = PTR_ERR_OR_ZERO(ob[nr_got]); -+ if (ret) -+ break; -+ -+ if (!new_fs) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - ob[nr_got]->bucket, BCH_DATA_journal, -@@ -967,9 +1064,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - bch_err_msg(c, ret, "marking new journal buckets"); - break; - } -- -- bu[nr_got] = ob[nr_got]->bucket; - } -+ -+ bu[nr_got] = ob[nr_got]->bucket; - } - - if (!nr_got) -@@ -1009,8 +1106,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - if (ret) - goto err_unblock; - -- if (!new_fs) -- bch2_write_super(c); -+ bch2_write_super(c); - - /* Commit: */ - if (c) -@@ -1044,9 +1140,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - bu[i], BCH_DATA_free, 0, - BTREE_TRIGGER_transactional)); - err_free: -- if (!new_fs) -- for (i = 0; i < nr_got; i++) -- bch2_open_bucket_put(c, ob[i]); -+ for (i = 0; i < nr_got; i++) -+ bch2_open_bucket_put(c, ob[i]); - - kfree(new_bucket_seq); - kfree(new_buckets); -@@ -1055,26 +1150,20 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - return ret; - } - --/* -- * Allocate more journal space at runtime - not currently making use if it, but -- * the code works: -- */ --int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, -- unsigned nr) -+static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, -+ unsigned nr, bool new_fs) - { - struct journal_device *ja = &ca->journal; -- struct closure cl; - int ret = 0; - -+ struct closure cl; - closure_init_stack(&cl); - -- down_write(&c->state_lock); -- - /* don't handle reducing nr of buckets yet: */ - if (nr < ja->nr) -- goto unlock; -+ return 0; - -- while (ja->nr < nr) { -+ while (!ret && ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0, 0 }; - - /* -@@ -1087,25 +1176,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - * filesystem-wide allocation will succeed, this is a device - * specific allocation - we can hang here: - */ -+ if (!new_fs) { -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ bucket_to_sector(ca, nr - ja->nr), 1, 0); -+ if (ret) -+ break; -+ } - -- ret = bch2_disk_reservation_get(c, &disk_res, -- bucket_to_sector(ca, nr - ja->nr), 1, 0); -- if (ret) -- break; -+ ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - -- ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); -+ if (ret == -BCH_ERR_bucket_alloc_blocked || -+ ret == -BCH_ERR_open_buckets_empty) -+ ret = 0; /* wait and retry */ - - bch2_disk_reservation_put(c, &disk_res); -- - closure_sync(&cl); -- -- if (ret && ret != -BCH_ERR_bucket_alloc_blocked) -- break; - } - -- bch_err_fn(c, ret); --unlock: -+ return ret; -+} -+ -+/* -+ * Allocate more journal space at runtime - not currently making use if it, but -+ * the code works: -+ */ -+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, -+ unsigned nr) -+{ -+ down_write(&c->state_lock); -+ int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); - up_write(&c->state_lock); -+ -+ bch_err_fn(c, ret); - return ret; - } - -@@ -1131,7 +1233,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) - min(1 << 13, - (1 << 24) / ca->mi.bucket_size)); - -- ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); -+ ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); - err: - bch_err_fn(ca, ret); - return ret; -@@ -1193,7 +1295,7 @@ void bch2_fs_journal_stop(struct journal *j) - * Always write a new journal entry, to make sure the clock hands are up - * to date (and match the superblock) - */ -- bch2_journal_meta(j); -+ __bch2_journal_meta(j); - - journal_quiesce(j); - cancel_delayed_work_sync(&j->write_work); -@@ -1217,6 +1319,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) - bool had_entries = false; - u64 last_seq = cur_seq, nr, seq; - -+ if (cur_seq >= JOURNAL_SEQ_MAX) { -+ bch_err(c, "cannot start: journal seq overflow"); -+ return -EINVAL; -+ } -+ - genradix_for_each_reverse(&c->journal_entries, iter, _i) { - i = *_i; - -@@ -1474,6 +1581,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed\n"); - break; -+ case JOURNAL_ENTRY_BLOCKED_VAL: -+ prt_printf(out, "blocked\n"); -+ break; - default: - prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); - break; -@@ -1499,6 +1609,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - printbuf_indent_sub(out, 2); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { -+ if (!ca->mi.durability) -+ continue; -+ - struct journal_device *ja = &ca->journal; - - if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) -@@ -1508,6 +1621,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - continue; - - prt_printf(out, "dev %u:\n", ca->dev_idx); -+ prt_printf(out, "durability %u:\n", ca->mi.durability); - printbuf_indent_add(out, 2); - prt_printf(out, "nr\t%u\n", ja->nr); - prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); -@@ -1519,6 +1633,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - printbuf_indent_sub(out, 2); - } - -+ prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); -+ - rcu_read_unlock(); - - --out->atomic; -@@ -1530,54 +1646,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - __bch2_journal_debug_to_text(out, j); - spin_unlock(&j->lock); - } -- --bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) --{ -- struct journal_entry_pin_list *pin_list; -- struct journal_entry_pin *pin; -- -- spin_lock(&j->lock); -- if (!test_bit(JOURNAL_running, &j->flags)) { -- spin_unlock(&j->lock); -- return true; -- } -- -- *seq = max(*seq, j->pin.front); -- -- if (*seq >= j->pin.back) { -- spin_unlock(&j->lock); -- return true; -- } -- -- out->atomic++; -- -- pin_list = journal_seq_pin(j, *seq); -- -- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); -- printbuf_indent_add(out, 2); -- -- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) -- list_for_each_entry(pin, &pin_list->list[i], list) -- prt_printf(out, "\t%px %ps\n", pin, pin->flush); -- -- if (!list_empty(&pin_list->flushed)) -- prt_printf(out, "flushed:\n"); -- -- list_for_each_entry(pin, &pin_list->flushed, list) -- prt_printf(out, "\t%px %ps\n", pin, pin->flush); -- -- printbuf_indent_sub(out, 2); -- -- --out->atomic; -- spin_unlock(&j->lock); -- -- return false; --} -- --void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) --{ -- u64 seq = 0; -- -- while (!bch2_journal_seq_pins_to_text(out, j, &seq)) -- seq++; --} -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -index 2762be6f9814..107f7f901cd9 100644 ---- a/fs/bcachefs/journal.h -+++ b/fs/bcachefs/journal.h -@@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq - spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq); - spin_unlock(&j->lock); -- } -+ } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) -+ wake_up(&j->wait); - } - - /* -@@ -311,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, - } - - int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, -- unsigned); -+ unsigned, struct btree_trans *); - - /* First bits for BCH_WATERMARK: */ - enum journal_res_flags { -@@ -367,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j, - } - - static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, -- unsigned u64s, unsigned flags) -+ unsigned u64s, unsigned flags, -+ struct btree_trans *trans) - { - int ret; - -@@ -379,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re - if (journal_res_get_fast(j, res, flags)) - goto out; - -- ret = bch2_journal_res_get_slowpath(j, res, flags); -+ ret = bch2_journal_res_get_slowpath(j, res, flags, trans); - if (ret) - return ret; - out: -@@ -403,15 +405,16 @@ void bch2_journal_flush_async(struct journal *, struct closure *); - - int bch2_journal_flush_seq(struct journal *, u64, unsigned); - int bch2_journal_flush(struct journal *); --bool bch2_journal_noflush_seq(struct journal *, u64); -+bool bch2_journal_noflush_seq(struct journal *, u64, u64); - int bch2_journal_meta(struct journal *); - - void bch2_journal_halt(struct journal *); -+void bch2_journal_halt_locked(struct journal *); - - static inline int bch2_journal_error(struct journal *j) - { - return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL -- ? -EIO : 0; -+ ? -BCH_ERR_journal_shutdown : 0; - } - - struct bch_dev; -@@ -424,12 +427,10 @@ static inline void bch2_journal_set_replay_done(struct journal *j) - - void bch2_journal_unblock(struct journal *); - void bch2_journal_block(struct journal *); --struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); -+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); - - void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); - void bch2_journal_debug_to_text(struct printbuf *, struct journal *); --void bch2_journal_pins_to_text(struct printbuf *, struct journal *); --bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); - - int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, - unsigned nr); -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -index fb35dd336331..11c39e0c34f4 100644 ---- a/fs/bcachefs/journal_io.c -+++ b/fs/bcachefs/journal_io.c -@@ -17,6 +17,9 @@ - #include "sb-clean.h" - #include "trace.h" - -+#include -+#include -+ - void bch2_journal_pos_from_member_info_set(struct bch_fs *c) - { - lockdep_assert_held(&c->sb_lock); -@@ -299,7 +302,7 @@ static void journal_entry_err_msg(struct printbuf *out, - journal_entry_err_msg(&_buf, version, jset, entry); \ - prt_printf(&_buf, msg, ##__VA_ARGS__); \ - \ -- switch (flags & BCH_VALIDATE_write) { \ -+ switch (from.flags & BCH_VALIDATE_write) { \ - case READ: \ - mustfix_fsck_err(c, _err, "%s", _buf.buf); \ - break; \ -@@ -325,11 +328,11 @@ static void journal_entry_err_msg(struct printbuf *out, - static int journal_validate_key(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, -- unsigned level, enum btree_id btree_id, - struct bkey_i *k, -- unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from, -+ unsigned version, int big_endian) - { -+ enum bch_validate_flags flags = from.flags; - int write = flags & BCH_VALIDATE_write; - void *next = vstruct_next(entry); - int ret = 0; -@@ -364,11 +367,10 @@ static int journal_validate_key(struct bch_fs *c, - } - - if (!write) -- bch2_bkey_compat(level, btree_id, version, big_endian, -+ bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); - -- ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), -- __btree_node_type(level, btree_id), write); -+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); - if (ret == -BCH_ERR_fsck_delete_bkey) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -@@ -379,7 +381,7 @@ static int journal_validate_key(struct bch_fs *c, - goto fsck_err; - - if (write) -- bch2_bkey_compat(level, btree_id, version, big_endian, -+ bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); - fsck_err: - return ret; -@@ -389,16 +391,15 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_i *k = entry->start; - -+ from.level = entry->level; -+ from.btree = entry->btree_id; -+ - while (k != vstruct_last(entry)) { -- int ret = journal_validate_key(c, jset, entry, -- entry->level, -- entry->btree_id, -- k, version, big_endian, -- flags|BCH_VALIDATE_journal); -+ int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - continue; - else if (ret) -@@ -421,7 +422,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs - bch2_prt_jset_entry_type(out, entry->type); - prt_str(out, ": "); - } -- prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); -+ bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); -+ prt_char(out, ' '); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); - first = false; - } -@@ -431,11 +433,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_i *k = entry->start; - int ret = 0; - -+ from.root = true; -+ from.level = entry->level + 1; -+ from.btree = entry->btree_id; -+ - if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, - c, version, jset, entry, -@@ -452,8 +458,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, - return 0; - } - -- ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -- version, big_endian, flags); -+ ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - ret = 0; - fsck_err: -@@ -470,7 +475,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - /* obsolete, don't care: */ - return 0; -@@ -485,7 +490,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -512,7 +517,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct jset_entry_blacklist_v2 *bl_entry; - int ret = 0; -@@ -554,7 +559,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); -@@ -588,7 +593,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); -@@ -632,7 +637,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); -@@ -665,14 +670,14 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - -- prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); -+ prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); - } - - static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); -@@ -729,7 +734,7 @@ static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - return 0; - } -@@ -738,19 +743,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) - { - struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); -- unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - -- prt_printf(out, "%.*s", bytes, l->d); -+ prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); - } - - static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { -+ from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, -- version, big_endian, READ); -+ version, big_endian, from); - } - - static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, -@@ -763,10 +768,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - return journal_entry_btree_keys_validate(c, jset, entry, -- version, big_endian, READ); -+ version, big_endian, from); - } - - static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, -@@ -779,7 +784,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - unsigned bytes = vstruct_bytes(entry); - unsigned expected = 16; -@@ -809,7 +814,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * - struct jset_entry_ops { - int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); - }; - -@@ -827,11 +832,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, -- version, big_endian, flags) -+ version, big_endian, from) - : 0; - } - -@@ -849,10 +854,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, - static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bch_validate_flags flags) - { -+ struct bkey_validate_context from = { -+ .flags = flags, -+ .from = BKEY_VALIDATE_journal, -+ .journal_seq = le64_to_cpu(jset->seq), -+ }; -+ - unsigned version = le32_to_cpu(jset->version); - int ret = 0; - - vstruct_for_each(jset, entry) { -+ from.journal_offset = (u64 *) entry - jset->_data; -+ - if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), - c, version, jset, entry, - journal_entry_past_jset_end, -@@ -861,8 +874,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - break; - } - -- ret = bch2_journal_entry_validate(c, jset, entry, -- version, JSET_BIG_ENDIAN(jset), flags); -+ ret = bch2_journal_entry_validate(c, jset, entry, version, -+ JSET_BIG_ENDIAN(jset), from); - if (ret) - break; - } -@@ -875,13 +888,17 @@ static int jset_validate(struct bch_fs *c, - struct jset *jset, u64 sector, - enum bch_validate_flags flags) - { -- unsigned version; -+ struct bkey_validate_context from = { -+ .flags = flags, -+ .from = BKEY_VALIDATE_journal, -+ .journal_seq = le64_to_cpu(jset->seq), -+ }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - -- version = le32_to_cpu(jset->version); -+ unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, -@@ -926,15 +943,16 @@ static int jset_validate_early(struct bch_fs *c, - unsigned bucket_sectors_left, - unsigned sectors_read) - { -- size_t bytes = vstruct_bytes(jset); -- unsigned version; -- enum bch_validate_flags flags = BCH_VALIDATE_journal; -+ struct bkey_validate_context from = { -+ .from = BKEY_VALIDATE_journal, -+ .journal_seq = le64_to_cpu(jset->seq), -+ }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - -- version = le32_to_cpu(jset->version); -+ unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, -@@ -947,6 +965,7 @@ static int jset_validate_early(struct bch_fs *c, - return -EINVAL; - } - -+ size_t bytes = vstruct_bytes(jset); - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; -@@ -1231,8 +1250,6 @@ int bch2_journal_read(struct bch_fs *c, - * those entries will be blacklisted: - */ - genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { -- enum bch_validate_flags flags = BCH_VALIDATE_journal; -- - i = *_i; - - if (journal_replay_ignore(i)) -@@ -1252,6 +1269,10 @@ int bch2_journal_read(struct bch_fs *c, - continue; - } - -+ struct bkey_validate_context from = { -+ .from = BKEY_VALIDATE_journal, -+ .journal_seq = le64_to_cpu(i->j.seq), -+ }; - if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, le32_to_cpu(i->j.version), &i->j, NULL, - jset_last_seq_newer_than_seq, -@@ -1411,27 +1432,50 @@ int bch2_journal_read(struct bch_fs *c, - - /* journal write: */ - -+static void journal_advance_devs_to_next_bucket(struct journal *j, -+ struct dev_alloc_list *devs, -+ unsigned sectors, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ -+ darray_for_each(*devs, i) { -+ struct bch_dev *ca = rcu_dereference(c->devs[*i]); -+ if (!ca) -+ continue; -+ -+ struct journal_device *ja = &ca->journal; -+ -+ if (sectors > ja->sectors_free && -+ sectors <= ca->mi.bucket_size && -+ bch2_journal_dev_buckets_available(j, ja, -+ journal_space_discarded)) { -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ ja->sectors_free = ca->mi.bucket_size; -+ -+ /* -+ * ja->bucket_seq[ja->cur_idx] must always have -+ * something sensible: -+ */ -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); -+ } -+ } -+} -+ - static void __journal_write_alloc(struct journal *j, - struct journal_buf *w, -- struct dev_alloc_list *devs_sorted, -+ struct dev_alloc_list *devs, - unsigned sectors, - unsigned *replicas, - unsigned replicas_want) - { - struct bch_fs *c = container_of(j, struct bch_fs, journal); -- struct journal_device *ja; -- struct bch_dev *ca; -- unsigned i; - -- if (*replicas >= replicas_want) -- return; -- -- for (i = 0; i < devs_sorted->nr; i++) { -- ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); -+ darray_for_each(*devs, i) { -+ struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - -- ja = &ca->journal; -+ struct journal_device *ja = &ca->journal; - - /* - * Check that we can use this device, and aren't already using -@@ -1477,65 +1521,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) - { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_devs_mask devs; -- struct journal_device *ja; -- struct bch_dev *ca; - struct dev_alloc_list devs_sorted; - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - unsigned target = c->opts.metadata_target ?: - c->opts.foreground_target; -- unsigned i, replicas = 0, replicas_want = -+ unsigned replicas = 0, replicas_want = - READ_ONCE(c->opts.metadata_replicas); - unsigned replicas_need = min_t(unsigned, replicas_want, - READ_ONCE(c->opts.metadata_replicas_required)); -+ bool advance_done = false; - - rcu_read_lock(); --retry: -- devs = target_rw_devs(c, BCH_DATA_journal, target); - -- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); -+ /* We might run more than once if we have to stop and do discards: */ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); -+ bkey_for_each_ptr(ptrs, p) { -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); -+ if (ca) -+ replicas += ca->mi.durability; -+ } - -- __journal_write_alloc(j, w, &devs_sorted, -- sectors, &replicas, replicas_want); -+retry_target: -+ devs = target_rw_devs(c, BCH_DATA_journal, target); -+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); -+retry_alloc: -+ __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); - -- if (replicas >= replicas_want) -+ if (likely(replicas >= replicas_want)) - goto done; - -- for (i = 0; i < devs_sorted.nr; i++) { -- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -- if (!ca) -- continue; -- -- ja = &ca->journal; -- -- if (sectors > ja->sectors_free && -- sectors <= ca->mi.bucket_size && -- bch2_journal_dev_buckets_available(j, ja, -- journal_space_discarded)) { -- ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -- ja->sectors_free = ca->mi.bucket_size; -- -- /* -- * ja->bucket_seq[ja->cur_idx] must always have -- * something sensible: -- */ -- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -- } -+ if (!advance_done) { -+ journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); -+ advance_done = true; -+ goto retry_alloc; - } - -- __journal_write_alloc(j, w, &devs_sorted, -- sectors, &replicas, replicas_want); -- - if (replicas < replicas_want && target) { - /* Retry from all devices: */ - target = 0; -- goto retry; -+ advance_done = false; -+ goto retry_target; - } - done: - rcu_read_unlock(); - - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - -- return replicas >= replicas_need ? 0 : -EROFS; -+ return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; - } - - static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -@@ -1732,6 +1764,7 @@ static CLOSURE_CALLBACK(journal_write_submit) - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; -+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); - - BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); - ca->prev_journal_sector = bio->bi_iter.bi_sector; -@@ -2023,19 +2056,21 @@ CLOSURE_CALLBACK(bch2_journal_write) - bch2_journal_do_discards(j); - } - -- if (ret) { -+ if (ret && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - -- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), -+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), -+ vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); -- goto err; - } -+ if (ret) -+ goto err; - - /* - * write is allocated, no longer need to account for it in -diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h -index 2ca9cde30ea8..12b39fcb4424 100644 ---- a/fs/bcachefs/journal_io.h -+++ b/fs/bcachefs/journal_io.h -@@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - - int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, - struct jset_entry *); - -diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -index ace291f175dd..d373cd181a7f 100644 ---- a/fs/bcachefs/journal_reclaim.c -+++ b/fs/bcachefs/journal_reclaim.c -@@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, - struct journal_device *ja, - enum journal_space_from from) - { -+ if (!ja->nr) -+ return 0; -+ - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; - -@@ -137,14 +140,18 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned pos, nr_devs = 0; - struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; -+ unsigned min_bucket_size = U32_MAX; - - BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { -- if (!ca->journal.nr) -+ if (!ca->journal.nr || -+ !ca->mi.durability) - continue; - -+ min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); -+ - space = journal_dev_space_available(j, ca, from); - if (!space.next_entry) - continue; -@@ -164,7 +171,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne - * We sorted largest to smallest, and we want the smallest out of the - * @nr_devs_want largest devices: - */ -- return dev_space[nr_devs_want - 1]; -+ space = dev_space[nr_devs_want - 1]; -+ space.next_entry = min(space.next_entry, min_bucket_size); -+ return space; - } - - void bch2_journal_space_available(struct journal *j) -@@ -318,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) - popped = true; - } - -- if (popped) -+ if (popped) { - bch2_journal_space_available(j); -+ __closure_wake_up(&j->reclaim_flush_wait); -+ } - } - - bool __bch2_journal_pin_put(struct journal *j, u64 seq) -@@ -353,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, - pin->seq = 0; - list_del_init(&pin->list); - -+ if (j->reclaim_flush_wait.list.first) -+ __closure_wake_up(&j->reclaim_flush_wait); -+ - /* - * Unpinning a journal entry may make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: -@@ -370,15 +384,19 @@ void bch2_journal_pin_drop(struct journal *j, - spin_unlock(&j->lock); - } - --static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) -+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, -+ journal_pin_flush_fn fn) - { - if (fn == bch2_btree_node_flush0 || -- fn == bch2_btree_node_flush1) -- return JOURNAL_PIN_btree; -- else if (fn == bch2_btree_key_cache_journal_flush) -- return JOURNAL_PIN_key_cache; -+ fn == bch2_btree_node_flush1) { -+ unsigned idx = fn == bch2_btree_node_flush1; -+ struct btree *b = container_of(pin, struct btree, writes[idx].journal); -+ -+ return JOURNAL_PIN_TYPE_btree0 - b->c.level; -+ } else if (fn == bch2_btree_key_cache_journal_flush) -+ return JOURNAL_PIN_TYPE_key_cache; - else -- return JOURNAL_PIN_other; -+ return JOURNAL_PIN_TYPE_other; - } - - static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, -@@ -397,7 +415,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; -- list_add(&pin->list, &pin_list->list[type]); -+ -+ if (list_empty(&pin_list->unflushed[type]) && -+ j->reclaim_flush_wait.list.first) -+ __closure_wake_up(&j->reclaim_flush_wait); -+ -+ list_add(&pin->list, &pin_list->unflushed[type]); - } - - void bch2_journal_pin_copy(struct journal *j, -@@ -422,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j, - - bool reclaim = __journal_pin_drop(j, dst); - -- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); -+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); -@@ -446,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, - - bool reclaim = __journal_pin_drop(j, pin); - -- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); -+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); -@@ -490,16 +513,15 @@ journal_get_next_pin(struct journal *j, - { - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; -- unsigned i; - - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > seq_to_flush && !allowed_above_seq) - break; - -- for (i = 0; i < JOURNAL_PIN_NR; i++) -- if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || -- ((1U << i) & allowed_above_seq)) { -- ret = list_first_entry_or_null(&pin_list->list[i], -+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) -+ if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || -+ (BIT(i) & allowed_above_seq)) { -+ ret = list_first_entry_or_null(&pin_list->unflushed[i], - struct journal_entry_pin, list); - if (ret) - return ret; -@@ -535,8 +557,8 @@ static size_t journal_flush_pins(struct journal *j, - } - - if (min_key_cache) { -- allowed_above |= 1U << JOURNAL_PIN_key_cache; -- allowed_below |= 1U << JOURNAL_PIN_key_cache; -+ allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); -+ allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); - } - - cond_resched(); -@@ -544,7 +566,9 @@ static size_t journal_flush_pins(struct journal *j, - j->last_flushed = jiffies; - - spin_lock(&j->lock); -- pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); -+ pin = journal_get_next_pin(j, seq_to_flush, -+ allowed_below, -+ allowed_above, &seq); - if (pin) { - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; -@@ -567,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, - spin_lock(&j->lock); - /* Pin might have been dropped or rearmed: */ - if (likely(!err && !j->flush_in_progress_dropped)) -- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); -+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); - j->flush_in_progress = NULL; - j->flush_in_progress_dropped = false; - spin_unlock(&j->lock); -@@ -758,10 +782,12 @@ static int bch2_journal_reclaim_thread(void *arg) - journal_empty = fifo_empty(&j->pin); - spin_unlock(&j->lock); - -+ long timeout = j->next_reclaim - jiffies; -+ - if (journal_empty) - schedule(); -- else if (time_after(j->next_reclaim, jiffies)) -- schedule_timeout(j->next_reclaim - jiffies); -+ else if (timeout > 0) -+ schedule_timeout(timeout); - else - break; - } -@@ -805,10 +831,41 @@ int bch2_journal_reclaim_start(struct journal *j) - return 0; - } - -+static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, -+ unsigned types) -+{ -+ struct journal_entry_pin_list *pin_list; -+ u64 seq; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { -+ if (seq > seq_to_flush) -+ break; -+ -+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) -+ if ((BIT(i) & types) && -+ (!list_empty(&pin_list->unflushed[i]) || -+ !list_empty(&pin_list->flushed[i]))) { -+ spin_unlock(&j->lock); -+ return true; -+ } -+ } -+ spin_unlock(&j->lock); -+ -+ return false; -+} -+ -+static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, -+ unsigned types) -+{ -+ return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || -+ journal_pins_still_flushing(j, seq_to_flush, types); -+} -+ - static int journal_flush_done(struct journal *j, u64 seq_to_flush, - bool *did_work) - { -- int ret; -+ int ret = 0; - - ret = bch2_journal_error(j); - if (ret) -@@ -816,12 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, - - mutex_lock(&j->reclaim_lock); - -- if (journal_flush_pins(j, seq_to_flush, -- (1U << JOURNAL_PIN_key_cache)| -- (1U << JOURNAL_PIN_other), 0, 0, 0) || -- journal_flush_pins(j, seq_to_flush, -- (1U << JOURNAL_PIN_btree), 0, 0, 0)) -- *did_work = true; -+ for (int type = JOURNAL_PIN_TYPE_NR - 1; -+ type >= 0; -+ --type) -+ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { -+ *did_work = true; -+ goto unlock; -+ } - - if (seq_to_flush > journal_cur_seq(j)) - bch2_journal_entry_close(j); -@@ -836,6 +894,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, - !fifo_used(&j->pin); - - spin_unlock(&j->lock); -+unlock: - mutex_unlock(&j->reclaim_lock); - - return ret; -@@ -849,7 +908,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) - if (!test_bit(JOURNAL_running, &j->flags)) - return false; - -- closure_wait_event(&j->async_wait, -+ closure_wait_event(&j->reclaim_flush_wait, - journal_flush_done(j, seq_to_flush, &did_work)); - - return did_work; -@@ -915,3 +974,54 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) - - return ret; - } -+ -+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *pin; -+ -+ spin_lock(&j->lock); -+ if (!test_bit(JOURNAL_running, &j->flags)) { -+ spin_unlock(&j->lock); -+ return true; -+ } -+ -+ *seq = max(*seq, j->pin.front); -+ -+ if (*seq >= j->pin.back) { -+ spin_unlock(&j->lock); -+ return true; -+ } -+ -+ out->atomic++; -+ -+ pin_list = journal_seq_pin(j, *seq); -+ -+ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "unflushed:\n"); -+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) -+ list_for_each_entry(pin, &pin_list->unflushed[i], list) -+ prt_printf(out, "\t%px %ps\n", pin, pin->flush); -+ -+ prt_printf(out, "flushed:\n"); -+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) -+ list_for_each_entry(pin, &pin_list->flushed[i], list) -+ prt_printf(out, "\t%px %ps\n", pin, pin->flush); -+ -+ printbuf_indent_sub(out, 2); -+ -+ --out->atomic; -+ spin_unlock(&j->lock); -+ -+ return false; -+} -+ -+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -+{ -+ u64 seq = 0; -+ -+ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) -+ seq++; -+} -diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h -index ec84c3345281..0a73d7134e1c 100644 ---- a/fs/bcachefs/journal_reclaim.h -+++ b/fs/bcachefs/journal_reclaim.h -@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) - - int bch2_journal_flush_device_pins(struct journal *, int); - -+void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); -+ - #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -index 19183fcf7ad7..1ef3a28ed6ab 100644 ---- a/fs/bcachefs/journal_types.h -+++ b/fs/bcachefs/journal_types.h -@@ -9,6 +9,9 @@ - #include "super_types.h" - #include "fifo.h" - -+/* btree write buffer steals 8 bits for its own purposes: */ -+#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) -+ - #define JOURNAL_BUF_BITS 2 - #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) - #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) -@@ -50,15 +53,18 @@ struct journal_buf { - */ - - enum journal_pin_type { -- JOURNAL_PIN_btree, -- JOURNAL_PIN_key_cache, -- JOURNAL_PIN_other, -- JOURNAL_PIN_NR, -+ JOURNAL_PIN_TYPE_btree3, -+ JOURNAL_PIN_TYPE_btree2, -+ JOURNAL_PIN_TYPE_btree1, -+ JOURNAL_PIN_TYPE_btree0, -+ JOURNAL_PIN_TYPE_key_cache, -+ JOURNAL_PIN_TYPE_other, -+ JOURNAL_PIN_TYPE_NR, - }; - - struct journal_entry_pin_list { -- struct list_head list[JOURNAL_PIN_NR]; -- struct list_head flushed; -+ struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; -+ struct list_head flushed[JOURNAL_PIN_TYPE_NR]; - atomic_t count; - struct bch_devs_list devs; - }; -@@ -112,6 +118,7 @@ union journal_res_state { - */ - #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) - -+#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) - #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) - #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) - -@@ -193,6 +200,7 @@ struct journal { - * insufficient devices: - */ - enum journal_errors cur_entry_error; -+ unsigned cur_entry_offset_if_blocked; - - unsigned buf_size_want; - /* -@@ -221,6 +229,7 @@ struct journal { - /* Used when waiting because the journal was full */ - wait_queue_head_t wait; - struct closure_waitlist async_wait; -+ struct closure_waitlist reclaim_flush_wait; - - struct delayed_work write_work; - struct workqueue_struct *wq; -@@ -231,6 +240,7 @@ struct journal { - /* seq, last_seq from the most recent journal entry successfully written */ - u64 seq_ondisk; - u64 flushed_seq_ondisk; -+ u64 flushing_seq; - u64 last_seq_ondisk; - u64 err_seq; - u64 last_empty_seq; -diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c -index 60e00702d1a4..75f27ec26f85 100644 ---- a/fs/bcachefs/logged_ops.c -+++ b/fs/bcachefs/logged_ops.c -@@ -63,8 +63,10 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, - int bch2_resume_logged_ops(struct bch_fs *c) - { - int ret = bch2_trans_run(c, -- for_each_btree_key(trans, iter, -- BTREE_ID_logged_ops, POS_MIN, -+ for_each_btree_key_max(trans, iter, -+ BTREE_ID_logged_ops, -+ POS(LOGGED_OPS_INUM_logged_ops, 0), -+ POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), - BTREE_ITER_prefetch, k, - resume_logged_op(trans, &iter, k))); - bch_err_fn(c, ret); -@@ -74,9 +76,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) - static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) - { - struct btree_iter iter; -- int ret; -- -- ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); -+ int ret = bch2_bkey_get_empty_slot(trans, &iter, -+ BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); - if (ret) - return ret; - -diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h -index 6a4bf7129dba..cfb67c95d4c8 100644 ---- a/fs/bcachefs/logged_ops_format.h -+++ b/fs/bcachefs/logged_ops_format.h -@@ -2,6 +2,11 @@ - #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H - #define _BCACHEFS_LOGGED_OPS_FORMAT_H - -+enum logged_ops_inums { -+ LOGGED_OPS_INUM_logged_ops, -+ LOGGED_OPS_INUM_inode_cursors, -+}; -+ - struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; -diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c -index 10857eccdeaf..ce794d55818f 100644 ---- a/fs/bcachefs/lru.c -+++ b/fs/bcachefs/lru.c -@@ -12,7 +12,7 @@ - - /* KEY_TYPE_lru is obsolete: */ - int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -192,7 +192,7 @@ int bch2_check_lrus(struct bch_fs *c) - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, -- NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_lru_key(trans, &iter, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); -diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h -index e6a7d8241bb8..f31a6cf1514c 100644 ---- a/fs/bcachefs/lru.h -+++ b/fs/bcachefs/lru.h -@@ -33,7 +33,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) - return BCH_LRU_read; - } - --int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); - void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - void bch2_lru_pos_to_text(struct printbuf *, struct bpos); -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -index 0ef4a86850bb..953d1f2c5c39 100644 ---- a/fs/bcachefs/move.c -+++ b/fs/bcachefs/move.c -@@ -21,6 +21,8 @@ - #include "journal_reclaim.h" - #include "keylist.h" - #include "move.h" -+#include "rebalance.h" -+#include "reflink.h" - #include "replicas.h" - #include "snapshot.h" - #include "super-io.h" -@@ -196,6 +198,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) - list_del(&ctxt->list); - mutex_unlock(&c->moving_context_lock); - -+ /* -+ * Generally, releasing a transaction within a transaction restart means -+ * an unhandled transaction restart: but this can happen legitimately -+ * within the move code, e.g. when bch2_move_ratelimit() tells us to -+ * exit before we've retried -+ */ -+ bch2_trans_begin(ctxt->trans); - bch2_trans_put(ctxt->trans); - memset(ctxt, 0, sizeof(*ctxt)); - } -@@ -379,34 +388,42 @@ int bch2_move_extent(struct moving_context *ctxt, - return ret; - } - --struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, -+static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, -+ struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ -+ struct btree_iter *extent_iter, - struct bkey_s_c extent_k) - { - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; -+ struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - -- if (io_opts->cur_inum != extent_k.k->p.inode) { -+ if (extent_k.k->type == KEY_TYPE_reflink_v) -+ goto out; -+ -+ if (io_opts->cur_inum != extent_pos.inode) { - io_opts->d.nr = 0; - -- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), -+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), - BTREE_ITER_all_snapshots, k, ({ -- if (k.k->p.offset != extent_k.k->p.inode) -+ if (k.k->p.offset != extent_pos.inode) - break; - - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; -- BUG_ON(bch2_inode_unpack(k, &inode)); -+ _ret3 = bch2_inode_unpack(k, &inode); -+ if (_ret3) -+ break; - - struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; - bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - - darray_push(&io_opts->d, e); - })); -- io_opts->cur_inum = extent_k.k->p.inode; -+ io_opts->cur_inum = extent_pos.inode; - } - - ret = ret ?: trans_was_restarted(trans, restart_count); -@@ -415,43 +432,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - - if (extent_k.k->p.snapshot) - darray_for_each(io_opts->d, i) -- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) -- return &i->io_opts; -- -- return &io_opts->fs_io_opts; -+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { -+ opts_ret = &i->io_opts; -+ break; -+ } -+out: -+ ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); -+ if (ret) -+ return ERR_PTR(ret); -+ return opts_ret; - } - - int bch2_move_get_io_opts_one(struct btree_trans *trans, - struct bch_io_opts *io_opts, -+ struct btree_iter *extent_iter, - struct bkey_s_c extent_k) - { -- struct btree_iter iter; -- struct bkey_s_c k; -- int ret; -+ struct bch_fs *c = trans->c; -+ -+ *io_opts = bch2_opts_to_inode_opts(c->opts); - - /* reflink btree? */ -- if (!extent_k.k->p.inode) { -- *io_opts = bch2_opts_to_inode_opts(trans->c->opts); -- return 0; -- } -+ if (!extent_k.k->p.inode) -+ goto out; - -- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ struct btree_iter inode_iter; -+ struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); -- ret = bkey_err(k); -+ int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - -- if (!ret && bkey_is_inode(k.k)) { -+ if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; -- bch2_inode_unpack(k, &inode); -- bch2_inode_opts_get(io_opts, trans->c, &inode); -- } else { -- *io_opts = bch2_opts_to_inode_opts(trans->c->opts); -+ bch2_inode_unpack(inode_k, &inode); -+ bch2_inode_opts_get(io_opts, c, &inode); - } -- -- bch2_trans_iter_exit(trans, &iter); -- return 0; -+ bch2_trans_iter_exit(trans, &inode_iter); -+out: -+ return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); - } - - int bch2_move_ratelimit(struct moving_context *ctxt) -@@ -509,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - struct per_snapshot_io_opts snapshot_io_opts; - struct bch_io_opts *io_opts; - struct bkey_buf sk; -- struct btree_iter iter; -+ struct btree_iter iter, reflink_iter = {}; - struct bkey_s_c k; - struct data_update_opts data_opts; -+ /* -+ * If we're moving a single file, also process reflinked data it points -+ * to (this includes propagating changed io_opts from the inode to the -+ * extent): -+ */ -+ bool walk_indirect = start.inode == end.inode; - int ret = 0, ret2; - - per_snapshot_io_opts_init(&snapshot_io_opts, c); -@@ -531,6 +557,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - bch2_ratelimit_reset(ctxt->rate); - - while (!bch2_move_ratelimit(ctxt)) { -+ struct btree_iter *extent_iter = &iter; -+ - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(&iter); -@@ -549,10 +577,37 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - -+ if (walk_indirect && -+ k.k->type == KEY_TYPE_reflink_p && -+ REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ s64 offset_into_extent = 0; -+ -+ bch2_trans_iter_exit(trans, &reflink_iter); -+ k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); -+ ret = bkey_err(k); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+ -+ if (bkey_deleted(k.k)) -+ goto next_nondata; -+ -+ /* -+ * XXX: reflink pointers may point to multiple indirect -+ * extents, so don't advance past the entire reflink -+ * pointer - need to fixup iter->k -+ */ -+ extent_iter = &reflink_iter; -+ offset_into_extent = 0; -+ } -+ - if (!bkey_extent_is_direct_data(k.k)) - goto next_nondata; - -- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); -+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, -+ iter.pos, extent_iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - continue; -@@ -568,7 +623,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - -- ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); -+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); - if (ret2) { - if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) - continue; -@@ -589,6 +644,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - bch2_btree_iter_advance(&iter); - } - -+ bch2_trans_iter_exit(trans, &reflink_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - per_snapshot_io_opts_exit(&snapshot_io_opts); -@@ -654,16 +710,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - struct bch_fs *c = trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -- struct btree_iter iter; -+ struct btree_iter iter = {}, bp_iter = {}; - struct bkey_buf sk; -- struct bch_backpointer bp; -- struct bch_alloc_v4 a_convert; -- const struct bch_alloc_v4 *a; - struct bkey_s_c k; - struct data_update_opts data_opts; -- unsigned dirty_sectors, bucket_size; -- u64 fragmentation; -- struct bpos bp_pos = POS_MIN; -+ unsigned sectors_moved = 0; -+ struct bkey_buf last_flushed; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); -@@ -672,6 +724,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - - trace_bucket_evacuate(c, &bucket); - -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); - bch2_bkey_buf_init(&sk); - - /* -@@ -679,21 +733,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - */ - bch2_trans_begin(trans); - -- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, -- bucket, BTREE_ITER_cached); -- ret = lockrestart_do(trans, -- bkey_err(k = bch2_btree_iter_peek_slot(&iter))); -- bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, -+ bucket_pos_to_bp_start(ca, bucket), 0); - - bch_err_msg(c, ret, "looking up alloc key"); - if (ret) - goto err; - -- a = bch2_alloc_to_v4(k, &a_convert); -- dirty_sectors = bch2_bucket_sectors_dirty(*a); -- bucket_size = ca->mi.bucket_size; -- fragmentation = alloc_lru_idx_fragmentation(*a, ca); -- - ret = bch2_btree_write_buffer_tryflush(trans); - bch_err_msg(c, ret, "flushing btree write buffer"); - if (ret) -@@ -705,18 +751,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - - bch2_trans_begin(trans); - -- ret = bch2_get_next_backpointer(trans, ca, bucket, gen, -- &bp_pos, &bp, -- BTREE_ITER_cached); -+ k = bch2_btree_iter_peek(&bp_iter); -+ ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; -- if (bkey_eq(bp_pos, POS_MAX)) -+ -+ if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) - break; - -- if (!bp.level) { -- k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); -+ if (k.k->type != KEY_TYPE_backpointer) -+ goto next; -+ -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -+ -+ if (!bp.v->level) { -+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; -@@ -728,7 +779,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - -- ret = bch2_move_get_io_opts_one(trans, &io_opts, k); -+ ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - continue; -@@ -738,14 +789,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - data_opts.target = io_opts.background_target; - data_opts.rewrite_ptrs = 0; - -+ unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - unsigned i = 0; -- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { -- if (ptr->dev == bucket.inode) { -- data_opts.rewrite_ptrs |= 1U << i; -- if (ptr->cached) { -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { -+ if (p.ptr.dev == bucket.inode) { -+ if (p.ptr.cached) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } -+ data_opts.rewrite_ptrs |= 1U << i; -+ break; - } - i++; - } -@@ -765,14 +820,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - goto err; - - if (ctxt->stats) -- atomic64_add(k.k->size, &ctxt->stats->sectors_seen); -+ atomic64_add(sectors, &ctxt->stats->sectors_seen); -+ sectors_moved += sectors; - } else { - struct btree *b; - -- b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); -+ b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); - ret = PTR_ERR_OR_ZERO(b); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) -- continue; -+ goto next; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) -@@ -796,15 +852,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - atomic64_add(sectors, &ctxt->stats->sectors_seen); - atomic64_add(sectors, &ctxt->stats->sectors_moved); - } -+ sectors_moved += btree_sectors(c); - } - next: -- bp_pos = bpos_nosnap_successor(bp_pos); -+ bch2_btree_iter_advance(&bp_iter); - } - -- trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); -+ trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); - err: -+ bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); - bch2_bkey_buf_exit(&sk, c); -+ bch2_bkey_buf_exit(&last_flushed, c); - return ret; - } - -diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h -index 9baf3093a678..51e0505a8156 100644 ---- a/fs/bcachefs/move.h -+++ b/fs/bcachefs/move.h -@@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt - darray_exit(&io_opts->d); - } - --struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, -- struct per_snapshot_io_opts *, struct bkey_s_c); --int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); -+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, -+ struct btree_iter *, struct bkey_s_c); - - int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); - -diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -index d658be90f737..6718dc37c5a3 100644 ---- a/fs/bcachefs/movinggc.c -+++ b/fs/bcachefs/movinggc.c -@@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, - struct move_bucket *b, u64 time) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bkey_s_c k; -- struct bch_alloc_v4 _a; -- const struct bch_alloc_v4 *a; -- int ret; - -- if (bch2_bucket_is_open(trans->c, -- b->k.bucket.inode, -- b->k.bucket.offset)) -+ if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) - return 0; - -- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, -- b->k.bucket, BTREE_ITER_cached); -- ret = bkey_err(k); -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, -+ b->k.bucket, BTREE_ITER_cached); -+ int ret = bkey_err(k); - if (ret) - return ret; - -@@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, - if (!ca) - goto out; - -- a = bch2_alloc_to_v4(k, &_a); -+ if (ca->mi.state != BCH_MEMBER_STATE_rw || -+ !bch2_dev_is_online(ca)) -+ goto out_put; -+ -+ struct bch_alloc_v4 _a; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - b->k.gen = a->gen; - b->sectors = bch2_bucket_sectors_dirty(*a); - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - - ret = lru_idx && lru_idx <= time; -- -+out_put: - bch2_dev_put(ca); - out: - bch2_trans_iter_exit(trans, &iter); -@@ -167,7 +166,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, - - bch2_trans_begin(trans); - -- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, -+ ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), - lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), - 0, k, ({ -@@ -215,7 +214,8 @@ static int bch2_copygc(struct moving_context *ctxt, - }; - move_buckets buckets = { 0 }; - struct move_bucket_in_flight *f; -- u64 moved = atomic64_read(&ctxt->stats->sectors_moved); -+ u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); -+ u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); - int ret = 0; - - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); -@@ -245,7 +245,6 @@ static int bch2_copygc(struct moving_context *ctxt, - *did_work = true; - } - err: -- darray_exit(&buckets); - - /* no entries in LRU btree found, or got to end: */ - if (bch2_err_matches(ret, ENOENT)) -@@ -254,8 +253,11 @@ static int bch2_copygc(struct moving_context *ctxt, - if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "from bch2_move_data()"); - -- moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; -- trace_and_count(c, copygc, c, moved, 0, 0, 0); -+ sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; -+ sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; -+ trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); -+ -+ darray_exit(&buckets); - return ret; - } - -@@ -350,9 +352,9 @@ static int bch2_copygc_thread(void *arg) - bch2_trans_unlock_long(ctxt.trans); - cond_resched(); - -- if (!c->copy_gc_enabled) { -+ if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, buckets, true); -- kthread_wait_freezable(c->copy_gc_enabled || -+ kthread_wait_freezable(c->opts.copygc_enabled || - kthread_should_stop()); - } - -diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c -index 0e2ee262fbd4..6772faf385a5 100644 ---- a/fs/bcachefs/opts.c -+++ b/fs/bcachefs/opts.c -@@ -1,6 +1,7 @@ - // SPDX-License-Identifier: GPL-2.0 - - #include -+#include - - #include "bcachefs.h" - #include "compress.h" -@@ -48,12 +49,12 @@ static const char * const __bch2_csum_types[] = { - NULL - }; - --const char * const bch2_csum_opts[] = { -+const char * const __bch2_csum_opts[] = { - BCH_CSUM_OPTS() - NULL - }; - --static const char * const __bch2_compression_types[] = { -+const char * const __bch2_compression_types[] = { - BCH_COMPRESSION_TYPES() - NULL - }; -@@ -113,6 +114,7 @@ void bch2_prt_##name(struct printbuf *out, type t) \ - PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); - PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); - PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); -+PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); - PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); - PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); - PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); -@@ -333,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c, - switch (opt->type) { - case BCH_OPT_BOOL: - if (val) { -- ret = kstrtou64(val, 10, res); -+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); -+ if (ret != -BCH_ERR_option_not_bool) { -+ *res = ret; -+ } else { -+ if (err) -+ prt_printf(err, "%s: must be bool", opt->attr.name); -+ return ret; -+ } - } else { -- ret = 0; - *res = 1; - } - -- if (ret < 0 || (*res != 0 && *res != 1)) { -- if (err) -- prt_printf(err, "%s: must be bool", opt->attr.name); -- return ret < 0 ? ret : -BCH_ERR_option_not_bool; -- } - break; - case BCH_OPT_UINT: - if (!val) { -@@ -710,11 +713,14 @@ void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, - - struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) - { -- return (struct bch_io_opts) { -+ struct bch_io_opts opts = { - #define x(_name, _bits) ._name = src._name, - BCH_INODE_OPTS() - #undef x - }; -+ -+ bch2_io_opts_fixups(&opts); -+ return opts; - } - - bool bch2_opt_is_inode_opt(enum bch_opt_id id) -diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -index 23dda014e331..9d397fc2a1f0 100644 ---- a/fs/bcachefs/opts.h -+++ b/fs/bcachefs/opts.h -@@ -16,7 +16,8 @@ extern const char * const bch2_version_upgrade_opts[]; - extern const char * const bch2_sb_features[]; - extern const char * const bch2_sb_compat[]; - extern const char * const __bch2_btree_ids[]; --extern const char * const bch2_csum_opts[]; -+extern const char * const __bch2_csum_opts[]; -+extern const char * const __bch2_compression_types[]; - extern const char * const bch2_compression_opts[]; - extern const char * const __bch2_str_hash_types[]; - extern const char * const bch2_str_hash_opts[]; -@@ -27,6 +28,7 @@ extern const char * const bch2_d_types[]; - void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); - void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); - void bch2_prt_data_type(struct printbuf *, enum bch_data_type); -+void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); - void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); - void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); - void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); -@@ -171,12 +173,12 @@ enum fsck_err_opts { - "size", "Maximum size of checksummed/compressed extents")\ - x(metadata_checksum, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -- OPT_STR(bch2_csum_opts), \ -+ OPT_STR(__bch2_csum_opts), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(data_checksum, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -- OPT_STR(bch2_csum_opts), \ -+ OPT_STR(__bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(compression, u8, \ -@@ -220,14 +222,14 @@ enum fsck_err_opts { - BCH_SB_ERASURE_CODE, false, \ - NULL, "Enable erasure coding (DO NOT USE YET)") \ - x(inodes_32bit, u8, \ -- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_INODE_32BIT, true, \ - NULL, "Constrain inode numbers to 32 bits") \ -- x(shard_inode_numbers, u8, \ -- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -- OPT_BOOL(), \ -- BCH_SB_SHARD_INUMS, true, \ -+ x(shard_inode_numbers_bits, u8, \ -+ OPT_FS|OPT_FORMAT, \ -+ OPT_UINT(0, 8), \ -+ BCH_SB_SHARD_INUMS_NBITS, 0, \ - NULL, "Shard new inode numbers by CPU id") \ - x(inodes_use_key_cache, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -@@ -473,6 +475,18 @@ enum fsck_err_opts { - BCH2_NO_SB_OPT, true, \ - NULL, "Enable nocow mode: enables runtime locking in\n"\ - "data move path needed if nocow will ever be in use\n")\ -+ x(copygc_enabled, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Enable copygc: disable for debugging, or to\n"\ -+ "quiet the system when doing performance testing\n")\ -+ x(rebalance_enabled, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Enable rebalance: disable for debugging, or to\n"\ -+ "quiet the system when doing performance testing\n")\ - x(no_data_io, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ -@@ -488,7 +502,7 @@ enum fsck_err_opts { - OPT_DEVICE, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ -- "size", "Size of filesystem on device") \ -+ "size", "Specifies the bucket size; must be greater than the btree node size")\ - x(durability, u8, \ - OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ - OPT_UINT(0, BCH_REPLICAS_MAX), \ -@@ -624,11 +638,22 @@ struct bch_io_opts { - #define x(_name, _bits) u##_bits _name; - BCH_INODE_OPTS() - #undef x -+#define x(_name, _bits) u64 _name##_from_inode:1; -+ BCH_INODE_OPTS() -+#undef x - }; - --static inline unsigned background_compression(struct bch_io_opts opts) -+static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) - { -- return opts.background_compression ?: opts.compression; -+ if (!opts->background_target) -+ opts->background_target = opts->foreground_target; -+ if (!opts->background_compression) -+ opts->background_compression = opts->compression; -+ if (opts->nocow) { -+ opts->compression = opts->background_compression = 0; -+ opts->data_checksum = 0; -+ opts->erasure_code = 0; -+ } - } - - struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h -index 1d570387b77f..d0dd398baa2b 100644 ---- a/fs/bcachefs/printbuf.h -+++ b/fs/bcachefs/printbuf.h -@@ -251,16 +251,23 @@ static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) - printbuf_nul_terminate_reserved(out); - } - -+static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) -+{ -+ buf->pos = 0; -+ buf->allocation_failure = 0; -+ buf->last_newline = 0; -+ buf->last_field = 0; -+ buf->indent = 0; -+ buf->cur_tabstop = 0; -+} -+ - /** - * printbuf_reset - re-use a printbuf without freeing and re-initializing it: - */ - static inline void printbuf_reset(struct printbuf *buf) - { -- buf->pos = 0; -- buf->allocation_failure = 0; -- buf->indent = 0; -+ printbuf_reset_keep_tabstops(buf); - buf->nr_tabstops = 0; -- buf->cur_tabstop = 0; - } - - /** -diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c -index 74f45a8162ad..8b857fc33244 100644 ---- a/fs/bcachefs/quota.c -+++ b/fs/bcachefs/quota.c -@@ -60,7 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { - }; - - int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h -index a62abcc5332a..1551800ff44c 100644 ---- a/fs/bcachefs/quota.h -+++ b/fs/bcachefs/quota.h -@@ -5,10 +5,10 @@ - #include "inode.h" - #include "quota_types.h" - --enum bch_validate_flags; - extern const struct bch_sb_field_ops bch_sb_field_ops_quota; - --int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - #define bch2_bkey_ops_quota ((struct bkey_ops) { \ -diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c -index 40a20192eee8..bef2aa1b8bcd 100644 ---- a/fs/bcachefs/rcu_pending.c -+++ b/fs/bcachefs/rcu_pending.c -@@ -25,21 +25,37 @@ enum rcu_pending_special { - #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) - #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) - --static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) -+#ifdef __KERNEL__ -+typedef unsigned long rcu_gp_poll_state_t; -+ -+static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -+{ -+ return l == r; -+} -+#else -+typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; -+ -+static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -+{ -+ return l.grace_period_id == r.grace_period_id; -+} -+#endif -+ -+static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) - { - return ssp - ? get_state_synchronize_srcu(ssp) - : get_state_synchronize_rcu(); - } - --static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) -+static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) - { - return ssp - ? start_poll_synchronize_srcu(ssp) - : start_poll_synchronize_rcu(); - } - --static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) -+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) - { - return ssp - ? poll_state_synchronize_srcu(ssp, cookie) -@@ -71,13 +87,13 @@ struct rcu_pending_seq { - GENRADIX(struct rcu_head *) objs; - size_t nr; - struct rcu_head **cursor; -- unsigned long seq; -+ rcu_gp_poll_state_t seq; - }; - - struct rcu_pending_list { - struct rcu_head *head; - struct rcu_head *tail; -- unsigned long seq; -+ rcu_gp_poll_state_t seq; - }; - - struct rcu_pending_pcpu { -@@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) - } - - static __always_inline struct rcu_pending_seq * --get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) -+get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) - { - darray_for_each_reverse(p->objs, objs) -- if (objs->seq == seq) -+ if (rcu_gp_poll_cookie_eq(objs->seq, seq)) - return objs; - - if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) -@@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) - } - - static noinline bool --rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, -+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, - struct rcu_head *head, void *ptr, - unsigned long *flags) - { -@@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, - again: - for (struct rcu_pending_list *i = p->lists; - i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { -- if (i->seq == seq) { -+ if (rcu_gp_poll_cookie_eq(i->seq, seq)) { - rcu_pending_list_add(i, head); - return false; - } -@@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, - struct rcu_pending_pcpu *p; - struct rcu_pending_seq *objs; - struct genradix_node *new_node = NULL; -- unsigned long seq, flags; -+ unsigned long flags; - bool start_gp = false; - - BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); -@@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); -- seq = __get_state_synchronize_rcu(pending->srcu); -+ rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); - restart: - if (may_sleep && - unlikely(process_finished_items(pending, p, flags))) -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -index cd6647374353..d0a1f5cd5c2b 100644 ---- a/fs/bcachefs/rebalance.c -+++ b/fs/bcachefs/rebalance.c -@@ -24,6 +24,190 @@ - #include - #include - -+/* bch_extent_rebalance: */ -+ -+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ -+ bkey_extent_entry_for_each(ptrs, entry) -+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) -+ return &entry->rebalance; -+ -+ return NULL; -+} -+ -+static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, -+ struct bch_io_opts *opts, -+ struct bkey_s_c k, -+ struct bkey_ptrs_c ptrs) -+{ -+ if (!opts->background_compression) -+ return 0; -+ -+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned ptr_bit = 1; -+ unsigned rewrite_ptrs = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || -+ p.ptr.unwritten) -+ return 0; -+ -+ if (!p.ptr.cached && p.crc.compression_type != compression_type) -+ rewrite_ptrs |= ptr_bit; -+ ptr_bit <<= 1; -+ } -+ -+ return rewrite_ptrs; -+} -+ -+static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, -+ struct bch_io_opts *opts, -+ struct bkey_ptrs_c ptrs) -+{ -+ if (!opts->background_target || -+ !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) -+ return 0; -+ -+ unsigned ptr_bit = 1; -+ unsigned rewrite_ptrs = 0; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) -+ rewrite_ptrs |= ptr_bit; -+ ptr_bit <<= 1; -+ } -+ -+ return rewrite_ptrs; -+} -+ -+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, -+ struct bch_io_opts *opts, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ -+ return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | -+ bch2_bkey_ptrs_need_move(c, opts, ptrs); -+} -+ -+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); -+ if (!opts) -+ return 0; -+ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ u64 sectors = 0; -+ -+ if (opts->background_compression) { -+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || -+ p.ptr.unwritten) { -+ sectors = 0; -+ goto incompressible; -+ } -+ -+ if (!p.ptr.cached && p.crc.compression_type != compression_type) -+ sectors += p.crc.compressed_size; -+ } -+ } -+incompressible: -+ if (opts->background_target) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) -+ sectors += p.crc.compressed_size; -+ -+ return sectors; -+} -+ -+static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, -+ struct bkey_s_c k) -+{ -+ if (!bkey_extent_is_direct_data(k.k)) -+ return 0; -+ -+ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); -+ -+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { -+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); -+ return old == NULL || memcmp(old, &new, sizeof(new)); -+ } else { -+ return old != NULL; -+ } -+} -+ -+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, -+ struct bkey_i *_k) -+{ -+ if (!bkey_extent_is_direct_data(&_k->k)) -+ return 0; -+ -+ struct bkey_s k = bkey_i_to_s(_k); -+ struct bch_extent_rebalance *old = -+ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); -+ -+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { -+ if (!old) { -+ old = bkey_val_end(k); -+ k.k->u64s += sizeof(*old) / sizeof(u64); -+ } -+ -+ *old = io_opts_to_rebalance_opts(c, opts); -+ } else { -+ if (old) -+ extent_entry_drop(k, (union bch_extent_entry *) old); -+ } -+ -+ return 0; -+} -+ -+int bch2_get_update_rebalance_opts(struct btree_trans *trans, -+ struct bch_io_opts *io_opts, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ BUG_ON(iter->flags & BTREE_ITER_is_extents); -+ BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); -+ -+ const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v -+ ? bch2_bkey_rebalance_opts(k) : NULL; -+ if (r) { -+#define x(_name) \ -+ if (r->_name##_from_inode) { \ -+ io_opts->_name = r->_name; \ -+ io_opts->_name##_from_inode = true; \ -+ } -+ BCH_REBALANCE_OPTS() -+#undef x -+ } -+ -+ if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) -+ return 0; -+ -+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); -+ int ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ return ret; -+ -+ bkey_reassemble(n, k); -+ -+ /* On successfull transaction commit, @k was invalidated: */ -+ -+ return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: -+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: -+ bch2_trans_commit(trans, NULL, NULL, 0) ?: -+ -BCH_ERR_transaction_restart_nested; -+} -+ - #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - - static const char * const bch2_rebalance_state_strs[] = { -@@ -33,7 +217,7 @@ static const char * const bch2_rebalance_state_strs[] = { - #undef x - }; - --static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) -+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) - { - struct btree_iter iter; - struct bkey_s_c k; -@@ -71,9 +255,8 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) - int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) - { - int ret = bch2_trans_commit_do(c, NULL, NULL, -- BCH_TRANS_COMMIT_no_enospc| -- BCH_TRANS_COMMIT_lazy_rw, -- __bch2_set_rebalance_needs_scan(trans, inum)); -+ BCH_TRANS_COMMIT_no_enospc, -+ bch2_set_rebalance_needs_scan_trans(trans, inum)); - rebalance_wakeup(c); - return ret; - } -@@ -121,6 +304,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) - { -+ if (!bch2_bkey_rebalance_opts(k)) -+ return 0; -+ - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) -@@ -134,31 +320,27 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - struct bpos work_pos, - struct btree_iter *extent_iter, -+ struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { - struct bch_fs *c = trans->c; -- struct bkey_s_c k; - - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, - work_pos, - BTREE_ITER_all_snapshots); -- k = bch2_btree_iter_peek_slot(extent_iter); -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); - if (bkey_err(k)) - return k; - -- const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; -- if (!r) { -- /* raced due to btree write buffer, nothing to do */ -- return bkey_s_c_null; -- } -+ int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); -+ if (ret) -+ return bkey_s_c_err(ret); - - memset(data_opts, 0, sizeof(*data_opts)); -- -- data_opts->rewrite_ptrs = -- bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); -- data_opts->target = r->target; -+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); -+ data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; - - if (!data_opts->rewrite_ptrs) { -@@ -178,12 +360,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - if (trace_rebalance_extent_enabled()) { - struct printbuf buf = PRINTBUF; - -- prt_str(&buf, "target="); -- bch2_target_to_text(&buf, c, r->target); -- prt_str(&buf, " compression="); -- bch2_compression_opt_to_text(&buf, r->compression); -- prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, k); -+ prt_newline(&buf); -+ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ -+ unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); -+ if (p) { -+ prt_str(&buf, "compression="); -+ bch2_compression_opt_to_text(&buf, io_opts->background_compression); -+ prt_str(&buf, " "); -+ bch2_prt_u64_base2(&buf, p); -+ prt_newline(&buf); -+ } -+ -+ p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); -+ if (p) { -+ prt_str(&buf, "move="); -+ bch2_target_to_text(&buf, c, io_opts->background_target); -+ prt_str(&buf, " "); -+ bch2_prt_u64_base2(&buf, p); -+ prt_newline(&buf); -+ } - - trace_rebalance_extent(c, buf.buf); - printbuf_exit(&buf); -@@ -212,14 +410,10 @@ static int do_rebalance_extent(struct moving_context *ctxt, - bch2_bkey_buf_init(&sk); - - ret = bkey_err(k = next_rebalance_extent(trans, work_pos, -- extent_iter, &data_opts)); -+ extent_iter, &io_opts, &data_opts)); - if (ret || !k.k) - goto out; - -- ret = bch2_move_get_io_opts_one(trans, &io_opts, k); -- if (ret) -- goto out; -- - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); - - /* -@@ -253,20 +447,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- unsigned target, compression; -- -- if (k.k->p.inode) { -- target = io_opts->background_target; -- compression = background_compression(*io_opts); -- } else { -- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); -- -- target = r ? r->target : io_opts->background_target; -- compression = r ? r->compression : background_compression(*io_opts); -- } -- -- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); -- data_opts->target = target; -+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); -+ data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; - return data_opts->rewrite_ptrs != 0; - } -@@ -338,9 +520,9 @@ static int do_rebalance(struct moving_context *ctxt) - BTREE_ITER_all_snapshots); - - while (!bch2_move_ratelimit(ctxt)) { -- if (!r->enabled) { -+ if (!c->opts.rebalance_enabled) { - bch2_moving_ctxt_flush_all(ctxt); -- kthread_wait_freezable(r->enabled || -+ kthread_wait_freezable(c->opts.rebalance_enabled || - kthread_should_stop()); - } - -diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h -index 28a52638f16c..62a3859d3823 100644 ---- a/fs/bcachefs/rebalance.h -+++ b/fs/bcachefs/rebalance.h -@@ -2,8 +2,38 @@ - #ifndef _BCACHEFS_REBALANCE_H - #define _BCACHEFS_REBALANCE_H - -+#include "compress.h" -+#include "disk_groups.h" -+#include "opts.h" - #include "rebalance_types.h" - -+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, -+ struct bch_io_opts *opts) -+{ -+ struct bch_extent_rebalance r = { -+ .type = BIT(BCH_EXTENT_ENTRY_rebalance), -+#define x(_name) \ -+ ._name = opts->_name, \ -+ ._name##_from_inode = opts->_name##_from_inode, -+ BCH_REBALANCE_OPTS() -+#undef x -+ }; -+ -+ if (r.background_target && -+ !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) -+ r.background_target = 0; -+ -+ return r; -+}; -+ -+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); -+int bch2_get_update_rebalance_opts(struct btree_trans *, -+ struct bch_io_opts *, -+ struct btree_iter *, -+ struct bkey_s_c); -+ -+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); - int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); - int bch2_set_fs_needs_rebalance(struct bch_fs *); - -diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h -new file mode 100644 -index 000000000000..ff9a1342a22b ---- /dev/null -+++ b/fs/bcachefs/rebalance_format.h -@@ -0,0 +1,53 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_FORMAT_H -+#define _BCACHEFS_REBALANCE_FORMAT_H -+ -+struct bch_extent_rebalance { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:6, -+ unused:3, -+ -+ promote_target_from_inode:1, -+ erasure_code_from_inode:1, -+ data_checksum_from_inode:1, -+ background_compression_from_inode:1, -+ data_replicas_from_inode:1, -+ background_target_from_inode:1, -+ -+ promote_target:16, -+ erasure_code:1, -+ data_checksum:4, -+ data_replicas:4, -+ background_compression:8, /* enum bch_compression_opt */ -+ background_target:16; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 background_target:16, -+ background_compression:8, -+ data_replicas:4, -+ data_checksum:4, -+ erasure_code:1, -+ promote_target:16, -+ -+ background_target_from_inode:1, -+ data_replicas_from_inode:1, -+ background_compression_from_inode:1, -+ data_checksum_from_inode:1, -+ erasure_code_from_inode:1, -+ promote_target_from_inode:1, -+ -+ unused:3, -+ type:6; -+#endif -+}; -+ -+/* subset of BCH_INODE_OPTS */ -+#define BCH_REBALANCE_OPTS() \ -+ x(data_checksum) \ -+ x(background_compression) \ -+ x(data_replicas) \ -+ x(promote_target) \ -+ x(background_target) \ -+ x(erasure_code) -+ -+#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ -+ -diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h -index 0fffb536c1d0..fe5098c17dfc 100644 ---- a/fs/bcachefs/rebalance_types.h -+++ b/fs/bcachefs/rebalance_types.h -@@ -30,8 +30,6 @@ struct bch_fs_rebalance { - struct bbpos scan_start; - struct bbpos scan_end; - struct bch_move_stats scan_stats; -- -- unsigned enabled:1; - }; - - #endif /* _BCACHEFS_REBALANCE_TYPES_H */ -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -index 3c7f941dde39..98825437381c 100644 ---- a/fs/bcachefs/recovery.c -+++ b/fs/bcachefs/recovery.c -@@ -34,21 +34,83 @@ - - #define QSTR(n) { { { .len = strlen(n) } }, .name = n } - --void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) -+int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) - { -- if (btree >= BTREE_ID_NR_MAX) -- return; -- - u64 b = BIT_ULL(btree); -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (!(c->sb.btrees_lost_data & b)) { -- bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); -+ struct printbuf buf = PRINTBUF; -+ bch2_btree_id_to_text(&buf, btree); -+ bch_err(c, "flagging btree %s lost data", buf.buf); -+ printbuf_exit(&buf); -+ ext->btrees_lost_data |= cpu_to_le64(b); -+ } - -- mutex_lock(&c->sb_lock); -- bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); -- bch2_write_super(c); -- mutex_unlock(&c->sb_lock); -+ /* Once we have runtime self healing for topology errors we won't need this: */ -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; -+ -+ /* Btree node accounting will be off: */ -+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ /* -+ * These are much more minor, and don't need to be corrected right away, -+ * but in debug mode we want the next fsck run to be clean: -+ */ -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; -+#endif -+ -+ switch (btree) { -+ case BTREE_ID_alloc: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); -+ goto out; -+ case BTREE_ID_backpointers: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; -+ goto out; -+ case BTREE_ID_need_discard: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ goto out; -+ case BTREE_ID_freespace: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ goto out; -+ case BTREE_ID_bucket_gens: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ goto out; -+ case BTREE_ID_lru: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ goto out; -+ case BTREE_ID_accounting: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; -+ goto out; -+ default: -+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; -+ goto out; - } -+out: -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static void kill_btree(struct bch_fs *c, enum btree_id btree) -+{ -+ bch2_btree_id_root(c, btree)->alive = false; -+ bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - } - - /* for -o reconstruct_alloc: */ -@@ -79,6 +141,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) - __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); - -+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); -+ - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); -@@ -99,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -- bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, -- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -- bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, -- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -- bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, -- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -- bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, -- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -- bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, -- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) -+ if (btree_id_is_alloc(i)) -+ kill_btree(c, i); - } - - /* -@@ -354,10 +411,13 @@ int bch2_journal_replay(struct bch_fs *c) - ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim - : 0), - bch2_journal_replay_key(trans, k)); -- bch_err_msg(c, ret, "while replaying key at btree %s level %u:", -- bch2_btree_id_str(k->btree_id), k->level); -- if (ret) -+ if (ret) { -+ struct printbuf buf = PRINTBUF; -+ bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); -+ bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); -+ printbuf_exit(&buf); - goto err; -+ } - - BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); - } -@@ -403,7 +463,9 @@ static int journal_replay_entry_early(struct bch_fs *c, - - switch (entry->type) { - case BCH_JSET_ENTRY_btree_root: { -- struct btree_root *r; -+ -+ if (unlikely(!entry->u64s)) -+ return 0; - - if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, - c, invalid_btree_id, -@@ -417,15 +479,11 @@ static int journal_replay_entry_early(struct bch_fs *c, - return ret; - } - -- r = bch2_btree_id_root(c, entry->btree_id); -+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - -- if (entry->u64s) { -- r->level = entry->level; -- bkey_copy(&r->key, (struct bkey_i *) entry->start); -- r->error = 0; -- } else { -- r->error = -BCH_ERR_btree_node_read_error; -- } -+ r->level = entry->level; -+ bkey_copy(&r->key, (struct bkey_i *) entry->start); -+ r->error = 0; - r->alive = true; - break; - } -@@ -505,6 +563,7 @@ static int journal_replay_early(struct bch_fs *c, - - static int read_btree_roots(struct bch_fs *c) - { -+ struct printbuf buf = PRINTBUF; - int ret = 0; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { -@@ -513,33 +572,22 @@ static int read_btree_roots(struct bch_fs *c) - if (!r->alive) - continue; - -- if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) -- continue; -+ printbuf_reset(&buf); -+ bch2_btree_id_level_to_text(&buf, i, r->level); - - if (mustfix_fsck_err_on((ret = r->error), - c, btree_root_bkey_invalid, - "invalid btree root %s", -- bch2_btree_id_str(i)) || -+ buf.buf) || - mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), - c, btree_root_read_error, -- "error reading btree root %s l=%u: %s", -- bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { -- if (btree_id_is_alloc(i)) { -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); -- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); -+ "error reading btree root %s: %s", -+ buf.buf, bch2_err_str(ret))) { -+ if (btree_id_is_alloc(i)) - r->error = 0; -- } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { -- bch_info(c, "will run btree node scan"); -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); -- } - -- ret = 0; -- bch2_btree_lost_data(c, i); -+ ret = bch2_btree_lost_data(c, i); -+ BUG_ON(ret); - } - } - -@@ -553,6 +601,7 @@ static int read_btree_roots(struct bch_fs *c) - } - } - fsck_err: -+ printbuf_exit(&buf); - return ret; - } - -@@ -563,6 +612,7 @@ static bool check_version_upgrade(struct bch_fs *c) - bch2_latest_compatible_version(c->sb.version)); - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = 0; -+ bool ret = false; - - if (old_version < bcachefs_metadata_required_upgrade_below) { - if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || -@@ -618,14 +668,32 @@ static bool check_version_upgrade(struct bch_fs *c) - } - - bch_info(c, "%s", buf.buf); -+ printbuf_exit(&buf); - -- bch2_sb_upgrade(c, new_version); -+ ret = true; -+ } - -+ if (new_version > c->sb.version_incompat && -+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "Now allowing incompatible features up to "); -+ bch2_version_to_text(&buf, new_version); -+ prt_str(&buf, ", previously allowed up to "); -+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed); -+ prt_newline(&buf); -+ -+ bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); -- return true; -+ -+ ret = true; - } - -- return false; -+ if (ret) -+ bch2_sb_upgrade(c, new_version, -+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); -+ -+ return ret; - } - - int bch2_fs_recovery(struct bch_fs *c) -@@ -660,8 +728,13 @@ int bch2_fs_recovery(struct bch_fs *c) - goto err; - } - -- if (c->opts.norecovery) -- c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; -+ if (c->opts.norecovery) { -+ c->opts.recovery_pass_last = c->opts.recovery_pass_last -+ ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) -+ : BCH_RECOVERY_PASS_snapshots_read; -+ c->opts.nochanges = true; -+ c->opts.read_only = true; -+ } - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -@@ -708,17 +781,20 @@ int bch2_fs_recovery(struct bch_fs *c) - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - -+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { -+ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); -+ write_sb = true; -+ } -+ - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -- if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); -- - if (c->opts.fsck) - set_bit(BCH_FS_fsck_running, &c->flags); - if (c->sb.clean) - set_bit(BCH_FS_clean_recovery, &c->flags); -+ set_bit(BCH_FS_recovery_running, &c->flags); - - ret = bch2_blacklist_table_initialize(c); - if (ret) { -@@ -807,15 +883,15 @@ int bch2_fs_recovery(struct bch_fs *c) - c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1; - -- if (c->opts.reconstruct_alloc) -- bch2_reconstruct_alloc(c); -- - zero_out_btree_mem_ptr(&c->journal_keys); - - ret = journal_replay_early(c, clean); - if (ret) - goto err; - -+ if (c->opts.reconstruct_alloc) -+ bch2_reconstruct_alloc(c); -+ - /* - * After an unclean shutdown, skip then next few journal sequence - * numbers as they may have been referenced by btree writes that -@@ -870,16 +946,17 @@ int bch2_fs_recovery(struct bch_fs *c) - */ - set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_fsck_running, &c->flags); -+ clear_bit(BCH_FS_recovery_running, &c->flags); - - /* in case we don't run journal replay, i.e. norecovery mode */ - set_bit(BCH_FS_accounting_replay_done, &c->flags); - -+ bch2_async_btree_node_rewrites_flush(c); -+ - /* fsync if we fixed errors */ -- if (test_bit(BCH_FS_errors_fixed, &c->flags) && -- bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { -+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); -- bch2_write_ref_put(c, BCH_WRITE_REF_fsync); - } - - /* If we fixed errors, verify that fs is actually clean now: */ -@@ -1021,7 +1098,7 @@ int bch2_fs_initialize(struct bch_fs *c) - bch2_check_version_downgrade(c); - - if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { -- bch2_sb_upgrade(c, bcachefs_metadata_version_current); -+ bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - bch2_write_super(c); - } -@@ -1035,7 +1112,6 @@ int bch2_fs_initialize(struct bch_fs *c) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; - set_bit(BCH_FS_btree_running, &c->flags); - set_bit(BCH_FS_may_go_rw, &c->flags); - -@@ -1076,9 +1152,6 @@ int bch2_fs_initialize(struct bch_fs *c) - if (ret) - goto err; - -- for_each_online_member(c, ca) -- ca->new_fs_bucket_idx = 0; -- - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; -@@ -1137,6 +1210,7 @@ int bch2_fs_initialize(struct bch_fs *c) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; - return 0; - err: - bch_err_fn(c, ret); -diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h -index 4bf818de1f2f..b0d55754b21b 100644 ---- a/fs/bcachefs/recovery.h -+++ b/fs/bcachefs/recovery.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_RECOVERY_H - #define _BCACHEFS_RECOVERY_H - --void bch2_btree_lost_data(struct bch_fs *, enum btree_id); -+int bch2_btree_lost_data(struct bch_fs *, enum btree_id); - - int bch2_journal_replay(struct bch_fs *); - -diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c -index dff589ddc984..0b3c951c32da 100644 ---- a/fs/bcachefs/recovery_passes.c -+++ b/fs/bcachefs/recovery_passes.c -@@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) - - set_bit(BCH_FS_may_go_rw, &c->flags); - -- if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) -+ if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) - return bch2_fs_read_write_early(c); - return 0; - } -@@ -100,20 +100,34 @@ u64 bch2_recovery_passes_from_stable(u64 v) - /* - * For when we need to rewind recovery passes and run a pass we skipped: - */ --int bch2_run_explicit_recovery_pass(struct bch_fs *c, -- enum bch_recovery_pass pass) -+static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ enum bch_recovery_pass pass) - { -- if (c->opts.recovery_passes & BIT_ULL(pass)) -+ if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) -+ return -BCH_ERR_not_in_recovery; -+ -+ if (c->recovery_passes_complete & BIT_ULL(pass)) - return 0; - -- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", -- bch2_recovery_passes[pass], pass, -- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); -+ bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); -+ -+ if (pass < BCH_RECOVERY_PASS_set_may_go_rw && -+ c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { -+ if (print) -+ bch_info(c, "need recovery pass %s (%u), but already rw", -+ bch2_recovery_passes[pass], pass); -+ return -BCH_ERR_cannot_rewind_recovery; -+ } -+ -+ if (print) -+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", -+ bch2_recovery_passes[pass], pass, -+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - - c->opts.recovery_passes |= BIT_ULL(pass); - -- if (c->curr_recovery_pass >= pass) { -- c->curr_recovery_pass = pass; -+ if (c->curr_recovery_pass > pass) { -+ c->next_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { -@@ -121,6 +135,27 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, - } - } - -+int bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ enum bch_recovery_pass pass) -+{ -+ unsigned long flags; -+ spin_lock_irqsave(&c->recovery_pass_lock, flags); -+ int ret = __bch2_run_explicit_recovery_pass(c, pass); -+ spin_unlock_irqrestore(&c->recovery_pass_lock, flags); -+ return ret; -+} -+ -+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, -+ enum bch_recovery_pass pass) -+{ -+ lockdep_assert_held(&c->sb_lock); -+ -+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); -+ -+ return bch2_run_explicit_recovery_pass(c, pass); -+} -+ - int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - enum bch_recovery_pass pass) - { -@@ -233,31 +268,48 @@ int bch2_run_recovery_passes(struct bch_fs *c) - */ - c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - -- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { -- if (c->opts.recovery_pass_last && -- c->curr_recovery_pass > c->opts.recovery_pass_last) -- break; -- -- if (should_run_recovery_pass(c, c->curr_recovery_pass)) { -- unsigned pass = c->curr_recovery_pass; -+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { -+ c->next_recovery_pass = c->curr_recovery_pass + 1; - -- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: -- bch2_journal_flush(&c->journal); -- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || -- (ret && c->curr_recovery_pass < pass)) -- continue; -- if (ret) -- break; -+ spin_lock_irq(&c->recovery_pass_lock); -+ unsigned pass = c->curr_recovery_pass; - -- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); -+ if (c->opts.recovery_pass_last && -+ c->curr_recovery_pass > c->opts.recovery_pass_last) { -+ spin_unlock_irq(&c->recovery_pass_lock); -+ break; - } - -- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); -- -- if (!test_bit(BCH_FS_error, &c->flags)) -- bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); -- -- c->curr_recovery_pass++; -+ if (!should_run_recovery_pass(c, pass)) { -+ c->curr_recovery_pass++; -+ c->recovery_pass_done = max(c->recovery_pass_done, pass); -+ spin_unlock_irq(&c->recovery_pass_lock); -+ continue; -+ } -+ spin_unlock_irq(&c->recovery_pass_lock); -+ -+ ret = bch2_run_recovery_pass(c, pass) ?: -+ bch2_journal_flush(&c->journal); -+ -+ if (!ret && !test_bit(BCH_FS_error, &c->flags)) -+ bch2_clear_recovery_pass_required(c, pass); -+ -+ spin_lock_irq(&c->recovery_pass_lock); -+ if (c->next_recovery_pass < c->curr_recovery_pass) { -+ /* -+ * bch2_run_explicit_recovery_pass() was called: we -+ * can't always catch -BCH_ERR_restart_recovery because -+ * it may have been called from another thread (btree -+ * node read completion) -+ */ -+ ret = 0; -+ c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); -+ } else { -+ c->recovery_passes_complete |= BIT_ULL(pass); -+ c->recovery_pass_done = max(c->recovery_pass_done, pass); -+ } -+ c->curr_recovery_pass = c->next_recovery_pass; -+ spin_unlock_irq(&c->recovery_pass_lock); - } - - return ret; -diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h -index 99b464e127b8..7d7339c8fa29 100644 ---- a/fs/bcachefs/recovery_passes.h -+++ b/fs/bcachefs/recovery_passes.h -@@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); - u64 bch2_fsck_recovery_passes(void); - - int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); -+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); - int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); - - int bch2_run_online_recovery_passes(struct bch_fs *); -diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h -index 94dc20ca2065..418557960ed6 100644 ---- a/fs/bcachefs/recovery_passes_types.h -+++ b/fs/bcachefs/recovery_passes_types.h -@@ -8,53 +8,59 @@ - #define PASS_ALWAYS BIT(3) - #define PASS_ONLINE BIT(4) - -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define PASS_FSCK_DEBUG BIT(1) -+#else -+#define PASS_FSCK_DEBUG 0 -+#endif -+ - /* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ --#define BCH_RECOVERY_PASSES() \ -- x(recovery_pass_empty, 41, PASS_SILENT) \ -- x(scan_for_btree_nodes, 37, 0) \ -- x(check_topology, 4, 0) \ -- x(accounting_read, 39, PASS_ALWAYS) \ -- x(alloc_read, 0, PASS_ALWAYS) \ -- x(stripes_read, 1, PASS_ALWAYS) \ -- x(initialize_subvolumes, 2, 0) \ -- x(snapshots_read, 3, PASS_ALWAYS) \ -- x(check_allocations, 5, PASS_FSCK) \ -- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ -- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ -- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ -- x(journal_replay, 9, PASS_ALWAYS) \ -- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ -- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ -- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ -- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ -- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ -- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ -- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ -- x(bucket_gens_init, 17, 0) \ -- x(reconstruct_snapshots, 38, 0) \ -- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ -- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ -- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ -- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ -- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ -- x(fs_upgrade_for_subvolumes, 22, 0) \ -- x(check_inodes, 24, PASS_FSCK) \ -- x(check_extents, 25, PASS_FSCK) \ -- x(check_indirect_extents, 26, PASS_FSCK) \ -- x(check_dirents, 27, PASS_FSCK) \ -- x(check_xattrs, 28, PASS_FSCK) \ -- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ -- x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ -- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ -- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ -- x(check_nlinks, 31, PASS_FSCK) \ -- x(resume_logged_ops, 23, PASS_ALWAYS) \ -- x(delete_dead_inodes, 32, PASS_ALWAYS) \ -- x(fix_reflink_p, 33, 0) \ -- x(set_fs_needs_rebalance, 34, 0) \ -+#define BCH_RECOVERY_PASSES() \ -+ x(recovery_pass_empty, 41, PASS_SILENT) \ -+ x(scan_for_btree_nodes, 37, 0) \ -+ x(check_topology, 4, 0) \ -+ x(accounting_read, 39, PASS_ALWAYS) \ -+ x(alloc_read, 0, PASS_ALWAYS) \ -+ x(stripes_read, 1, PASS_ALWAYS) \ -+ x(initialize_subvolumes, 2, 0) \ -+ x(snapshots_read, 3, PASS_ALWAYS) \ -+ x(check_allocations, 5, PASS_FSCK) \ -+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ -+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ -+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ -+ x(journal_replay, 9, PASS_ALWAYS) \ -+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ -+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ -+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ -+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ -+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ -+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ -+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ -+ x(bucket_gens_init, 17, 0) \ -+ x(reconstruct_snapshots, 38, 0) \ -+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ -+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ -+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ -+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ -+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ -+ x(fs_upgrade_for_subvolumes, 22, 0) \ -+ x(check_inodes, 24, PASS_FSCK) \ -+ x(check_extents, 25, PASS_FSCK) \ -+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ -+ x(check_dirents, 27, PASS_FSCK) \ -+ x(check_xattrs, 28, PASS_FSCK) \ -+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ -+ x(check_unreachable_inodes, 40, PASS_FSCK) \ -+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ -+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ -+ x(check_nlinks, 31, PASS_FSCK) \ -+ x(resume_logged_ops, 23, PASS_ALWAYS) \ -+ x(delete_dead_inodes, 32, PASS_ALWAYS) \ -+ x(fix_reflink_p, 33, 0) \ -+ x(set_fs_needs_rebalance, 34, 0) - - /* We normally enumerate recovery passes in the order we run them: */ - enum bch_recovery_pass { -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -index f457925fa362..441e648f28b5 100644 ---- a/fs/bcachefs/reflink.c -+++ b/fs/bcachefs/reflink.c -@@ -15,6 +15,17 @@ - - #include - -+static inline bool bkey_extent_is_reflink_data(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_reflink_v: -+ case KEY_TYPE_indirect_inline_data: -+ return true; -+ default: -+ return false; -+ } -+} -+ - static inline unsigned bkey_type_to_indirect(const struct bkey *k) - { - switch (k->type) { -@@ -30,15 +41,15 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) - /* reflink pointers */ - - int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - -- bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), -+ bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), - c, reflink_p_front_pad_bad, - "idx < front_pad (%llu < %u)", -- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); -+ REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); - fsck_err: - return ret; - } -@@ -49,7 +60,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - prt_printf(out, "idx %llu front_pad %u back_pad %u", -- le64_to_cpu(p.v->idx), -+ REFLINK_P_IDX(p.v), - le32_to_cpu(p.v->front_pad), - le32_to_cpu(p.v->back_pad)); - } -@@ -65,49 +76,250 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r - */ - return false; - -- if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) -+ if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) -+ return false; -+ -+ if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; - } - -+/* indirect extents */ -+ -+int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, -+ struct bkey_validate_context from) -+{ -+ int ret = 0; -+ -+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), -+ c, reflink_v_pos_bad, -+ "indirect extent above maximum position 0:%llu", -+ REFLINK_P_IDX_MAX); -+ -+ ret = bch2_bkey_ptrs_validate(c, k, from); -+fsck_err: -+ return ret; -+} -+ -+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); -+ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+#if 0 -+Currently disabled, needs to be debugged: -+ -+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); -+ -+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -+} -+#endif -+ -+/* indirect inline data */ -+ -+int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, -+ struct bkey_validate_context from) -+{ -+ return 0; -+} -+ -+void bch2_indirect_inline_data_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); -+ unsigned datalen = bkey_inline_data_bytes(k.k); -+ -+ prt_printf(out, "refcount %llu datalen %u: %*phN", -+ le64_to_cpu(d.v->refcount), datalen, -+ min(datalen, 32U), d.v->data); -+} -+ -+/* lookup */ -+ -+static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, -+ bool should_commit) -+{ -+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); -+ int ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ return ret; -+ -+ SET_REFLINK_P_ERROR(&new->v, false); -+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); -+ if (ret) -+ return ret; -+ -+ if (!should_commit) -+ return 0; -+ -+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_nested; -+} -+ -+static int bch2_indirect_extent_missing_error(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, -+ u64 missing_start, u64 missing_end, -+ bool should_commit) -+{ -+ if (REFLINK_P_ERROR(p.v)) -+ return 0; -+ -+ struct bch_fs *c = trans->c; -+ u64 live_start = REFLINK_P_IDX(p.v); -+ u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; -+ u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); -+ u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ BUG_ON(missing_start < refd_start); -+ BUG_ON(missing_end > refd_end); -+ -+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, -+ "pointer to missing indirect extent\n" -+ " %s\n" -+ " missing range %llu-%llu", -+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), -+ missing_start, missing_end)) { -+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ goto err; -+ -+ /* -+ * Is the missing range not actually needed? -+ * -+ * p.v->idx refers to the data that we actually want, but if the -+ * indirect extent we point to was bigger, front_pad and back_pad -+ * indicate the range we took a reference on. -+ */ -+ -+ if (missing_end <= live_start) { -+ new->v.front_pad = cpu_to_le32(live_start - missing_end); -+ } else if (missing_start >= live_end) { -+ new->v.back_pad = cpu_to_le32(missing_start - live_end); -+ } else { -+ struct bpos new_start = bkey_start_pos(&new->k); -+ struct bpos new_end = new->k.p; -+ -+ if (missing_start > live_start) -+ new_start.offset += missing_start - live_start; -+ if (missing_end < live_end) -+ new_end.offset -= live_end - missing_end; -+ -+ bch2_cut_front(new_start, &new->k_i); -+ bch2_cut_back(new_end, &new->k_i); -+ -+ SET_REFLINK_P_ERROR(&new->v, true); -+ } -+ -+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); -+ if (ret) -+ goto err; -+ -+ if (should_commit) -+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_nested; -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* -+ * This is used from the read path, which doesn't expect to have to do a -+ * transaction commit, and from triggers, which should not be doing a commit: -+ */ -+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ s64 *offset_into_extent, -+ struct bkey_s_c_reflink_p p, -+ bool should_commit, -+ unsigned iter_flags) -+{ -+ BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); -+ BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); -+ -+ u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; -+ -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, -+ POS(0, reflink_offset), iter_flags); -+ if (bkey_err(k)) -+ return k; -+ -+ if (unlikely(!bkey_extent_is_reflink_data(k.k))) { -+ unsigned size = min((u64) k.k->size, -+ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - -+ reflink_offset); -+ bch2_key_resize(&iter->k, size); -+ -+ int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, -+ k.k->p.offset, should_commit); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return bkey_s_c_err(ret); -+ } -+ } else if (unlikely(REFLINK_P_ERROR(p.v))) { -+ int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return bkey_s_c_err(ret); -+ } -+ } -+ -+ *offset_into_extent = reflink_offset - bkey_start_offset(k.k); -+ return k; -+} -+ -+/* reflink pointer trigger */ -+ - static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bkey_i *k; -- __le64 *refcount; -- int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - struct printbuf buf = PRINTBUF; -- int ret; - -- k = bch2_bkey_get_mut_noupdate(trans, &iter, -- BTREE_ID_reflink, POS(0, *idx), -- BTREE_ITER_with_updates); -- ret = PTR_ERR_OR_ZERO(k); -+ s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, -+ BTREE_ITER_intent| -+ BTREE_ITER_with_updates); -+ int ret = bkey_err(k); - if (ret) -- goto err; -+ return ret; - -- refcount = bkey_refcount(bkey_i_to_s(k)); -- if (!refcount) { -- bch2_bkey_val_to_text(&buf, c, p.s_c); -- bch2_trans_inconsistent(trans, -- "nonexistent indirect extent at %llu while marking\n %s", -- *idx, buf.buf); -- ret = -EIO; -- goto err; -+ if (!bkey_refcount_c(k)) { -+ if (!(flags & BTREE_TRIGGER_overwrite)) -+ ret = -BCH_ERR_missing_indirect_extent; -+ goto next; - } - -+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ goto err; -+ -+ __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); - if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); -- bch2_trans_inconsistent(trans, -- "indirect extent refcount underflow at %llu while marking\n %s", -- *idx, buf.buf); -- ret = -EIO; -- goto err; -+ prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ log_fsck_err(trans, reflink_refcount_underflow, -+ "indirect extent refcount underflow while marking\n %s", -+ buf.buf); -+ goto next; - } - - if (flags & BTREE_TRIGGER_insert) { -@@ -115,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - u64 pad; - - pad = max_t(s64, le32_to_cpu(v->front_pad), -- le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); -+ REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); - - pad = max_t(s64, le32_to_cpu(v->back_pad), -- k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); -+ new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); - } - -- le64_add_cpu(refcount, add); -+ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); - - bch2_btree_iter_set_pos_to_extent_start(&iter); -- ret = bch2_trans_update(trans, &iter, k, 0); -+ ret = bch2_trans_update(trans, &iter, new, 0); - if (ret) - goto err; -- -- *idx = k->k.p.offset; -+next: -+ *idx = k.k->p.offset; - err: -+fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -@@ -147,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; -- u64 start = le64_to_cpu(p.v->idx); -- u64 end = le64_to_cpu(p.v->idx) + p.k->size; -- u64 next_idx = end + le32_to_cpu(p.v->back_pad); -+ u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - s64 ret = 0; - struct printbuf buf = PRINTBUF; - -@@ -168,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - *idx = r->offset; - return 0; - not_found: -- BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); -- -- if (fsck_err(trans, reflink_p_to_missing_reflink_v, -- "pointer to missing indirect extent\n" -- " %s\n" -- " missing range %llu-%llu", -- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), -- *idx, next_idx)) { -- struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); -- ret = PTR_ERR_OR_ZERO(update); -+ if (flags & BTREE_TRIGGER_check_repair) { -+ ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); - if (ret) - goto err; -- -- if (next_idx <= start) { -- bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); -- } else if (*idx >= end) { -- bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); -- } else { -- bkey_error_init(update); -- update->k.p = p.k->p; -- update->k.size = p.k->size; -- set_bkey_val_u64s(&update->k, 0); -- } -- -- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); - } - - *idx = next_idx; - err: --fsck_err: - printbuf_exit(&buf); - return ret; - } -@@ -210,8 +399,8 @@ static int __trigger_reflink_p(struct btree_trans *trans, - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - -- u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); -- u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); -+ u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); -+ u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - - if (flags & BTREE_TRIGGER_transactional) { - while (idx < end && !ret) -@@ -253,35 +442,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, - return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); - } - --/* indirect extents */ -- --int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) --{ -- return bch2_bkey_ptrs_validate(c, k, flags); --} -- --void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, -- struct bkey_s_c k) --{ -- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -- -- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); -- -- bch2_bkey_ptrs_to_text(out, c, k); --} -- --#if 0 --Currently disabled, needs to be debugged: -- --bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) --{ -- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); -- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); -- -- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); --} --#endif -+/* indirect extent trigger */ - - static inline void - check_indirect_extent_deleting(struct bkey_s new, -@@ -307,25 +468,6 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, - return bch2_trigger_extent(trans, btree_id, level, old, new, flags); - } - --/* indirect inline data */ -- --int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) --{ -- return 0; --} -- --void bch2_indirect_inline_data_to_text(struct printbuf *out, -- struct bch_fs *c, struct bkey_s_c k) --{ -- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); -- unsigned datalen = bkey_inline_data_bytes(k.k); -- -- prt_printf(out, "refcount %llu datalen %u: %*phN", -- le64_to_cpu(d.v->refcount), datalen, -- min(datalen, 32U), d.v->data); --} -- - int bch2_trigger_indirect_inline_data(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, -@@ -336,9 +478,12 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans, - return 0; - } - -+/* create */ -+ - static int bch2_make_extent_indirect(struct btree_trans *trans, - struct btree_iter *extent_iter, -- struct bkey_i *orig) -+ struct bkey_i *orig, -+ bool reflink_p_may_update_opts_field) - { - struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = { NULL }; -@@ -358,6 +503,14 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, - if (ret) - goto err; - -+ /* -+ * XXX: we're assuming that 56 bits will be enough for the life of the -+ * filesystem: we need to implement wraparound, with a cursor in the -+ * logged ops btree: -+ */ -+ if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) -+ return -ENOSPC; -+ - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); - ret = PTR_ERR_OR_ZERO(r_v); - if (ret) -@@ -394,7 +547,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, - memset(&r_p->v, 0, sizeof(r_p->v)); - #endif - -- r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); -+ SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); -+ -+ if (reflink_p_may_update_opts_field) -+ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); - - ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_internal_snapshot_node); -@@ -409,7 +565,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) - struct bkey_s_c k; - int ret; - -- for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { -+ for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { - if (bkey_extent_is_unwritten(k)) - continue; - -@@ -426,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c, - subvol_inum dst_inum, u64 dst_offset, - subvol_inum src_inum, u64 src_offset, - u64 remap_sectors, -- u64 new_i_size, s64 *i_sectors_delta) -+ u64 new_i_size, s64 *i_sectors_delta, -+ bool may_change_src_io_path_opts) - { - struct btree_trans *trans; - struct btree_iter dst_iter, src_iter; -@@ -439,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c, - struct bpos src_want; - u64 dst_done = 0; - u32 dst_snapshot, src_snapshot; -+ bool reflink_p_may_update_opts_field = -+ bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); - int ret = 0, ret2 = 0; - - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) -@@ -520,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c, - src_k = bkey_i_to_s_c(new_src.k); - - ret = bch2_make_extent_indirect(trans, &src_iter, -- new_src.k); -+ new_src.k, -+ reflink_p_may_update_opts_field); - if (ret) - continue; - -@@ -533,11 +693,15 @@ s64 bch2_remap_range(struct bch_fs *c, - struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(new_dst.k); - -- u64 offset = le64_to_cpu(src_p.v->idx) + -+ u64 offset = REFLINK_P_IDX(src_p.v) + - (src_want.offset - - bkey_start_offset(src_k.k)); - -- dst_p->v.idx = cpu_to_le64(offset); -+ SET_REFLINK_P_IDX(&dst_p->v, offset); -+ -+ if (reflink_p_may_update_opts_field && -+ may_change_src_io_path_opts) -+ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); - } else { - BUG(); - } -@@ -547,7 +711,7 @@ s64 bch2_remap_range(struct bch_fs *c, - min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter.pos.offset)); - -- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: -+ ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: - bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, -@@ -591,3 +755,97 @@ s64 bch2_remap_range(struct bch_fs *c, - - return dst_done ?: ret ?: ret2; - } -+ -+/* fsck */ -+ -+static int bch2_gc_write_reflink_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ size_t *idx) -+{ -+ struct bch_fs *c = trans->c; -+ const __le64 *refcount = bkey_refcount_c(k); -+ struct printbuf buf = PRINTBUF; -+ struct reflink_gc *r; -+ int ret = 0; -+ -+ if (!refcount) -+ return 0; -+ -+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && -+ r->offset < k.k->p.offset) -+ ++*idx; -+ -+ if (!r || -+ r->offset != k.k->p.offset || -+ r->size != k.k->size) { -+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -+ return -EINVAL; -+ } -+ -+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), -+ trans, reflink_v_refcount_wrong, -+ "reflink key has wrong refcount:\n" -+ " %s\n" -+ " should be %u", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), -+ r->refcount)) { -+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ goto out; -+ -+ if (!r->refcount) -+ new->k.type = KEY_TYPE_deleted; -+ else -+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); -+ ret = bch2_trans_update(trans, iter, new, 0); -+ } -+out: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_gc_reflink_done(struct bch_fs *c) -+{ -+ size_t idx = 0; -+ -+ int ret = bch2_trans_run(c, -+ for_each_btree_key_commit(trans, iter, -+ BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_prefetch, k, -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -+ bch2_gc_write_reflink_key(trans, &iter, k, &idx))); -+ c->reflink_gc_nr = 0; -+ return ret; -+} -+ -+int bch2_gc_reflink_start(struct bch_fs *c) -+{ -+ c->reflink_gc_nr = 0; -+ -+ int ret = bch2_trans_run(c, -+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_prefetch, k, ({ -+ const __le64 *refcount = bkey_refcount_c(k); -+ -+ if (!refcount) -+ continue; -+ -+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, -+ c->reflink_gc_nr++, GFP_KERNEL); -+ if (!r) { -+ ret = -BCH_ERR_ENOMEM_gc_reflink_start; -+ break; -+ } -+ -+ r->offset = k.k->p.offset; -+ r->size = k.k->size; -+ r->refcount = 0; -+ 0; -+ }))); -+ -+ bch_err_fn(c, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h -index 51afe11d8ed6..1632780bdf18 100644 ---- a/fs/bcachefs/reflink.h -+++ b/fs/bcachefs/reflink.h -@@ -2,9 +2,8 @@ - #ifndef _BCACHEFS_REFLINK_H - #define _BCACHEFS_REFLINK_H - --enum bch_validate_flags; -- --int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, -@@ -19,7 +18,8 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - .min_val_size = 16, \ - }) - --int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, -@@ -34,7 +34,7 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - }) - - int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - void bch2_indirect_inline_data_to_text(struct printbuf *, - struct bch_fs *, struct bkey_s_c); - int bch2_trigger_indirect_inline_data(struct btree_trans *, -@@ -73,7 +73,15 @@ static inline __le64 *bkey_refcount(struct bkey_s k) - } - } - -+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, -+ s64 *, struct bkey_s_c_reflink_p, -+ bool, unsigned); -+ - s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, -- subvol_inum, u64, u64, u64, s64 *); -+ subvol_inum, u64, u64, u64, s64 *, -+ bool); -+ -+int bch2_gc_reflink_done(struct bch_fs *); -+int bch2_gc_reflink_start(struct bch_fs *); - - #endif /* _BCACHEFS_REFLINK_H */ -diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h -index 6772eebb1fc6..92995e4f898e 100644 ---- a/fs/bcachefs/reflink_format.h -+++ b/fs/bcachefs/reflink_format.h -@@ -4,7 +4,7 @@ - - struct bch_reflink_p { - struct bch_val v; -- __le64 idx; -+ __le64 idx_flags; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). If we only pointed to part of -@@ -17,6 +17,11 @@ struct bch_reflink_p { - __le32 back_pad; - } __packed __aligned(8); - -+LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); -+LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); -+LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, -+ struct bch_reflink_p, idx_flags, 57, 58); -+ - struct bch_reflink_v { - struct bch_val v; - __le64 refcount; -diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c -index 005275281804..59c8770e4a0e 100644 ---- a/fs/bcachefs/sb-clean.c -+++ b/fs/bcachefs/sb-clean.c -@@ -23,6 +23,10 @@ - int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, - int write) - { -+ struct bkey_validate_context from = { -+ .flags = write, -+ .from = BKEY_VALIDATE_superblock, -+ }; - struct jset_entry *entry; - int ret; - -@@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), -- write); -+ from); - if (ret) - return ret; - } -diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h -index 62ea478215d0..fdcf598f08b1 100644 ---- a/fs/bcachefs/sb-counters_format.h -+++ b/fs/bcachefs/sb-counters_format.h -@@ -2,86 +2,91 @@ - #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H - #define _BCACHEFS_SB_COUNTERS_FORMAT_H - --#define BCH_PERSISTENT_COUNTERS() \ -- x(io_read, 0) \ -- x(io_write, 1) \ -- x(io_move, 2) \ -- x(bucket_invalidate, 3) \ -- x(bucket_discard, 4) \ -- x(bucket_alloc, 5) \ -- x(bucket_alloc_fail, 6) \ -- x(btree_cache_scan, 7) \ -- x(btree_cache_reap, 8) \ -- x(btree_cache_cannibalize, 9) \ -- x(btree_cache_cannibalize_lock, 10) \ -- x(btree_cache_cannibalize_lock_fail, 11) \ -- x(btree_cache_cannibalize_unlock, 12) \ -- x(btree_node_write, 13) \ -- x(btree_node_read, 14) \ -- x(btree_node_compact, 15) \ -- x(btree_node_merge, 16) \ -- x(btree_node_split, 17) \ -- x(btree_node_rewrite, 18) \ -- x(btree_node_alloc, 19) \ -- x(btree_node_free, 20) \ -- x(btree_node_set_root, 21) \ -- x(btree_path_relock_fail, 22) \ -- x(btree_path_upgrade_fail, 23) \ -- x(btree_reserve_get_fail, 24) \ -- x(journal_entry_full, 25) \ -- x(journal_full, 26) \ -- x(journal_reclaim_finish, 27) \ -- x(journal_reclaim_start, 28) \ -- x(journal_write, 29) \ -- x(read_promote, 30) \ -- x(read_bounce, 31) \ -- x(read_split, 33) \ -- x(read_retry, 32) \ -- x(read_reuse_race, 34) \ -- x(move_extent_read, 35) \ -- x(move_extent_write, 36) \ -- x(move_extent_finish, 37) \ -- x(move_extent_fail, 38) \ -- x(move_extent_start_fail, 39) \ -- x(copygc, 40) \ -- x(copygc_wait, 41) \ -- x(gc_gens_end, 42) \ -- x(gc_gens_start, 43) \ -- x(trans_blocked_journal_reclaim, 44) \ -- x(trans_restart_btree_node_reused, 45) \ -- x(trans_restart_btree_node_split, 46) \ -- x(trans_restart_fault_inject, 47) \ -- x(trans_restart_iter_upgrade, 48) \ -- x(trans_restart_journal_preres_get, 49) \ -- x(trans_restart_journal_reclaim, 50) \ -- x(trans_restart_journal_res_get, 51) \ -- x(trans_restart_key_cache_key_realloced, 52) \ -- x(trans_restart_key_cache_raced, 53) \ -- x(trans_restart_mark_replicas, 54) \ -- x(trans_restart_mem_realloced, 55) \ -- x(trans_restart_memory_allocation_failure, 56) \ -- x(trans_restart_relock, 57) \ -- x(trans_restart_relock_after_fill, 58) \ -- x(trans_restart_relock_key_cache_fill, 59) \ -- x(trans_restart_relock_next_node, 60) \ -- x(trans_restart_relock_parent_for_fill, 61) \ -- x(trans_restart_relock_path, 62) \ -- x(trans_restart_relock_path_intent, 63) \ -- x(trans_restart_too_many_iters, 64) \ -- x(trans_restart_traverse, 65) \ -- x(trans_restart_upgrade, 66) \ -- x(trans_restart_would_deadlock, 67) \ -- x(trans_restart_would_deadlock_write, 68) \ -- x(trans_restart_injected, 69) \ -- x(trans_restart_key_cache_upgrade, 70) \ -- x(trans_traverse_all, 71) \ -- x(transaction_commit, 72) \ -- x(write_super, 73) \ -- x(trans_restart_would_deadlock_recursion_limit, 74) \ -- x(trans_restart_write_buffer_flush, 75) \ -- x(trans_restart_split_race, 76) \ -- x(write_buffer_flush_slowpath, 77) \ -- x(write_buffer_flush_sync, 78) -+enum counters_flags { -+ TYPE_COUNTER = BIT(0), /* event counters */ -+ TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ -+}; -+ -+#define BCH_PERSISTENT_COUNTERS() \ -+ x(io_read, 0, TYPE_SECTORS) \ -+ x(io_write, 1, TYPE_SECTORS) \ -+ x(io_move, 2, TYPE_SECTORS) \ -+ x(bucket_invalidate, 3, TYPE_COUNTER) \ -+ x(bucket_discard, 4, TYPE_COUNTER) \ -+ x(bucket_alloc, 5, TYPE_COUNTER) \ -+ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ -+ x(btree_cache_scan, 7, TYPE_COUNTER) \ -+ x(btree_cache_reap, 8, TYPE_COUNTER) \ -+ x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ -+ x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ -+ x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ -+ x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ -+ x(btree_node_write, 13, TYPE_COUNTER) \ -+ x(btree_node_read, 14, TYPE_COUNTER) \ -+ x(btree_node_compact, 15, TYPE_COUNTER) \ -+ x(btree_node_merge, 16, TYPE_COUNTER) \ -+ x(btree_node_split, 17, TYPE_COUNTER) \ -+ x(btree_node_rewrite, 18, TYPE_COUNTER) \ -+ x(btree_node_alloc, 19, TYPE_COUNTER) \ -+ x(btree_node_free, 20, TYPE_COUNTER) \ -+ x(btree_node_set_root, 21, TYPE_COUNTER) \ -+ x(btree_path_relock_fail, 22, TYPE_COUNTER) \ -+ x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ -+ x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ -+ x(journal_entry_full, 25, TYPE_COUNTER) \ -+ x(journal_full, 26, TYPE_COUNTER) \ -+ x(journal_reclaim_finish, 27, TYPE_COUNTER) \ -+ x(journal_reclaim_start, 28, TYPE_COUNTER) \ -+ x(journal_write, 29, TYPE_COUNTER) \ -+ x(read_promote, 30, TYPE_COUNTER) \ -+ x(read_bounce, 31, TYPE_COUNTER) \ -+ x(read_split, 33, TYPE_COUNTER) \ -+ x(read_retry, 32, TYPE_COUNTER) \ -+ x(read_reuse_race, 34, TYPE_COUNTER) \ -+ x(move_extent_read, 35, TYPE_SECTORS) \ -+ x(move_extent_write, 36, TYPE_SECTORS) \ -+ x(move_extent_finish, 37, TYPE_SECTORS) \ -+ x(move_extent_fail, 38, TYPE_COUNTER) \ -+ x(move_extent_start_fail, 39, TYPE_COUNTER) \ -+ x(copygc, 40, TYPE_COUNTER) \ -+ x(copygc_wait, 41, TYPE_COUNTER) \ -+ x(gc_gens_end, 42, TYPE_COUNTER) \ -+ x(gc_gens_start, 43, TYPE_COUNTER) \ -+ x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ -+ x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ -+ x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ -+ x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ -+ x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ -+ x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ -+ x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ -+ x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ -+ x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ -+ x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ -+ x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ -+ x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ -+ x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ -+ x(trans_restart_relock, 57, TYPE_COUNTER) \ -+ x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ -+ x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ -+ x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ -+ x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ -+ x(trans_restart_relock_path, 62, TYPE_COUNTER) \ -+ x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ -+ x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ -+ x(trans_restart_traverse, 65, TYPE_COUNTER) \ -+ x(trans_restart_upgrade, 66, TYPE_COUNTER) \ -+ x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ -+ x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ -+ x(trans_restart_injected, 69, TYPE_COUNTER) \ -+ x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ -+ x(trans_traverse_all, 71, TYPE_COUNTER) \ -+ x(transaction_commit, 72, TYPE_COUNTER) \ -+ x(write_super, 73, TYPE_COUNTER) \ -+ x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ -+ x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ -+ x(trans_restart_split_race, 76, TYPE_COUNTER) \ -+ x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ -+ x(write_buffer_flush_sync, 78, TYPE_COUNTER) - - enum bch_persistent_counters { - #define x(t, n, ...) BCH_COUNTER_##t, -diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c -index 8767c33c2b51..051214fdc735 100644 ---- a/fs/bcachefs/sb-downgrade.c -+++ b/fs/bcachefs/sb-downgrade.c -@@ -81,7 +81,16 @@ - BCH_FSCK_ERR_accounting_mismatch) \ - x(inode_has_child_snapshots, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ -- BCH_FSCK_ERR_inode_has_child_snapshots_wrong) -+ BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ -+ x(backpointer_bucket_gen, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ -+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ -+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ -+ x(disk_accounting_big_endian, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ -+ BCH_FSCK_ERR_accounting_mismatch, \ -+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ -+ BCH_FSCK_ERR_accounting_key_junk_at_end) - - #define DOWNGRADE_TABLE() \ - x(bucket_stripe_sectors, \ -@@ -117,7 +126,19 @@ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ -- BCH_FSCK_ERR_accounting_mismatch) -+ BCH_FSCK_ERR_accounting_mismatch, \ -+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ -+ BCH_FSCK_ERR_accounting_key_junk_at_end) \ -+ x(backpointer_bucket_gen, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ -+ BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ -+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ -+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ -+ x(disk_accounting_big_endian, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ -+ BCH_FSCK_ERR_accounting_mismatch, \ -+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ -+ BCH_FSCK_ERR_accounting_key_junk_at_end) - - struct upgrade_downgrade_entry { - u64 recovery_passes; -diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h -index 9feb6739f77a..b86ec013d7d7 100644 ---- a/fs/bcachefs/sb-errors_format.h -+++ b/fs/bcachefs/sb-errors_format.h -@@ -5,9 +5,8 @@ - enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, -- FSCK_NEED_FSCK = 1 << 2, -- FSCK_NO_RATELIMIT = 1 << 3, -- FSCK_AUTOFIX = 1 << 4, -+ FSCK_NO_RATELIMIT = 1 << 2, -+ FSCK_AUTOFIX = 1 << 3, - }; - - #define BCH_SB_ERRS() \ -@@ -58,8 +57,8 @@ enum bch_fsck_flags { - x(bset_wrong_sector_offset, 44, 0) \ - x(bset_empty, 45, 0) \ - x(bset_bad_seq, 46, 0) \ -- x(bset_blacklisted_journal_seq, 47, 0) \ -- x(first_bset_blacklisted_journal_seq, 48, 0) \ -+ x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ -+ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ - x(btree_node_bad_btree, 49, 0) \ - x(btree_node_bad_level, 50, 0) \ - x(btree_node_bad_min_key, 51, 0) \ -@@ -68,17 +67,17 @@ enum bch_fsck_flags { - x(btree_node_bkey_past_bset_end, 54, 0) \ - x(btree_node_bkey_bad_format, 55, 0) \ - x(btree_node_bad_bkey, 56, 0) \ -- x(btree_node_bkey_out_of_order, 57, 0) \ -- x(btree_root_bkey_invalid, 58, 0) \ -- x(btree_root_read_error, 59, 0) \ -+ x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ -+ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ -+ x(btree_root_read_error, 59, FSCK_AUTOFIX) \ - x(btree_root_bad_min_key, 60, 0) \ - x(btree_root_bad_max_key, 61, 0) \ -- x(btree_node_read_error, 62, 0) \ -- x(btree_node_topology_bad_min_key, 63, 0) \ -- x(btree_node_topology_bad_max_key, 64, 0) \ -- x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ -- x(btree_node_topology_overwritten_by_next_node, 66, 0) \ -- x(btree_node_topology_interior_node_empty, 67, 0) \ -+ x(btree_node_read_error, 62, FSCK_AUTOFIX) \ -+ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ -+ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ -+ x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ -+ x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ -+ x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ - x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ - x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ - x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ -@@ -123,11 +122,12 @@ enum bch_fsck_flags { - x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ - x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ - x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ -+ x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ - x(bucket_sector_count_overflow, 112, 0) \ - x(bucket_metadata_type_mismatch, 113, 0) \ -- x(need_discard_key_wrong, 114, 0) \ -- x(freespace_key_wrong, 115, 0) \ -- x(freespace_hole_missing, 116, 0) \ -+ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ -+ x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ -+ x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ - x(bucket_gens_val_size_bad, 117, 0) \ - x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ - x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ -@@ -139,9 +139,10 @@ enum bch_fsck_flags { - x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ - x(backpointer_bucket_offset_wrong, 125, 0) \ - x(backpointer_level_bad, 294, 0) \ -- x(backpointer_to_missing_device, 126, 0) \ -- x(backpointer_to_missing_alloc, 127, 0) \ -- x(backpointer_to_missing_ptr, 128, 0) \ -+ x(backpointer_dev_bad, 297, 0) \ -+ x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ -+ x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ -+ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ - x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ - x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ - x(lru_entry_bad, 131, FSCK_AUTOFIX) \ -@@ -167,20 +168,21 @@ enum bch_fsck_flags { - x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, 0) \ - x(ptr_too_stale, 153, 0) \ -- x(stale_dirty_ptr, 154, 0) \ -+ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ - x(ptr_bucket_data_type_mismatch, 155, 0) \ - x(ptr_cached_and_erasure_coded, 156, 0) \ - x(ptr_crc_uncompressed_size_too_small, 157, 0) \ -+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \ -+ x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ - x(ptr_crc_csum_type_unknown, 158, 0) \ - x(ptr_crc_compression_type_unknown, 159, 0) \ - x(ptr_crc_redundant, 160, 0) \ -- x(ptr_crc_uncompressed_size_too_big, 161, 0) \ - x(ptr_crc_nonce_mismatch, 162, 0) \ - x(ptr_stripe_redundant, 163, 0) \ - x(reservation_key_nr_replicas_invalid, 164, 0) \ -- x(reflink_v_refcount_wrong, 165, 0) \ -+ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ - x(reflink_v_pos_bad, 292, 0) \ -- x(reflink_p_to_missing_reflink_v, 166, 0) \ -+ x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ - x(reflink_refcount_underflow, 293, 0) \ - x(stripe_pos_bad, 167, 0) \ - x(stripe_val_size_bad, 168, 0) \ -@@ -209,6 +211,7 @@ enum bch_fsck_flags { - x(bkey_in_missing_snapshot, 190, 0) \ - x(inode_pos_inode_nonzero, 191, 0) \ - x(inode_pos_blockdev_range, 192, 0) \ -+ x(inode_alloc_cursor_inode_bad, 301, 0) \ - x(inode_unpack_error, 193, 0) \ - x(inode_str_hash_invalid, 194, 0) \ - x(inode_v3_fields_start_bad, 195, 0) \ -@@ -232,6 +235,7 @@ enum bch_fsck_flags { - x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, 0) \ - x(inode_unreachable, 210, FSCK_AUTOFIX) \ -+ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ - x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ - x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ - x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ -@@ -252,6 +256,7 @@ enum bch_fsck_flags { - x(dirent_in_missing_dir_inode, 227, 0) \ - x(dirent_in_non_dir_inode, 228, 0) \ - x(dirent_to_missing_inode, 229, 0) \ -+ x(dirent_to_overwritten_inode, 302, 0) \ - x(dirent_to_missing_subvol, 230, 0) \ - x(dirent_to_itself, 231, 0) \ - x(quota_type_invalid, 232, 0) \ -@@ -288,7 +293,7 @@ enum bch_fsck_flags { - x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ - x(snapshot_node_missing, 264, 0) \ - x(dup_backpointer_to_bad_csum_extent, 265, 0) \ -- x(btree_bitmap_not_marked, 266, 0) \ -+ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ - x(sb_clean_entry_overrun, 267, 0) \ - x(btree_ptr_v2_written_0, 268, 0) \ - x(subvol_snapshot_bad, 269, 0) \ -@@ -306,7 +311,10 @@ enum bch_fsck_flags { - x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ - x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ -- x(MAX, 295, 0) -+ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ -+ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ -+ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ -+ x(MAX, 304, 0) - - enum bch_sb_error_id { - #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c -index 617d07e53b20..7c403427fbdb 100644 ---- a/fs/bcachefs/six.c -+++ b/fs/bcachefs/six.c -@@ -491,8 +491,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, - list_del(&wait->list); - raw_spin_unlock(&lock->wait_lock); - -- if (unlikely(acquired)) -+ if (unlikely(acquired)) { - do_six_unlock_type(lock, type); -+ } else if (type == SIX_LOCK_write) { -+ six_clear_bitmask(lock, SIX_LOCK_HELD_write); -+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); -+ } - break; - } - -@@ -501,10 +505,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, - - __set_current_state(TASK_RUNNING); - out: -- if (ret && type == SIX_LOCK_write) { -- six_clear_bitmask(lock, SIX_LOCK_HELD_write); -- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); -- } - trace_contention_end(lock, 0); - - return ret; -@@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); -- else -- lock->seq++; - - if (type == SIX_LOCK_intent && - lock->intent_lock_recurse) { -@@ -625,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long - return; - } - -+ if (type == SIX_LOCK_write && -+ lock->write_lock_recurse) { -+ --lock->write_lock_recurse; -+ return; -+ } -+ -+ if (type == SIX_LOCK_write) -+ lock->seq++; -+ - do_six_unlock_type(lock, type); - } - EXPORT_SYMBOL_GPL(six_unlock_ip); -@@ -735,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) - atomic_add(l[type].lock_val, &lock->state); - } - break; -+ case SIX_LOCK_write: -+ lock->write_lock_recurse++; -+ fallthrough; - case SIX_LOCK_intent: - EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - lock->intent_lock_recurse++; - break; -- case SIX_LOCK_write: -- BUG(); -- break; - } - } - EXPORT_SYMBOL_GPL(six_lock_increment); -@@ -843,7 +850,8 @@ void six_lock_exit(struct six_lock *lock) - EXPORT_SYMBOL_GPL(six_lock_exit); - - void __six_lock_init(struct six_lock *lock, const char *name, -- struct lock_class_key *key, enum six_lock_init_flags flags) -+ struct lock_class_key *key, enum six_lock_init_flags flags, -+ gfp_t gfp) - { - atomic_set(&lock->state, 0); - raw_spin_lock_init(&lock->wait_lock); -@@ -866,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name, - * failure if they wish by checking lock->readers, but generally - * will not want to treat it as an error. - */ -- lock->readers = alloc_percpu(unsigned); -+ lock->readers = alloc_percpu_gfp(unsigned, gfp); - } - #endif - } -diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h -index 68d46fd7f391..59b851cf8bac 100644 ---- a/fs/bcachefs/six.h -+++ b/fs/bcachefs/six.h -@@ -137,6 +137,7 @@ struct six_lock { - atomic_t state; - u32 seq; - unsigned intent_lock_recurse; -+ unsigned write_lock_recurse; - struct task_struct *owner; - unsigned __percpu *readers; - raw_spinlock_t wait_lock; -@@ -163,18 +164,19 @@ enum six_lock_init_flags { - }; - - void __six_lock_init(struct six_lock *lock, const char *name, -- struct lock_class_key *key, enum six_lock_init_flags flags); -+ struct lock_class_key *key, enum six_lock_init_flags flags, -+ gfp_t gfp); - - /** - * six_lock_init - initialize a six lock - * @lock: lock to initialize - * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU - */ --#define six_lock_init(lock, flags) \ -+#define six_lock_init(lock, flags, gfp) \ - do { \ - static struct lock_class_key __key; \ - \ -- __six_lock_init((lock), #lock, &__key, flags); \ -+ __six_lock_init((lock), #lock, &__key, flags, gfp); \ - } while (0) - - /** -diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c -index ae57638506c3..c54091a28909 100644 ---- a/fs/bcachefs/snapshot.c -+++ b/fs/bcachefs/snapshot.c -@@ -2,6 +2,7 @@ - - #include "bcachefs.h" - #include "bkey_buf.h" -+#include "btree_cache.h" - #include "btree_key_cache.h" - #include "btree_update.h" - #include "buckets.h" -@@ -32,7 +33,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, - } - - int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - int ret = 0; - -@@ -225,7 +226,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - } - - int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_snapshot s; - u32 i, id; -@@ -279,23 +280,6 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - return ret; - } - --static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) --{ -- struct snapshot_t *t = snapshot_t_mut(c, id); -- u32 parent = id; -- -- while ((parent = bch2_snapshot_parent_early(c, parent)) && -- parent - id - 1 < IS_ANCESTOR_BITMAP) -- __set_bit(parent - id - 1, t->is_ancestor); --} -- --static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) --{ -- mutex_lock(&c->snapshot_table_lock); -- __set_is_ancestor_bitmap(c, id); -- mutex_unlock(&c->snapshot_table_lock); --} -- - static int __bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, -@@ -317,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - -+ t->live = true; - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); -@@ -335,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, - t->skip[2] = 0; - } - -- __set_is_ancestor_bitmap(c, id); -+ u32 parent = id; -+ -+ while ((parent = bch2_snapshot_parent_early(c, parent)) && -+ parent - id - 1 < IS_ANCESTOR_BITMAP) -+ __set_bit(parent - id - 1, t->is_ancestor); - - if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); -@@ -365,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, - BTREE_ITER_with_updates, snapshot, s); - } - --static int bch2_snapshot_live(struct btree_trans *trans, u32 id) --{ -- struct bch_snapshot v; -- int ret; -- -- if (!id) -- return 0; -- -- ret = bch2_snapshot_lookup(trans, id, &v); -- if (bch2_err_matches(ret, ENOENT)) -- bch_err(trans->c, "snapshot node %u not found", id); -- if (ret) -- return ret; -- -- return !BCH_SNAPSHOT_DELETED(&v); --} -- --/* -- * If @k is a snapshot with just one live child, it's part of a linear chain, -- * which we consider to be an equivalence class: and then after snapshot -- * deletion cleanup, there should only be a single key at a given position in -- * this equivalence class. -- * -- * This sets the equivalence class of @k to be the child's equivalence class, if -- * it's part of such a linear chain: this correctly sets equivalence classes on -- * startup if we run leaf to root (i.e. in natural key order). -- */ --static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) --{ -- struct bch_fs *c = trans->c; -- unsigned i, nr_live = 0, live_idx = 0; -- struct bkey_s_c_snapshot snap; -- u32 id = k.k->p.offset, child[2]; -- -- if (k.k->type != KEY_TYPE_snapshot) -- return 0; -- -- snap = bkey_s_c_to_snapshot(k); -- -- child[0] = le32_to_cpu(snap.v->children[0]); -- child[1] = le32_to_cpu(snap.v->children[1]); -- -- for (i = 0; i < 2; i++) { -- int ret = bch2_snapshot_live(trans, child[i]); -- -- if (ret < 0) -- return ret; -- -- if (ret) -- live_idx = i; -- nr_live += ret; -- } -- -- mutex_lock(&c->snapshot_table_lock); -- -- snapshot_t_mut(c, id)->equiv = nr_live == 1 -- ? snapshot_t_mut(c, child[live_idx])->equiv -- : id; -- -- mutex_unlock(&c->snapshot_table_lock); -- -- return 0; --} -- - /* fsck: */ - - static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -@@ -506,7 +431,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - break; - } - } -- - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { -@@ -536,6 +460,7 @@ static int check_snapshot_tree(struct btree_trans *trans, - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; -+ struct btree_iter snapshot_iter = {}; - u32 root_id; - int ret; - -@@ -545,22 +470,35 @@ static int check_snapshot_tree(struct btree_trans *trans, - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); - -- ret = bch2_snapshot_lookup(trans, root_id, &s); -+ struct bkey_s_c_snapshot snapshot_k = -+ bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, -+ POS(0, root_id), 0, snapshot); -+ ret = bkey_err(snapshot_k); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - -+ if (!ret) -+ bkey_val_copy(&s, snapshot_k); -+ - if (fsck_err_on(ret || - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - trans, snapshot_tree_to_missing_snapshot, - "snapshot tree points to missing/incorrect snapshot:\n %s", -- (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ (bch2_bkey_val_to_text(&buf, c, st.s_c), -+ prt_newline(&buf), -+ ret -+ ? prt_printf(&buf, "(%s)", bch2_err_str(ret)) -+ : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), -+ buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } - -- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), -- false, 0, &subvol); -+ if (!st.v->master_subvol) -+ goto out; -+ -+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - -@@ -603,8 +541,10 @@ static int check_snapshot_tree(struct btree_trans *trans, - u->v.master_subvol = cpu_to_le32(subvol_id); - st = snapshot_tree_i_to_s_c(u); - } -+out: - err: - fsck_err: -+ bch2_trans_iter_exit(trans, &snapshot_iter); - printbuf_exit(&buf); - return ret; - } -@@ -799,7 +739,7 @@ static int check_snapshot(struct btree_trans *trans, - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); -- ret = bch2_subvolume_get(trans, id, 0, false, &subvol); -+ ret = bch2_subvolume_get(trans, id, false, &subvol); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -@@ -902,7 +842,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) - { - struct bch_fs *c = trans->c; - -- if (bch2_snapshot_equiv(c, id)) -+ if (bch2_snapshot_exists(c, id)) - return 0; - - /* Do we need to reconstruct the snapshot_tree entry as well? */ -@@ -951,8 +891,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) - - return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, -- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: -- bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); -+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); - } - - /* Figure out which snapshot nodes belong in the same tree: */ -@@ -1050,7 +989,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) - snapshot_id_list_to_text(&buf, t); - - darray_for_each(*t, id) { -- if (fsck_err_on(!bch2_snapshot_equiv(c, *id), -+ if (fsck_err_on(!bch2_snapshot_exists(c, *id), - trans, snapshot_node_missing, - "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { - if (t->nr > 1) { -@@ -1083,10 +1022,12 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct printbuf buf = PRINTBUF; - int ret = 0; - -- if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), -+ if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), - trans, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -+ (bch2_btree_id_to_text(&buf, iter->btree_id), -+ prt_char(&buf, ' '), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - fsck_err: -@@ -1100,13 +1041,11 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, - int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) - { - struct btree_iter iter; -- struct bkey_i_snapshot *s; -- int ret = 0; -- -- s = bch2_bkey_get_mut_typed(trans, &iter, -+ struct bkey_i_snapshot *s = -+ bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); -- ret = PTR_ERR_OR_ZERO(s); -+ int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); -@@ -1294,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, - goto err; - - new_snapids[i] = iter.pos.offset; -- -- mutex_lock(&c->snapshot_table_lock); -- snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; -- mutex_unlock(&c->snapshot_table_lock); - } - err: - bch2_trans_iter_exit(trans, &iter); -@@ -1403,129 +1338,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - * that key to snapshot leaf nodes, where we can mutate it - */ - --static int delete_dead_snapshots_process_key(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bkey_s_c k, -- snapshot_id_list *deleted, -- snapshot_id_list *equiv_seen, -- struct bpos *last_pos) -+struct snapshot_interior_delete { -+ u32 id; -+ u32 live_child; -+}; -+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; -+ -+static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) - { -- int ret = bch2_check_key_has_snapshot(trans, iter, k); -- if (ret) -- return ret < 0 ? ret : 0; -+ darray_for_each(*l, i) -+ if (i->id == id) -+ return i->live_child; -+ return 0; -+} - -- struct bch_fs *c = trans->c; -- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); -- if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ -+static unsigned __live_child(struct snapshot_table *t, u32 id, -+ snapshot_id_list *delete_leaves, -+ interior_delete_list *delete_interior) -+{ -+ struct snapshot_t *s = __snapshot_t(t, id); -+ if (!s) - return 0; - -- if (!bkey_eq(k.k->p, *last_pos)) -- equiv_seen->nr = 0; -+ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) -+ if (s->children[i] && -+ !snapshot_list_has_id(delete_leaves, s->children[i]) && -+ !interior_delete_has_id(delete_interior, s->children[i])) -+ return s->children[i]; - -- if (snapshot_list_has_id(deleted, k.k->p.snapshot)) -- return bch2_btree_delete_at(trans, iter, -- BTREE_UPDATE_internal_snapshot_node); -+ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { -+ u32 live_child = s->children[i] -+ ? __live_child(t, s->children[i], delete_leaves, delete_interior) -+ : 0; -+ if (live_child) -+ return live_child; -+ } - -- if (!bpos_eq(*last_pos, k.k->p) && -- snapshot_list_has_id(equiv_seen, equiv)) -- return bch2_btree_delete_at(trans, iter, -- BTREE_UPDATE_internal_snapshot_node); -+ return 0; -+} - -- *last_pos = k.k->p; -+static unsigned live_child(struct bch_fs *c, u32 id, -+ snapshot_id_list *delete_leaves, -+ interior_delete_list *delete_interior) -+{ -+ rcu_read_lock(); -+ u32 ret = __live_child(rcu_dereference(c->snapshots), id, -+ delete_leaves, delete_interior); -+ rcu_read_unlock(); -+ return ret; -+} - -- ret = snapshot_list_add_nodup(c, equiv_seen, equiv); -- if (ret) -- return ret; -+static int delete_dead_snapshots_process_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ snapshot_id_list *delete_leaves, -+ interior_delete_list *delete_interior) -+{ -+ if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) -+ return bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_internal_snapshot_node); - -- /* -- * When we have a linear chain of snapshot nodes, we consider -- * those to form an equivalence class: we're going to collapse -- * them all down to a single node, and keep the leaf-most node - -- * which has the same id as the equivalence class id. -- * -- * If there are multiple keys in different snapshots at the same -- * position, we're only going to keep the one in the newest -- * snapshot (we delete the others above) - the rest have been -- * overwritten and are redundant, and for the key we're going to keep we -- * need to move it to the equivalance class ID if it's not there -- * already. -- */ -- if (equiv != k.k->p.snapshot) { -+ u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); -+ if (live_child) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - -- new->k.p.snapshot = equiv; -- -- struct btree_iter new_iter; -- bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, -- BTREE_ITER_all_snapshots| -- BTREE_ITER_cached| -- BTREE_ITER_intent); -+ new->k.p.snapshot = live_child; - -- ret = bch2_btree_iter_traverse(&new_iter) ?: -- bch2_trans_update(trans, &new_iter, new, -- BTREE_UPDATE_internal_snapshot_node) ?: -- bch2_btree_delete_at(trans, iter, -- BTREE_UPDATE_internal_snapshot_node); -- bch2_trans_iter_exit(trans, &new_iter); -+ struct btree_iter dst_iter; -+ struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, -+ iter->btree_id, new->k.p, -+ BTREE_ITER_all_snapshots| -+ BTREE_ITER_intent); -+ ret = bkey_err(dst_k); - if (ret) - return ret; -+ -+ ret = (bkey_deleted(dst_k.k) -+ ? bch2_trans_update(trans, &dst_iter, new, -+ BTREE_UPDATE_internal_snapshot_node) -+ : 0) ?: -+ bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_internal_snapshot_node); -+ bch2_trans_iter_exit(trans, &dst_iter); -+ return ret; - } - - return 0; - } - --static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) -+/* -+ * For a given snapshot, if it doesn't have a subvolume that points to it, and -+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it -+ * as deleted. -+ */ -+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, -+ snapshot_id_list *delete_leaves, -+ interior_delete_list *delete_interior) - { -- struct bkey_s_c_snapshot snap; -- u32 children[2]; -- int ret; -- - if (k.k->type != KEY_TYPE_snapshot) - return 0; - -- snap = bkey_s_c_to_snapshot(k); -- if (BCH_SNAPSHOT_DELETED(snap.v) || -- BCH_SNAPSHOT_SUBVOL(snap.v)) -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); -+ unsigned live_children = 0; -+ -+ if (BCH_SNAPSHOT_SUBVOL(s.v)) - return 0; - -- children[0] = le32_to_cpu(snap.v->children[0]); -- children[1] = le32_to_cpu(snap.v->children[1]); -+ for (unsigned i = 0; i < 2; i++) { -+ u32 child = le32_to_cpu(s.v->children[i]); - -- ret = bch2_snapshot_live(trans, children[0]) ?: -- bch2_snapshot_live(trans, children[1]); -- if (ret < 0) -- return ret; -- return !ret; --} -+ live_children += child && -+ !snapshot_list_has_id(delete_leaves, child); -+ } - --/* -- * For a given snapshot, if it doesn't have a subvolume that points to it, and -- * it doesn't have child snapshot nodes - it's now redundant and we can mark it -- * as deleted. -- */ --static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) --{ -- int ret = bch2_snapshot_needs_delete(trans, k); -+ if (live_children == 0) { -+ return snapshot_list_add(c, delete_leaves, s.k->p.offset); -+ } else if (live_children == 1) { -+ struct snapshot_interior_delete d = { -+ .id = s.k->p.offset, -+ .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), -+ }; -+ -+ if (!d.live_child) { -+ bch_err(c, "error finding live child of snapshot %u", d.id); -+ return -EINVAL; -+ } - -- return ret <= 0 -- ? ret -- : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); -+ return darray_push(delete_interior, d); -+ } else { -+ return 0; -+ } - } - - static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, -- snapshot_id_list *skip) -+ interior_delete_list *skip) - { - rcu_read_lock(); -- while (snapshot_list_has_id(skip, id)) -+ while (interior_delete_has_id(skip, id)) - id = __bch2_snapshot_parent(c, id); - - while (n--) { - do { - id = __bch2_snapshot_parent(c, id); -- } while (snapshot_list_has_id(skip, id)); -+ } while (interior_delete_has_id(skip, id)); - } - rcu_read_unlock(); - -@@ -1534,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - - static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k, -- snapshot_id_list *deleted) -+ interior_delete_list *deleted) - { - struct bch_fs *c = trans->c; - u32 nr_deleted_ancestors = 0; -@@ -1544,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - if (k.k->type != KEY_TYPE_snapshot) - return 0; - -- if (snapshot_list_has_id(deleted, k.k->p.offset)) -+ if (interior_delete_has_id(deleted, k.k->p.offset)) - return 0; - - s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); -@@ -1553,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - return ret; - - darray_for_each(*deleted, i) -- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); -+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); - - if (!nr_deleted_ancestors) - return 0; -@@ -1571,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { - u32 id = le32_to_cpu(s->v.skip[j]); - -- if (snapshot_list_has_id(deleted, id)) { -+ if (interior_delete_has_id(deleted, id)) { - id = bch2_snapshot_nth_parent_skip(c, - parent, - depth > 1 -@@ -1590,51 +1549,45 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - - int bch2_delete_dead_snapshots(struct bch_fs *c) - { -- struct btree_trans *trans; -- snapshot_id_list deleted = { 0 }; -- snapshot_id_list deleted_interior = { 0 }; -- int ret = 0; -- - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) - return 0; - -- trans = bch2_trans_get(c); -+ struct btree_trans *trans = bch2_trans_get(c); -+ snapshot_id_list delete_leaves = {}; -+ interior_delete_list delete_interior = {}; -+ int ret = 0; - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ -- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, -- POS_MIN, 0, k, -- NULL, NULL, 0, -- bch2_delete_redundant_snapshot(trans, k)); -- bch_err_msg(c, ret, "deleting redundant snapshots"); -+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, -+ check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; - -- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, -- POS_MIN, 0, k, -- bch2_snapshot_set_equiv(trans, k)); -- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); -- if (ret) -+ if (!delete_leaves.nr && !delete_interior.nr) - goto err; - -- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, -- POS_MIN, 0, k, ({ -- if (k.k->type != KEY_TYPE_snapshot) -- continue; -+ { -+ struct printbuf buf = PRINTBUF; -+ prt_printf(&buf, "deleting leaves"); -+ darray_for_each(delete_leaves, i) -+ prt_printf(&buf, " %u", *i); - -- BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) -- ? snapshot_list_add(c, &deleted, k.k->p.offset) -- : 0; -- })); -- bch_err_msg(c, ret, "walking snapshots"); -- if (ret) -- goto err; -+ prt_printf(&buf, " interior"); -+ darray_for_each(delete_interior, i) -+ prt_printf(&buf, " %u->%u", i->id, i->live_child); -+ -+ ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); -+ printbuf_exit(&buf); -+ if (ret) -+ goto err; -+ } - - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { -- struct bpos last_pos = POS_MIN; -- snapshot_id_list equiv_seen = { 0 }; - struct disk_reservation res = { 0 }; - - if (!btree_type_has_snapshots(btree)) -@@ -1644,33 +1597,26 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) - btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, -- delete_dead_snapshots_process_key(trans, &iter, k, &deleted, -- &equiv_seen, &last_pos)); -+ delete_dead_snapshots_process_key(trans, &iter, k, -+ &delete_leaves, -+ &delete_interior)); - - bch2_disk_reservation_put(c, &res); -- darray_exit(&equiv_seen); - -- bch_err_msg(c, ret, "deleting keys from dying snapshots"); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - } - -- bch2_trans_unlock(trans); -- down_write(&c->snapshot_create_lock); -- -- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, -- POS_MIN, 0, k, ({ -- u32 snapshot = k.k->p.offset; -- u32 equiv = bch2_snapshot_equiv(c, snapshot); -- -- equiv != snapshot -- ? snapshot_list_add(c, &deleted_interior, snapshot) -- : 0; -- })); -- -- bch_err_msg(c, ret, "walking snapshots"); -- if (ret) -- goto err_create_lock; -+ darray_for_each(delete_leaves, i) { -+ ret = commit_do(trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(trans, *i)); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_msg(c, ret, "deleting snapshot %u", *i); -+ if (ret) -+ goto err; -+ } - - /* - * Fixing children of deleted snapshots can't be done completely -@@ -1680,32 +1626,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); -+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); - if (ret) -- goto err_create_lock; -- -- darray_for_each(deleted, i) { -- ret = commit_do(trans, NULL, NULL, 0, -- bch2_snapshot_node_delete(trans, *i)); -- bch_err_msg(c, ret, "deleting snapshot %u", *i); -- if (ret) -- goto err_create_lock; -- } -+ goto err; - -- darray_for_each(deleted_interior, i) { -+ darray_for_each(delete_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, -- bch2_snapshot_node_delete(trans, *i)); -- bch_err_msg(c, ret, "deleting snapshot %u", *i); -+ bch2_snapshot_node_delete(trans, i->id)); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_msg(c, ret, "deleting snapshot %u", i->id); - if (ret) -- goto err_create_lock; -+ goto err; - } --err_create_lock: -- up_write(&c->snapshot_create_lock); - err: -- darray_exit(&deleted_interior); -- darray_exit(&deleted); -+ darray_exit(&delete_interior); -+ darray_exit(&delete_leaves); - bch2_trans_put(trans); -- bch_err_fn(c, ret); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_fn(c, ret); - return ret; - } - -@@ -1721,8 +1659,12 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) - - void bch2_delete_dead_snapshots_async(struct bch_fs *c) - { -- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && -- !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) -+ return; -+ -+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); -+ -+ if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); - } - -@@ -1735,18 +1677,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - struct bkey_s_c k; - int ret; - -- bch2_trans_iter_init(trans, &iter, id, pos, -- BTREE_ITER_not_extents| -- BTREE_ITER_all_snapshots); -- while (1) { -- k = bch2_btree_iter_prev(&iter); -- ret = bkey_err(k); -- if (ret) -- break; -- -- if (!k.k) -- break; -- -+ for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), -+ BTREE_ITER_not_extents| -+ BTREE_ITER_all_snapshots, -+ k, ret) { - if (!bkey_eq(pos, k.k->p)) - break; - -@@ -1760,37 +1694,36 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - return ret; - } - --static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) -+static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) - { -- struct bch_fs *c = trans->c; -- struct bkey_s_c_snapshot snap; -- int ret = 0; -+ /* If there's one child, it's redundant and keys will be moved to the child */ -+ return !!snap.v->children[0] + !!snap.v->children[1] == 1; -+} - -+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) -+{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - -- snap = bkey_s_c_to_snapshot(k); -+ struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || -- bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || -- (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { -- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); -- return 0; -- } -+ interior_snapshot_needs_delete(snap)) -+ set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - -- return ret; -+ return 0; - } - - int bch2_snapshots_read(struct bch_fs *c) - { -+ /* -+ * Initializing the is_ancestor bitmaps requires ancestors to already be -+ * initialized - so mark in reverse: -+ */ - int ret = bch2_trans_run(c, -- for_each_btree_key(trans, iter, BTREE_ID_snapshots, -- POS_MIN, 0, k, -+ for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, -+ POS_MAX, 0, k, - __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: -- bch2_snapshot_set_equiv(trans, k) ?: -- bch2_check_snapshot_needs_deletion(trans, k)) ?: -- for_each_btree_key(trans, iter, BTREE_ID_snapshots, -- POS_MIN, 0, k, -- (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); -+ bch2_check_snapshot_needs_deletion(trans, k))); - bch_err_fn(c, ret); - - /* -diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h -index 29c94716293e..00373cf32e7b 100644 ---- a/fs/bcachefs/snapshot.h -+++ b/fs/bcachefs/snapshot.h -@@ -2,11 +2,9 @@ - #ifndef _BCACHEFS_SNAPSHOT_H - #define _BCACHEFS_SNAPSHOT_H - --enum bch_validate_flags; -- - void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, -- enum bch_validate_flags); -+ struct bkey_validate_context); - - #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_tree_validate, \ -@@ -19,7 +17,8 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); - int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); - - void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); --int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); -@@ -120,19 +119,19 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) - return id; - } - --static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) -+static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) - { - const struct snapshot_t *s = snapshot_t(c, id); -- return s ? s->equiv : 0; -+ return s ? s->live : 0; - } - --static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) - { - rcu_read_lock(); -- id = __bch2_snapshot_equiv(c, id); -+ bool ret = __bch2_snapshot_exists(c, id); - rcu_read_unlock(); - -- return id; -+ return ret; - } - - static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c -new file mode 100644 -index 000000000000..d78451c2a0c6 ---- /dev/null -+++ b/fs/bcachefs/str_hash.c -@@ -0,0 +1,295 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "fsck.h" -+#include "str_hash.h" -+#include "subvolume.h" -+ -+static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -+{ -+ if (d.v->d_type == DT_SUBVOL) { -+ struct bch_subvolume subvol; -+ int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), -+ false, &subvol); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ return ret; -+ return !ret; -+ } else { -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); -+ int ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ ret = bkey_is_inode(k.k); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+ } -+} -+ -+static noinline int fsck_rename_dirent(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct bkey_s_c_dirent old) -+{ -+ struct qstr old_name = bch2_dirent_get_name(old); -+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); -+ int ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ return ret; -+ -+ bkey_dirent_init(&new->k_i); -+ dirent_copy_target(new, old); -+ new->k.p = old.k->p; -+ -+ for (unsigned i = 0; i < 1000; i++) { -+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", -+ old_name.len, old_name.name, i); -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len); -+ -+ if (u64s > U8_MAX) -+ return -EINVAL; -+ -+ new->k.u64s = u64s; -+ -+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, -+ (subvol_inum) { 0, old.k->p.inode }, -+ old.k->p.snapshot, &new->k_i, -+ BTREE_UPDATE_internal_snapshot_node); -+ if (!bch2_err_matches(ret, EEXIST)) -+ break; -+ } -+ -+ if (ret) -+ return ret; -+ -+ return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); -+} -+ -+static noinline int hash_pick_winner(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct bkey_s_c k1, -+ struct bkey_s_c k2) -+{ -+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && -+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) -+ return 0; -+ -+ switch (desc.btree_id) { -+ case BTREE_ID_dirents: { -+ int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); -+ if (ret < 0) -+ return ret; -+ if (!ret) -+ return 0; -+ -+ ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); -+ if (ret < 0) -+ return ret; -+ if (!ret) -+ return 1; -+ return 2; -+ } -+ default: -+ return 0; -+ } -+} -+ -+static int repair_inode_hash_info(struct btree_trans *trans, -+ struct bch_inode_unpacked *snapshot_root) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, -+ SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), -+ BTREE_ITER_all_snapshots, k, ret) { -+ if (k.k->p.offset != snapshot_root->bi_inum) -+ break; -+ if (!bkey_is_inode(k.k)) -+ continue; -+ -+ struct bch_inode_unpacked inode; -+ ret = bch2_inode_unpack(k, &inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || -+ INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), -+ trans, inode_snapshot_mismatch, -+ "inode hash info in different snapshots don't match")) { -+ inode.bi_hash_seed = snapshot_root->bi_hash_seed; -+ SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); -+ ret = __bch2_fsck_write_inode(trans, &inode) ?: -+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_nested; -+ break; -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * All versions of the same inode in different snapshots must have the same hash -+ * seed/type: verify that the hash info we're using matches the root -+ */ -+static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, -+ struct bch_hash_info *hash_info) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), -+ BTREE_ITER_all_snapshots, k, ret) { -+ if (k.k->p.offset != inum) -+ break; -+ if (bkey_is_inode(k.k)) -+ goto found; -+ } -+ bch_err(c, "%s(): inum %llu not found", __func__, inum); -+ ret = -BCH_ERR_fsck_repair_unimplemented; -+ goto err; -+found:; -+ struct bch_inode_unpacked inode; -+ ret = bch2_inode_unpack(k, &inode); -+ if (ret) -+ goto err; -+ -+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); -+ if (hash_info->type != hash2.type || -+ memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { -+ ret = repair_inode_hash_info(trans, &inode); -+ if (!ret) { -+ bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" -+ "%u %llx %llx\n" -+ "%u %llx %llx", -+ hash_info->type, -+ hash_info->siphash_key.k0, -+ hash_info->siphash_key.k1, -+ hash2.type, -+ hash2.siphash_key.k0, -+ hash2.siphash_key.k1); -+ ret = -BCH_ERR_fsck_repair_unimplemented; -+ } -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int __bch2_str_hash_check_key(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ const struct bch_hash_desc *desc, -+ struct bch_hash_info *hash_info, -+ struct btree_iter *k_iter, struct bkey_s_c hash_k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter = { NULL }; -+ struct printbuf buf = PRINTBUF; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ u64 hash = desc->hash_bkey(hash_info, hash_k); -+ if (hash_k.k->p.offset < hash) -+ goto bad_hash; -+ -+ for_each_btree_key_norestart(trans, iter, desc->btree_id, -+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), -+ BTREE_ITER_slots, k, ret) { -+ if (bkey_eq(k.k->p, hash_k.k->p)) -+ break; -+ -+ if (k.k->type == desc->key_type && -+ !desc->cmp_bkey(k, hash_k)) -+ goto duplicate_entries; -+ -+ if (bkey_deleted(k.k)) { -+ bch2_trans_iter_exit(trans, &iter); -+ goto bad_hash; -+ } -+ } -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ret; -+bad_hash: -+ /* -+ * Before doing any repair, check hash_info itself: -+ */ -+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); -+ if (ret) -+ goto out; -+ -+ if (fsck_err(trans, hash_table_key_wrong_offset, -+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", -+ bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { -+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); -+ if (IS_ERR(new)) -+ return PTR_ERR(new); -+ -+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, -+ (subvol_inum) { 0, hash_k.k->p.inode }, -+ hash_k.k->p.snapshot, new, -+ STR_HASH_must_create| -+ BTREE_ITER_with_updates| -+ BTREE_UPDATE_internal_snapshot_node); -+ ret = bkey_err(k); -+ if (ret) -+ goto out; -+ if (k.k) -+ goto duplicate_entries; -+ -+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, -+ BTREE_UPDATE_internal_snapshot_node) ?: -+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: -+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_nested; -+ goto out; -+ } -+fsck_err: -+ goto out; -+duplicate_entries: -+ ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); -+ if (ret < 0) -+ goto out; -+ -+ if (!fsck_err(trans, hash_table_key_duplicate, -+ "duplicate hash table keys%s:\n%s", -+ ret != 2 ? "" : ", both point to valid inodes", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, hash_k), -+ prt_newline(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), -+ buf.buf))) -+ goto out; -+ -+ switch (ret) { -+ case 0: -+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); -+ break; -+ case 1: -+ ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); -+ break; -+ case 2: -+ ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: -+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); -+ goto out; -+ } -+ -+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: -+ -BCH_ERR_transaction_restart_nested; -+ goto out; -+} -diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h -index ec2b1feea520..55a4ac7bf220 100644 ---- a/fs/bcachefs/str_hash.h -+++ b/fs/bcachefs/str_hash.h -@@ -160,7 +160,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, - struct bkey_s_c k; - int ret; - -- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, -+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|flags, k, ret) { -@@ -210,7 +210,7 @@ bch2_hash_hole(struct btree_trans *trans, - if (ret) - return ret; - -- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, -+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) -@@ -265,7 +265,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, - bool found = false; - int ret; - -- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, -+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(insert->k.p.inode, - desc.hash_bkey(info, bkey_i_to_s_c(insert)), - snapshot), -@@ -393,4 +393,26 @@ int bch2_hash_delete(struct btree_trans *trans, - return ret; - } - -+struct snapshots_seen; -+int __bch2_str_hash_check_key(struct btree_trans *, -+ struct snapshots_seen *, -+ const struct bch_hash_desc *, -+ struct bch_hash_info *, -+ struct btree_iter *, struct bkey_s_c); -+ -+static inline int bch2_str_hash_check_key(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ const struct bch_hash_desc *desc, -+ struct bch_hash_info *hash_info, -+ struct btree_iter *k_iter, struct bkey_s_c hash_k) -+{ -+ if (hash_k.k->type != desc->key_type) -+ return 0; -+ -+ if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) -+ return 0; -+ -+ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); -+} -+ - #endif /* _BCACHEFS_STR_HASH_H */ -diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c -index 80e5efaff524..b7b96283c316 100644 ---- a/fs/bcachefs/subvolume.c -+++ b/fs/bcachefs/subvolume.c -@@ -207,7 +207,7 @@ int bch2_check_subvol_children(struct bch_fs *c) - /* Subvolumes: */ - - int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); - int ret = 0; -@@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) - static __always_inline int - bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, -- int iter_flags, - struct bch_subvolume *s) - { - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), -- iter_flags, subvolume, s); -+ BTREE_ITER_cached| -+ BTREE_ITER_with_updates, subvolume, s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && - inconsistent_if_not_found, - trans->c, "missing subvolume %u", subvol); -@@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, - - int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, -- int iter_flags, - struct bch_subvolume *s) - { -- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); -+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); - } - - int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) - { - struct bch_subvolume s; -- int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); -+ int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); - if (ret) - return ret; - -@@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, - struct bch_snapshot snap; - - return bch2_snapshot_lookup(trans, snapshot, &snap) ?: -- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); -+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); - } - - int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, -@@ -396,8 +395,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d - struct bch_subvolume s; - - return lockrestart_do(trans, -- bch2_subvolume_get(trans, subvolid_to_delete, true, -- BTREE_ITER_cached, &s)) ?: -+ bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -@@ -411,26 +409,61 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d - */ - static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) - { -- struct btree_iter iter; -- struct bkey_s_c_subvolume subvol; -- u32 snapid; -- int ret = 0; -+ struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; - -- subvol = bch2_bkey_get_iter_typed(trans, &iter, -+ struct bkey_s_c_subvolume subvol = -+ bch2_bkey_get_iter_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached|BTREE_ITER_intent, - subvolume); -- ret = bkey_err(subvol); -+ int ret = bkey_err(subvol); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); - if (ret) -- return ret; -+ goto err; - -- snapid = le32_to_cpu(subvol.v->snapshot); -+ u32 snapid = le32_to_cpu(subvol.v->snapshot); -+ -+ struct bkey_s_c_snapshot snapshot = -+ bch2_bkey_get_iter_typed(trans, &snapshot_iter, -+ BTREE_ID_snapshots, POS(0, snapid), -+ 0, snapshot); -+ ret = bkey_err(snapshot); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -+ "missing snapshot %u", snapid); -+ if (ret) -+ goto err; - -- ret = bch2_btree_delete_at(trans, &iter, 0) ?: -+ u32 treeid = le32_to_cpu(snapshot.v->tree); -+ -+ struct bkey_s_c_snapshot_tree snapshot_tree = -+ bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, -+ BTREE_ID_snapshot_trees, POS(0, treeid), -+ 0, snapshot_tree); -+ ret = bkey_err(snapshot_tree); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -+ "missing snapshot tree %u", treeid); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { -+ struct bkey_i_snapshot_tree *snapshot_tree_mut = -+ bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter, -+ &snapshot_tree.s_c, -+ 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); -+ if (ret) -+ goto err; -+ -+ snapshot_tree_mut->v.master_subvol = 0; -+ } -+ -+ ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?: - bch2_snapshot_node_set_deleted(trans, snapid); -- bch2_trans_iter_exit(trans, &iter); -+err: -+ bch2_trans_iter_exit(trans, &snapshot_tree_iter); -+ bch2_trans_iter_exit(trans, &snapshot_iter); -+ bch2_trans_iter_exit(trans, &subvol_iter); - return ret; - } - -@@ -675,7 +708,7 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) - /* set bi_subvol on root inode */ - int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) - { -- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, -+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h -index f897d106e142..910f6196700e 100644 ---- a/fs/bcachefs/subvolume.h -+++ b/fs/bcachefs/subvolume.h -@@ -5,12 +5,11 @@ - #include "darray.h" - #include "subvolume_types.h" - --enum bch_validate_flags; -- - int bch2_check_subvols(struct bch_fs *); - int bch2_check_subvol_children(struct bch_fs *); - --int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, -@@ -25,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - - int bch2_subvol_has_children(struct btree_trans *, u32); - int bch2_subvolume_get(struct btree_trans *, unsigned, -- bool, int, struct bch_subvolume *); -+ bool, struct bch_subvolume *); - int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, - u32 *, bool); - int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -@@ -34,7 +33,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); - int bch2_subvol_is_ro(struct bch_fs *, u32); - - static inline struct bkey_s_c --bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, -+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, - u32 subvolid, unsigned flags) - { - u32 snapshot; -@@ -43,10 +42,10 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos - return bkey_s_c_err(ret); - - bch2_btree_iter_set_snapshot(iter, snapshot); -- return bch2_btree_iter_peek_upto_type(iter, end, flags); -+ return bch2_btree_iter_peek_max_type(iter, end, flags); - } - --#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ -+#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do) \ - ({ \ - struct bkey_s_c _k; \ -@@ -54,7 +53,7 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ -- (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ -+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ - _end, _subvolid, (_flags)); \ - if (!(_k).k) \ - break; \ -@@ -67,14 +66,14 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos - _ret3; \ - }) - --#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ -+#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ - _start, _end, _subvolid, _flags, _k, _do) \ - ({ \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ -- for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ -+ for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do); \ - }) - -diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h -index f2ec4277c2a5..1549d6daf7af 100644 ---- a/fs/bcachefs/subvolume_types.h -+++ b/fs/bcachefs/subvolume_types.h -@@ -9,13 +9,13 @@ typedef DARRAY(u32) snapshot_id_list; - #define IS_ANCESTOR_BITMAP 128 - - struct snapshot_t { -+ bool live; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; -- u32 equiv; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; - }; - -diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c -index 7c71594f6a8b..0d588b3337ec 100644 ---- a/fs/bcachefs/super-io.c -+++ b/fs/bcachefs/super-io.c -@@ -23,6 +23,7 @@ - - #include - #include -+#include - - static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { - }; -@@ -41,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = { - #undef x - }; - --void bch2_version_to_text(struct printbuf *out, unsigned v) -+void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) - { - const char *str = "(unknown version)"; - -@@ -54,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v) - prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); - } - --unsigned bch2_latest_compatible_version(unsigned v) -+enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) - { - if (!BCH_VERSION_MAJOR(v)) - return v; -@@ -68,6 +69,22 @@ unsigned bch2_latest_compatible_version(unsigned v) - return v; - } - -+bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) -+{ -+ bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && -+ version <= c->sb.version_incompat_allowed; -+ -+ if (ret) { -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, -+ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ return ret; -+} -+ - const char * const bch2_sb_fields[] = { - #define x(name, nr) #name, - BCH_SB_FIELDS() -@@ -355,25 +372,34 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field_members_v1 *mi; - enum bch_opt_id opt_id; -- u16 block_size; - int ret; - - ret = bch2_sb_compatible(sb, out); - if (ret) - return ret; - -- if (sb->features[1] || -- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { -- prt_printf(out, "Filesystem has incompatible features"); -+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); -+ unsigned incompat_bit = 0; -+ if (incompat) -+ incompat_bit = __ffs64(incompat); -+ else if (sb->features[1]) -+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); -+ -+ if (incompat_bit) { -+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", -+ incompat_bit, -+ bch2_sb_features[BCH_FEATURE_NR - 1], -+ BCH_FEATURE_NR - 1); - return -BCH_ERR_invalid_sb_features; - } - -- block_size = le16_to_cpu(sb->block_size); -- -- if (block_size > PAGE_SECTORS) { -- prt_printf(out, "Block size too big (got %u, max %u)", -- block_size, PAGE_SECTORS); -- return -BCH_ERR_invalid_sb_block_size; -+ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || -+ BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { -+ prt_str(out, "Filesystem has incompatible version "); -+ bch2_version_to_text(out, le16_to_cpu(sb->version)); -+ prt_str(out, ", current version "); -+ bch2_version_to_text(out, bcachefs_metadata_version_current); -+ return -BCH_ERR_invalid_sb_features; - } - - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { -@@ -406,6 +432,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - return -BCH_ERR_invalid_sb_time_precision; - } - -+ /* old versions didn't know to downgrade this field */ -+ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) -+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); -+ -+ if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { -+ prt_printf(out, "Invalid version_incompat "); -+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); -+ prt_str(out, " > incompat_allowed "); -+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); -+ if (flags & BCH_VALIDATE_write) -+ return -BCH_ERR_invalid_sb_version; -+ else -+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); -+ } -+ - if (!flags) { - /* - * Been seeing a bug where these are getting inexplicably -@@ -428,6 +469,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); - } - -+#ifdef __KERNEL__ -+ if (!BCH_SB_SHARD_INUMS_NBITS(sb)) -+ SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); -+#endif -+ - for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { - const struct bch_option *opt = bch2_opt_table + opt_id; - -@@ -519,6 +565,9 @@ static void bch2_sb_update(struct bch_fs *c) - c->sb.uuid = src->uuid; - c->sb.user_uuid = src->user_uuid; - c->sb.version = le16_to_cpu(src->version); -+ c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); -+ c->sb.version_incompat_allowed -+ = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); - c->sb.version_min = le16_to_cpu(src->version_min); - c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); - c->sb.nr_devices = src->nr_devices; -@@ -676,7 +725,8 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf - } - - enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); -- if (csum_type >= BCH_CSUM_NR) { -+ if (csum_type >= BCH_CSUM_NR || -+ bch2_csum_type_is_encryption(csum_type)) { - prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); - return -BCH_ERR_invalid_sb_csum_type; - } -@@ -878,7 +928,7 @@ static void write_super_endio(struct bio *bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "superblock %s error: %s", -- bio_data_dir(bio) ? "write" : "read", -+ str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) - ca->sb_write_error = 1; - -@@ -891,14 +941,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - -+ memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); -+ - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; -- bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); -+ bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - -- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], -- bio_sectors(bio)); -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - - percpu_ref_get(&ca->io_ref); - closure_bio_submit(bio, &c->sb_write); -@@ -1042,9 +1093,16 @@ int bch2_write_super(struct bch_fs *c) - ": Superblock write was silently dropped! (seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); -- bch2_fs_fatal_error(c, "%s", buf.buf); -+ -+ if (c->opts.errors != BCH_ON_ERROR_continue && -+ c->opts.errors != BCH_ON_ERROR_fix_safe) { -+ ret = -BCH_ERR_erofs_sb_err; -+ bch2_fs_fatal_error(c, "%s", buf.buf); -+ } else { -+ bch_err(c, "%s", buf.buf); -+ } -+ - printbuf_exit(&buf); -- ret = -BCH_ERR_erofs_sb_err; - } - - if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { -@@ -1149,6 +1207,8 @@ bool bch2_check_version_downgrade(struct bch_fs *c) - */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); -+ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) -+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->sb.version > bcachefs_metadata_version_current) - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->sb.version_min > bcachefs_metadata_version_current) -@@ -1157,7 +1217,7 @@ bool bch2_check_version_downgrade(struct bch_fs *c) - return ret; - } - --void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) -+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) - { - lockdep_assert_held(&c->sb_lock); - -@@ -1167,6 +1227,12 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) - - c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); -+ -+ if (incompat) { -+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, -+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); -+ } - } - - static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, -@@ -1331,6 +1397,14 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, - bch2_version_to_text(out, le16_to_cpu(sb->version)); - prt_newline(out); - -+ prt_printf(out, "Incompatible features allowed:\t"); -+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); -+ prt_newline(out); -+ -+ prt_printf(out, "Incompatible features in use:\t"); -+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); -+ prt_newline(out); -+ - prt_printf(out, "Version upgrade complete:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); - prt_newline(out); -diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h -index fadd364e2802..b4cff9ebdebb 100644 ---- a/fs/bcachefs/super-io.h -+++ b/fs/bcachefs/super-io.h -@@ -10,14 +10,26 @@ - - #include - -+#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 -+ - static inline bool bch2_version_compatible(u16 version) - { - return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && - version >= bcachefs_metadata_version_min; - } - --void bch2_version_to_text(struct printbuf *, unsigned); --unsigned bch2_latest_compatible_version(unsigned); -+void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); -+enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -+ -+bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -+ -+static inline bool bch2_request_incompat_feature(struct bch_fs *c, -+ enum bcachefs_metadata_version version) -+{ -+ return likely(version <= c->sb.version_incompat) -+ ? true -+ : bch2_set_version_incompat(c, version); -+} - - static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) - { -@@ -92,7 +104,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) - } - - bool bch2_check_version_downgrade(struct bch_fs *); --void bch2_sb_upgrade(struct bch_fs *, unsigned); -+void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); - - void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c -index a6ed9a0bf1c7..0459c875e189 100644 ---- a/fs/bcachefs/super.c -+++ b/fs/bcachefs/super.c -@@ -290,7 +290,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) - - bch2_fs_journal_stop(&c->journal); - -- bch_info(c, "%sshutdown complete, journal seq %llu", -+ bch_info(c, "%sclean shutdown complete, journal seq %llu", - test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", - c->journal.seq_ondisk); - -@@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) - return ret; - } - -+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) -+{ -+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); -+ -+ bch2_journal_halt_locked(&c->journal); -+ bch2_fs_read_only_async(c); -+ -+ wake_up(&bch2_read_only_wait); -+ return ret; -+} -+ - static int bch2_fs_read_write_late(struct bch_fs *c) - { - int ret; -@@ -441,6 +452,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) - { - int ret; - -+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); -+ - if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { - bch_err(c, "cannot go rw, unfixed btree errors"); - return -BCH_ERR_erofs_unfixed_errors; -@@ -561,6 +574,7 @@ static void __bch2_fs_free(struct bch_fs *c) - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_compress_exit(c); -+ bch2_fs_btree_gc_exit(c); - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - BUG_ON(atomic_read(&c->journal_keys.ref)); -@@ -584,7 +598,6 @@ static void __bch2_fs_free(struct bch_fs *c) - #endif - kfree(rcu_dereference_protected(c->disk_groups, 1)); - kfree(c->journal_seq_blacklist_table); -- kfree(c->unused_inode_hints); - - if (c->write_ref_wq) - destroy_workqueue(c->write_ref_wq); -@@ -766,21 +779,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - - refcount_set(&c->ro_ref, 1); - init_waitqueue_head(&c->ro_ref_wait); -+ spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->online_fsck_mutex, 1); - -- init_rwsem(&c->gc_lock); -- mutex_init(&c->gc_gens_lock); -- atomic_set(&c->journal_keys.ref, 1); -- c->journal_keys.initial_ref_held = true; -- - for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); - -- bch2_fs_gc_init(c); - bch2_fs_copygc_init(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_interior_update_init_early(c); -+ bch2_fs_journal_keys_init(c); - bch2_fs_allocator_background_init(c); - bch2_fs_allocator_foreground_init(c); - bch2_fs_rebalance_init(c); -@@ -809,9 +818,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - INIT_LIST_HEAD(&c->vfs_inodes_list); - mutex_init(&c->vfs_inodes_lock); - -- c->copy_gc_enabled = 1; -- c->rebalance.enabled = 1; -- - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; - c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; -@@ -873,8 +879,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - (btree_blocks(c) + 1) * 2 * - sizeof(struct sort_iter_set); - -- c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); -- - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", -@@ -901,9 +905,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || -- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || -- !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, -- sizeof(u64), GFP_KERNEL))) { -+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = -BCH_ERR_ENOMEM_fs_other_alloc; - goto err; - } -@@ -917,6 +919,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_interior_update_init(c) ?: -+ bch2_fs_btree_gc_init(c) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_subvolumes_init(c) ?: -@@ -1033,9 +1036,12 @@ int bch2_fs_start(struct bch_fs *c) - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - -+ c->recovery_task = current; - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) - ? bch2_fs_recovery(c) - : bch2_fs_initialize(c); -+ c->recovery_task = NULL; -+ - if (ret) - goto err; - -@@ -1120,12 +1126,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, - - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ' '); -- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; -+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); - prt_newline(&buf); - - prt_bdevname(&buf, sb->bdev); - prt_char(&buf, ' '); -- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; -+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); - prt_newline(&buf); - - if (!opts->no_splitbrain_check) -@@ -1198,7 +1204,7 @@ static void bch2_dev_free(struct bch_dev *ca) - - free_percpu(ca->io_done); - bch2_dev_buckets_free(ca); -- free_page((unsigned long) ca->sb_read_scratch); -+ kfree(ca->sb_read_scratch); - - bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); - bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); -@@ -1309,8 +1315,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion); - -- init_rwsem(&ca->bucket_lock); -- - INIT_WORK(&ca->io_error_work, bch2_io_error_work); - - bch2_time_stats_quantiles_init(&ca->io_latency[READ]); -@@ -1337,7 +1341,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - - if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -- !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || -+ !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || - bch2_dev_buckets_alloc(c, ca) || - !(ca->io_done = alloc_percpu(*ca->io_done))) - goto err; -@@ -1366,7 +1370,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) - { - struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - struct bch_dev *ca = NULL; -- int ret = 0; - - if (bch2_fs_init_fault("dev_alloc")) - goto err; -@@ -1378,10 +1381,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) - ca->fs = c; - - bch2_dev_attach(c, ca, dev_idx); -- return ret; -+ return 0; - err: -- if (ca) -- bch2_dev_free(ca); - return -BCH_ERR_ENOMEM_dev_alloc; - } - -@@ -1751,11 +1752,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - if (ret) - goto err; - -- ret = bch2_dev_journal_alloc(ca, true); -- bch_err_msg(c, ret, "allocating journal"); -- if (ret) -- goto err; -- - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - -@@ -1806,13 +1802,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - if (ret) - goto err_late; - -- ca->new_fs_bucket_idx = 0; -- - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - -+ ret = bch2_dev_journal_alloc(ca, false); -+ bch_err_msg(c, ret, "allocating journal"); -+ if (ret) -+ goto err_late; -+ - up_write(&c->state_lock); -- return 0; -+out: -+ printbuf_exit(&label); -+ printbuf_exit(&errbuf); -+ bch_err_fn(c, ret); -+ return ret; - - err_unlock: - mutex_unlock(&c->sb_lock); -@@ -1821,10 +1824,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - if (ca) - bch2_dev_free(ca); - bch2_free_super(&sb); -- printbuf_exit(&label); -- printbuf_exit(&errbuf); -- bch_err_fn(c, ret); -- return ret; -+ goto out; - err_late: - up_write(&c->state_lock); - ca = NULL; -diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h -index dada09331d2e..04f8287eff5c 100644 ---- a/fs/bcachefs/super.h -+++ b/fs/bcachefs/super.h -@@ -29,21 +29,12 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); - struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); - - bool bch2_fs_emergency_read_only(struct bch_fs *); -+bool bch2_fs_emergency_read_only_locked(struct bch_fs *); - void bch2_fs_read_only(struct bch_fs *); - - int bch2_fs_read_write(struct bch_fs *); - int bch2_fs_read_write_early(struct bch_fs *); - --/* -- * Only for use in the recovery/fsck path: -- */ --static inline void bch2_fs_lazy_rw(struct bch_fs *c) --{ -- if (!test_bit(BCH_FS_rw, &c->flags) && -- !test_bit(BCH_FS_was_rw, &c->flags)) -- bch2_fs_read_write_early(c); --} -- - void __bch2_fs_stop(struct bch_fs *); - void bch2_fs_free(struct bch_fs *); - void bch2_fs_stop(struct bch_fs *); -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -index 03e59f86f360..a7eb1f511484 100644 ---- a/fs/bcachefs/sysfs.c -+++ b/fs/bcachefs/sysfs.c -@@ -146,7 +146,7 @@ write_attribute(trigger_journal_writes); - write_attribute(trigger_btree_cache_shrink); - write_attribute(trigger_btree_key_cache_shrink); - write_attribute(trigger_freelist_wakeup); --rw_attribute(gc_gens_pos); -+read_attribute(gc_gens_pos); - - read_attribute(uuid); - read_attribute(minor); -@@ -203,7 +203,6 @@ read_attribute(disk_groups); - - read_attribute(has_data); - read_attribute(alloc_debug); --read_attribute(accounting); - read_attribute(usage_base); - - #define x(t, n, ...) read_attribute(t); -@@ -211,12 +210,11 @@ BCH_PERSISTENT_COUNTERS() - #undef x - - rw_attribute(discard); -+read_attribute(state); - rw_attribute(label); - --rw_attribute(copy_gc_enabled); - read_attribute(copy_gc_wait); - --rw_attribute(rebalance_enabled); - sysfs_pd_controller_attribute(rebalance); - read_attribute(rebalance_status); - -@@ -237,11 +235,6 @@ write_attribute(perf_test); - BCH_TIME_STATS() - #undef x - --static struct attribute sysfs_state_rw = { -- .name = "state", -- .mode = 0444, --}; -- - static size_t bch2_btree_cache_size(struct bch_fs *c) - { - struct btree_cache *bc = &c->btree_cache; -@@ -302,7 +295,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c - - static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) - { -- prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); -+ bch2_btree_id_to_text(out, c->gc_gens_btree); -+ prt_printf(out, ": "); - bch2_bpos_to_text(out, c->gc_gens_pos); - prt_printf(out, "\n"); - } -@@ -339,9 +333,6 @@ SHOW(bch2_fs) - if (attr == &sysfs_gc_gens_pos) - bch2_gc_gens_pos_to_text(out, c); - -- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -- -- sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); - sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - - if (attr == &sysfs_copy_gc_wait) -@@ -405,9 +396,6 @@ SHOW(bch2_fs) - if (attr == &sysfs_alloc_debug) - bch2_fs_alloc_debug_to_text(out, c); - -- if (attr == &sysfs_accounting) -- bch2_fs_accounting_to_text(out, c); -- - if (attr == &sysfs_usage_base) - bch2_fs_usage_base_to_text(out, c); - -@@ -418,23 +406,6 @@ STORE(bch2_fs) - { - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - -- if (attr == &sysfs_copy_gc_enabled) { -- ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) -- ?: (ssize_t) size; -- -- if (c->copygc_thread) -- wake_up_process(c->copygc_thread); -- return ret; -- } -- -- if (attr == &sysfs_rebalance_enabled) { -- ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) -- ?: (ssize_t) size; -- -- rebalance_wakeup(c); -- return ret; -- } -- - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - - /* Debugging: */ -@@ -534,15 +505,22 @@ SHOW(bch2_fs_counters) - - printbuf_tabstop_push(out, 32); - -- #define x(t, ...) \ -+ #define x(t, n, f, ...) \ - if (attr == &sysfs_##t) { \ - counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ - counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ -+ if (f & TYPE_SECTORS) { \ -+ counter <<= 9; \ -+ counter_since_mount <<= 9; \ -+ } \ -+ \ - prt_printf(out, "since mount:\t"); \ -+ (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ - prt_human_readable_u64(out, counter_since_mount); \ - prt_newline(out); \ - \ - prt_printf(out, "since filesystem creation:\t"); \ -+ (f & TYPE_COUNTER) ? prt_u64(out, counter) : \ - prt_human_readable_u64(out, counter); \ - prt_newline(out); \ - } -@@ -610,10 +588,8 @@ struct attribute *bch2_fs_internal_files[] = { - - &sysfs_gc_gens_pos, - -- &sysfs_copy_gc_enabled, - &sysfs_copy_gc_wait, - -- &sysfs_rebalance_enabled, - sysfs_pd_controller_files(rebalance), - - &sysfs_moving_ctxts, -@@ -622,7 +598,6 @@ struct attribute *bch2_fs_internal_files[] = { - - &sysfs_disk_groups, - &sysfs_alloc_debug, -- &sysfs_accounting, - &sysfs_usage_base, - NULL - }; -@@ -682,6 +657,13 @@ STORE(bch2_fs_opts_dir) - (id == Opt_compression && !c->opts.background_compression))) - bch2_set_rebalance_needs_scan(c, 0); - -+ if (v && id == Opt_rebalance_enabled) -+ rebalance_wakeup(c); -+ -+ if (v && id == Opt_copygc_enabled && -+ c->copygc_thread) -+ wake_up_process(c->copygc_thread); -+ - ret = size; - err: - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); -@@ -790,7 +772,7 @@ SHOW(bch2_dev) - prt_char(out, '\n'); - } - -- if (attr == &sysfs_state_rw) { -+ if (attr == &sysfs_state) { - prt_string_option(out, bch2_member_states, ca->mi.state); - prt_char(out, '\n'); - } -@@ -870,7 +852,7 @@ struct attribute *bch2_dev_files[] = { - - /* settings: */ - &sysfs_discard, -- &sysfs_state_rw, -+ &sysfs_state, - &sysfs_label, - - &sysfs_has_data, -diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c -index fb5c1543e52f..6c6469814637 100644 ---- a/fs/bcachefs/tests.c -+++ b/fs/bcachefs/tests.c -@@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) - i = 0; - - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, -+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); -@@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) - i = 0; - - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_extents, -+ for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); -@@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) - i = 0; - - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, -+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); -@@ -259,7 +259,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) - i = 0; - - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, -+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i >= nr * 2) -@@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) - i = 0; - - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_extents, -+ for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); -@@ -320,7 +320,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) - i = 0; - - ret = bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_extents, -+ for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i == nr) -@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); -@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); -@@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) - trans = bch2_trans_get(c); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, snapid_lo), 0); -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); - - BUG_ON(k.k->p.snapshot != U32_MAX); - -@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_intent); -- k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); -+ k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); - ret = bkey_err(k); - if (ret) - goto err; -@@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) - static int seq_lookup(struct bch_fs *c, u64 nr) - { - return bch2_trans_run(c, -- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, -+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, - 0)); -diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h -index 5597b9d6297f..c1b51009edf6 100644 ---- a/fs/bcachefs/trace.h -+++ b/fs/bcachefs/trace.h -@@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, - (unsigned long long)__entry->sector, __entry->nr_sector) - ); - -+/* disk_accounting.c */ -+ -+TRACE_EVENT(accounting_mem_insert, -+ TP_PROTO(struct bch_fs *c, const char *acc), -+ TP_ARGS(c, acc), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(unsigned, new_nr ) -+ __string(acc, acc ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->new_nr = c->accounting.k.nr; -+ __assign_str(acc); -+ ), -+ -+ TP_printk("%d,%d entries %u added %s", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->new_nr, -+ __get_str(acc)) -+); -+ - /* fs.c: */ - TRACE_EVENT(bch2_sync_fs, - TP_PROTO(struct super_block *sb, int wait), -@@ -703,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, - TP_ARGS(c, str) - ); - --TRACE_EVENT(discard_buckets, -+DECLARE_EVENT_CLASS(discard_buckets_class, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err), -@@ -735,6 +759,18 @@ TRACE_EVENT(discard_buckets, - __entry->err) - ); - -+DEFINE_EVENT(discard_buckets_class, discard_buckets, -+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, -+ u64 need_journal_commit, u64 discarded, const char *err), -+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err) -+); -+ -+DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, -+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, -+ u64 need_journal_commit, u64 discarded, const char *err), -+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err) -+); -+ - TRACE_EVENT(bucket_invalidate, - TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), - TP_ARGS(c, dev, bucket, sectors), -@@ -848,8 +884,8 @@ TRACE_EVENT(move_data, - TRACE_EVENT(evacuate_bucket, - TP_PROTO(struct bch_fs *c, struct bpos *bucket, - unsigned sectors, unsigned bucket_size, -- u64 fragmentation, int ret), -- TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), -+ int ret), -+ TP_ARGS(c, bucket, sectors, bucket_size, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) -@@ -857,7 +893,6 @@ TRACE_EVENT(evacuate_bucket, - __field(u64, bucket ) - __field(u32, sectors ) - __field(u32, bucket_size ) -- __field(u64, fragmentation ) - __field(int, ret ) - ), - -@@ -867,45 +902,42 @@ TRACE_EVENT(evacuate_bucket, - __entry->bucket = bucket->offset; - __entry->sectors = sectors; - __entry->bucket_size = bucket_size; -- __entry->fragmentation = fragmentation; - __entry->ret = ret; - ), - -- TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", -+ TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->member, __entry->bucket, - __entry->sectors, __entry->bucket_size, -- __entry->fragmentation, __entry->ret) -+ __entry->ret) - ); - - TRACE_EVENT(copygc, - TP_PROTO(struct bch_fs *c, -- u64 sectors_moved, u64 sectors_not_moved, -- u64 buckets_moved, u64 buckets_not_moved), -- TP_ARGS(c, -- sectors_moved, sectors_not_moved, -- buckets_moved, buckets_not_moved), -+ u64 buckets, -+ u64 sectors_seen, -+ u64 sectors_moved), -+ TP_ARGS(c, buckets, sectors_seen, sectors_moved), - - TP_STRUCT__entry( - __field(dev_t, dev ) -+ __field(u64, buckets ) -+ __field(u64, sectors_seen ) - __field(u64, sectors_moved ) -- __field(u64, sectors_not_moved ) -- __field(u64, buckets_moved ) -- __field(u64, buckets_not_moved ) - ), - - TP_fast_assign( - __entry->dev = c->dev; -+ __entry->buckets = buckets; -+ __entry->sectors_seen = sectors_seen; - __entry->sectors_moved = sectors_moved; -- __entry->sectors_not_moved = sectors_not_moved; -- __entry->buckets_moved = buckets_moved; -- __entry->buckets_not_moved = buckets_moved; - ), - -- TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", -+ TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), -- __entry->sectors_moved, __entry->sectors_not_moved, -- __entry->buckets_moved, __entry->buckets_not_moved) -+ __entry->buckets, -+ __entry->sectors_seen, -+ __entry->sectors_moved) - ); - - TRACE_EVENT(copygc_wait, -@@ -1316,6 +1348,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, - __entry->new_u64s) - ); - -+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ - TRACE_EVENT(path_downgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, -@@ -1352,10 +1390,21 @@ TRACE_EVENT(path_downgrade, - __entry->pos_snapshot) - ); - --DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, -- TP_PROTO(struct btree_trans *trans, -- unsigned long caller_ip), -- TP_ARGS(trans, caller_ip) -+TRACE_EVENT(key_cache_fill, -+ TP_PROTO(struct btree_trans *trans, const char *key), -+ TP_ARGS(trans, key), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __string(key, key ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __assign_str(key); -+ ), -+ -+ TP_printk("%s %s", __entry->trans_fn, __get_str(key)) - ); - - TRACE_EVENT(write_buffer_flush, -@@ -1414,6 +1463,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, - TP_printk("%zu/%zu", __entry->slowpath, __entry->total) - ); - -+TRACE_EVENT(write_buffer_maybe_flush, -+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), -+ TP_ARGS(trans, caller_ip, key), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __string(key, key ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __assign_str(key); -+ ), -+ -+ TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) -+); -+ - DEFINE_EVENT(fs_str, rebalance_extent, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -index e0a876cbaa6b..da2cd11b3025 100644 ---- a/fs/bcachefs/util.c -+++ b/fs/bcachefs/util.c -@@ -653,19 +653,25 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) - return 0; - } - --size_t bch2_rand_range(size_t max) -+u64 bch2_get_random_u64_below(u64 ceil) - { -- size_t rand; -+ if (ceil <= U32_MAX) -+ return __get_random_u32_below(ceil); - -- if (!max) -- return 0; -+ /* this is the same (clever) algorithm as in __get_random_u32_below() */ -+ u64 rand = get_random_u64(); -+ u64 mult = ceil * rand; - -- do { -- rand = get_random_long(); -- rand &= roundup_pow_of_two(max) - 1; -- } while (rand >= max); -+ if (unlikely(mult < ceil)) { -+ u64 bound; -+ div64_u64_rem(-ceil, ceil, &bound); -+ while (unlikely(mult < bound)) { -+ rand = get_random_u64(); -+ mult = ceil * rand; -+ } -+ } - -- return rand; -+ return mul_u64_u64_shr(ceil, rand, 64); - } - - void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h -index fb02c1c36004..cb20cd7a108a 100644 ---- a/fs/bcachefs/util.h -+++ b/fs/bcachefs/util.h -@@ -55,6 +55,16 @@ static inline size_t buf_pages(void *p, size_t len) - PAGE_SIZE); - } - -+static inline void *bch2_kvmalloc(size_t n, gfp_t flags) -+{ -+ void *p = unlikely(n >= INT_MAX) -+ ? vmalloc(n) -+ : kvmalloc(n, flags & ~__GFP_ZERO); -+ if (p && (flags & __GFP_ZERO)) -+ memset(p, 0, n); -+ return p; -+} -+ - #define init_heap(heap, _size, gfp) \ - ({ \ - (heap)->nr = 0; \ -@@ -317,6 +327,19 @@ do { \ - _ptr ? container_of(_ptr, type, member) : NULL; \ - }) - -+static inline struct list_head *list_pop(struct list_head *head) -+{ -+ if (list_empty(head)) -+ return NULL; -+ -+ struct list_head *ret = head->next; -+ list_del_init(ret); -+ return ret; -+} -+ -+#define list_pop_entry(head, type, member) \ -+ container_of_or_null(list_pop(head), type, member) -+ - /* Does linear interpolation between powers of two */ - static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) - { -@@ -378,7 +401,7 @@ do { \ - _ret; \ - }) - --size_t bch2_rand_range(size_t); -+u64 bch2_get_random_u64_below(u64); - - void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); - void memcpy_from_bio(void *, struct bio *, struct bvec_iter); -@@ -696,4 +719,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) - return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; - } - -+static inline void memcpy_swab(void *_dst, void *_src, size_t len) -+{ -+ u8 *dst = _dst + len; -+ u8 *src = _src; -+ -+ while (len--) -+ *--dst = *src++; -+} -+ - #endif /* _BCACHEFS_UTIL_H */ -diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c -index 6a78553d9b0c..6620ecae26af 100644 ---- a/fs/bcachefs/varint.c -+++ b/fs/bcachefs/varint.c -@@ -9,6 +9,7 @@ - #include - #endif - -+#include "errcode.h" - #include "varint.h" - - /** -@@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) - u64 v; - - if (unlikely(in + bytes > end)) -- return -1; -+ return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - __le64 v_le = 0; -@@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) - unsigned bytes = ffz(*in) + 1; - - if (unlikely(in + bytes > end)) -- return -1; -+ return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - v >>= bytes; -diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c -index 952aca400faf..aed7c6984173 100644 ---- a/fs/bcachefs/xattr.c -+++ b/fs/bcachefs/xattr.c -@@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { - }; - - int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, -- enum bch_validate_flags flags) -+ struct bkey_validate_context from) - { - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, -@@ -309,7 +309,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) - u64 offset = 0, inum = inode->ei_inode.bi_inum; - - int ret = bch2_trans_run(c, -- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, -+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, - POS(inum, offset), - POS(inum, U64_MAX), - inode->ei_inum.subvol, 0, k, ({ -@@ -565,13 +565,6 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, - ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); - err: - mutex_unlock(&inode->ei_update_lock); -- -- if (value && -- (opt_id == Opt_background_target || -- opt_id == Opt_background_compression || -- (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) -- bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); -- - err_class_exit: - return bch2_err_class(ret); - } -@@ -609,7 +602,7 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { - - #endif /* NO_BCACHEFS_FS */ - --const struct xattr_handler *bch2_xattr_handlers[] = { -+const struct xattr_handler * const bch2_xattr_handlers[] = { - &bch_xattr_user_handler, - &bch_xattr_trusted_handler, - &bch_xattr_security_handler, -diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h -index c188a5ad64ce..132fbbd15a66 100644 ---- a/fs/bcachefs/xattr.h -+++ b/fs/bcachefs/xattr.h -@@ -6,7 +6,8 @@ - - extern const struct bch_hash_desc bch2_xattr_hash_desc; - --int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -+int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, -+ struct bkey_validate_context); - void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ -@@ -44,6 +45,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum, - - ssize_t bch2_xattr_list(struct dentry *, char *, size_t); - --extern const struct xattr_handler *bch2_xattr_handlers[]; -+extern const struct xattr_handler * const bch2_xattr_handlers[]; - - #endif /* _BCACHEFS_XATTR_H */ -diff --git a/fs/fs_parser.c b/fs/fs_parser.c -index 24727ec34e5a..6521e9a9d6ef 100644 ---- a/fs/fs_parser.c -+++ b/fs/fs_parser.c -@@ -13,7 +13,7 @@ - #include - #include "internal.h" - --static const struct constant_table bool_names[] = { -+const struct constant_table bool_names[] = { - { "0", false }, - { "1", true }, - { "false", false }, -@@ -22,6 +22,7 @@ static const struct constant_table bool_names[] = { - { "yes", true }, - { }, - }; -+EXPORT_SYMBOL(bool_names); - - static const struct constant_table * - __lookup_constant(const struct constant_table *tbl, const char *name) -diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h -index 6cf713a7e6c6..0974cd33bcba 100644 ---- a/include/linux/fs_parser.h -+++ b/include/linux/fs_parser.h -@@ -83,6 +83,8 @@ extern int fs_lookup_param(struct fs_context *fc, - - extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); - -+extern const struct constant_table bool_names[]; -+ - #ifdef CONFIG_VALIDATE_FS_PARSER - extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, - int low, int high, int special); -diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h -index 43a7b9dcf15e..fe17b4828171 100644 ---- a/include/linux/min_heap.h -+++ b/include/linux/min_heap.h -@@ -15,8 +15,8 @@ - */ - #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ - struct _name { \ -- int nr; \ -- int size; \ -+ size_t nr; \ -+ size_t size; \ - _type *data; \ - _type preallocated[_nr]; \ - } --- -2.45.3 - diff --git a/sys-kernel/hardened-kernel/files/linux-6.14/1190_bcachefs-revert-6.14-backport-fixes.patch b/sys-kernel/hardened-kernel/files/linux-6.14/1190_bcachefs-revert-6.14-backport-fixes.patch deleted file mode 100644 index f82566d..0000000 --- a/sys-kernel/hardened-kernel/files/linux-6.14/1190_bcachefs-revert-6.14-backport-fixes.patch +++ /dev/null @@ -1,128 +0,0 @@ -From ee3912c8c293b09acc90ba6ad7443ceacc33ef79 Mon Sep 17 00:00:00 2001 -From: Alexander Miroshnichenko -Date: Wed, 14 May 2025 16:48:38 +0300 -Subject: [PATCH] bcachefs: revert 6.14 backport fixes -Content-Type: text/plain; charset="utf-8" -Content-Transfer-Encoding: 8bit - -Signed-off-by: Alexander Miroshnichenko ---- - fs/bcachefs/btree_update_interior.c | 17 +---------------- - fs/bcachefs/error.c | 8 -------- - fs/bcachefs/error.h | 2 -- - fs/bcachefs/fs-ioctl.c | 6 ++---- - fs/bcachefs/xattr_format.h | 8 +------- - 5 files changed, 4 insertions(+), 37 deletions(-) - -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -index e9be8b5571a4..e4e7c804625e 100644 ---- a/fs/bcachefs/btree_update_interior.c -+++ b/fs/bcachefs/btree_update_interior.c -@@ -35,8 +35,6 @@ static const char * const bch2_btree_update_modes[] = { - NULL - }; - --static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); -- - static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, struct keylist *); - static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -@@ -1784,24 +1782,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t - int ret; - - lockdep_assert_held(&c->gc_lock); -+ BUG_ON(!btree_node_intent_locked(path, b->c.level)); - BUG_ON(!b->c.level); - BUG_ON(!as || as->b); - bch2_verify_keylist_sorted(keys); - -- if (!btree_node_intent_locked(path, b->c.level)) { -- struct printbuf buf = PRINTBUF; -- bch2_log_msg_start(c, &buf); -- prt_printf(&buf, "%s(): node not locked at level %u\n", -- __func__, b->c.level); -- bch2_btree_update_to_text(&buf, as); -- bch2_btree_path_to_text(&buf, trans, path_idx); -- -- bch2_print_string_as_lines(KERN_ERR, buf.buf); -- printbuf_exit(&buf); -- bch2_fs_emergency_read_only(c); -- return -EIO; -- } -- - ret = bch2_btree_node_lock_write(trans, path, &b->c); - if (ret) - return ret; -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -index 6cbf4819e923..038da6a61f6b 100644 ---- a/fs/bcachefs/error.c -+++ b/fs/bcachefs/error.c -@@ -11,14 +11,6 @@ - - #define FSCK_ERR_RATELIMIT_NR 10 - --void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) --{ --#ifdef BCACHEFS_LOG_PREFIX -- prt_printf(out, bch2_log_msg(c, "")); --#endif -- printbuf_indent_add(out, 2); --} -- - bool bch2_inconsistent_error(struct bch_fs *c) - { - set_bit(BCH_FS_error, &c->flags); -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -index 5730eb6b2f38..7acf2a27ca28 100644 ---- a/fs/bcachefs/error.h -+++ b/fs/bcachefs/error.h -@@ -18,8 +18,6 @@ struct work_struct; - - /* Error messages: */ - --void bch2_log_msg_start(struct bch_fs *, struct printbuf *); -- - /* - * Inconsistency errors: The on disk data is inconsistent. If these occur during - * initial recovery, they don't indicate a bug in the running code - we walk all -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -index 4d6193820483..15725b4ce393 100644 ---- a/fs/bcachefs/fs-ioctl.c -+++ b/fs/bcachefs/fs-ioctl.c -@@ -515,12 +515,10 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, - ret = -ENOENT; - goto err; - } -- -- ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: -- __bch2_unlink(dir, victim, true); -+ ret = __bch2_unlink(dir, victim, true); - if (!ret) { - fsnotify_rmdir(dir, victim); -- d_invalidate(victim); -+ d_delete(victim); - } - err: - inode_unlock(dir); -diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h -index 67426e33d04e..c7916011ef34 100644 ---- a/fs/bcachefs/xattr_format.h -+++ b/fs/bcachefs/xattr_format.h -@@ -13,13 +13,7 @@ struct bch_xattr { - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; -- /* -- * x_name contains the name and value counted by -- * x_name_len + x_val_len. The introduction of -- * __counted_by(x_name_len) caused a false positive -- * detection of an out of bounds write. -- */ -- __u8 x_name[]; -+ __u8 x_name[] __counted_by(x_name_len); - } __packed __aligned(8); - - #endif /* _BCACHEFS_XATTR_FORMAT_H */ --- -2.49.0 - diff --git a/sys-kernel/hardened-kernel/files/linux-6.14/1191_bcachefs-cherry-pick-updates-from-master-17227e8.patch b/sys-kernel/hardened-kernel/files/linux-6.14/1191_bcachefs-cherry-pick-updates-from-master-17227e8.patch deleted file mode 100644 index a14f9fd..0000000 --- a/sys-kernel/hardened-kernel/files/linux-6.14/1191_bcachefs-cherry-pick-updates-from-master-17227e8.patch +++ /dev/null @@ -1,41408 +0,0 @@ -From daf3d0f8fb4768e7c05f1fac44b217f4437ce04b Mon Sep 17 00:00:00 2001 -From: Alexander Miroshnichenko -Date: Thu, 22 May 2025 13:15:09 +0300 -Subject: [PATCH] bcachefs: cherry-pick updates from master 17227e8 -Content-Type: text/plain; charset="utf-8" -Content-Transfer-Encoding: 8bit - -Signed-off-by: Alexander Miroshnichenko ---- - .../bcachefs/SubmittingPatches.rst | 43 +- - .../filesystems/bcachefs/casefolding.rst | 108 ++ - .../filesystems/bcachefs/future/idle_work.rst | 78 ++ - Documentation/filesystems/bcachefs/index.rst | 27 +- - MAINTAINERS | 7 + - block/bdev.c | 2 + - block/blk-core.c | 19 +- - fs/bcachefs/Kconfig | 16 +- - fs/bcachefs/Makefile | 8 +- - fs/bcachefs/acl.c | 4 +- - fs/bcachefs/alloc_background.c | 428 ++++--- - fs/bcachefs/alloc_background.h | 9 +- - fs/bcachefs/alloc_foreground.c | 650 +++++------ - fs/bcachefs/alloc_foreground.h | 92 +- - fs/bcachefs/alloc_types.h | 18 +- - fs/bcachefs/async_objs.c | 132 +++ - fs/bcachefs/async_objs.h | 44 + - fs/bcachefs/async_objs_types.h | 25 + - fs/bcachefs/backpointers.c | 553 +++++---- - fs/bcachefs/backpointers.h | 40 +- - fs/bcachefs/bcachefs.h | 252 ++-- - fs/bcachefs/bcachefs_format.h | 124 +- - fs/bcachefs/bcachefs_ioctl.h | 29 +- - fs/bcachefs/bkey.c | 47 +- - fs/bcachefs/bkey.h | 4 +- - fs/bcachefs/bkey_methods.c | 26 +- - fs/bcachefs/bset.c | 64 +- - fs/bcachefs/bset.h | 22 +- - fs/bcachefs/btree_cache.c | 212 ++-- - fs/bcachefs/btree_gc.c | 102 +- - fs/bcachefs/btree_gc.h | 3 +- - fs/bcachefs/btree_io.c | 616 +++++++--- - fs/bcachefs/btree_io.h | 16 +- - fs/bcachefs/btree_iter.c | 420 ++++--- - fs/bcachefs/btree_iter.h | 213 ++-- - fs/bcachefs/btree_journal_iter.c | 5 +- - fs/bcachefs/btree_key_cache.c | 96 +- - fs/bcachefs/btree_key_cache.h | 3 +- - fs/bcachefs/btree_locking.c | 73 +- - fs/bcachefs/btree_locking.h | 41 +- - fs/bcachefs/btree_node_scan.c | 63 +- - fs/bcachefs/btree_node_scan_types.h | 2 +- - fs/bcachefs/btree_trans_commit.c | 199 ++-- - fs/bcachefs/btree_types.h | 47 +- - fs/bcachefs/btree_update.c | 95 +- - fs/bcachefs/btree_update.h | 71 +- - fs/bcachefs/btree_update_interior.c | 338 +++--- - fs/bcachefs/btree_update_interior.h | 11 +- - fs/bcachefs/btree_write_buffer.c | 38 +- - fs/bcachefs/btree_write_buffer.h | 1 + - fs/bcachefs/btree_write_buffer_types.h | 2 +- - fs/bcachefs/buckets.c | 303 ++--- - fs/bcachefs/buckets.h | 58 +- - fs/bcachefs/buckets_types.h | 32 + - fs/bcachefs/chardev.c | 58 +- - fs/bcachefs/checksum.c | 270 ++--- - fs/bcachefs/checksum.h | 7 +- - fs/bcachefs/compress.c | 74 +- - fs/bcachefs/data_update.c | 330 ++++-- - fs/bcachefs/data_update.h | 44 +- - fs/bcachefs/debug.c | 119 +- - fs/bcachefs/debug.h | 20 +- - fs/bcachefs/dirent.c | 294 ++++- - fs/bcachefs/dirent.h | 30 +- - fs/bcachefs/dirent_format.h | 20 +- - fs/bcachefs/disk_accounting.c | 170 ++- - fs/bcachefs/disk_accounting.h | 54 +- - fs/bcachefs/disk_accounting_format.h | 90 +- - fs/bcachefs/disk_accounting_types.h | 2 +- - fs/bcachefs/disk_groups.c | 148 ++- - fs/bcachefs/ec.c | 754 ++++++------ - fs/bcachefs/ec.h | 56 +- - fs/bcachefs/ec_types.h | 19 +- - fs/bcachefs/enumerated_ref.c | 144 +++ - fs/bcachefs/enumerated_ref.h | 66 ++ - fs/bcachefs/enumerated_ref_types.h | 19 + - fs/bcachefs/errcode.h | 77 +- - fs/bcachefs/error.c | 387 +++++-- - fs/bcachefs/error.h | 113 +- - fs/bcachefs/extent_update.c | 67 +- - fs/bcachefs/extent_update.h | 2 +- - fs/bcachefs/extents.c | 379 ++++-- - fs/bcachefs/extents.h | 34 +- - fs/bcachefs/extents_format.h | 24 +- - fs/bcachefs/extents_types.h | 12 +- - fs/bcachefs/eytzinger.c | 76 +- - fs/bcachefs/eytzinger.h | 95 +- - fs/bcachefs/fast_list.c | 156 +++ - fs/bcachefs/fast_list.h | 41 + - fs/bcachefs/fs-io-buffered.c | 59 +- - fs/bcachefs/fs-io-direct.c | 27 +- - fs/bcachefs/fs-io-pagecache.c | 18 +- - fs/bcachefs/fs-io.c | 97 +- - fs/bcachefs/fs-ioctl.c | 215 +--- - fs/bcachefs/fs-ioctl.h | 73 -- - fs/bcachefs/fs.c | 672 ++++++++--- - fs/bcachefs/fsck.c | 734 +++++------- - fs/bcachefs/inode.c | 204 ++-- - fs/bcachefs/inode.h | 46 +- - fs/bcachefs/inode_format.h | 13 +- - fs/bcachefs/io_misc.c | 21 +- - fs/bcachefs/io_read.c | 1024 ++++++++++------- - fs/bcachefs/io_read.h | 111 +- - fs/bcachefs/io_write.c | 539 +++++---- - fs/bcachefs/io_write.h | 38 +- - fs/bcachefs/io_write_types.h | 34 +- - fs/bcachefs/journal.c | 317 +++-- - fs/bcachefs/journal.h | 52 +- - fs/bcachefs/journal_io.c | 283 +++-- - fs/bcachefs/journal_io.h | 2 +- - fs/bcachefs/journal_reclaim.c | 72 +- - fs/bcachefs/journal_sb.c | 2 +- - fs/bcachefs/journal_seq_blacklist.c | 7 +- - fs/bcachefs/journal_types.h | 39 +- - fs/bcachefs/lru.c | 107 +- - fs/bcachefs/lru.h | 22 +- - fs/bcachefs/lru_format.h | 6 +- - fs/bcachefs/migrate.c | 143 ++- - fs/bcachefs/migrate.h | 3 +- - fs/bcachefs/move.c | 641 +++++++---- - fs/bcachefs/move.h | 17 +- - fs/bcachefs/move_types.h | 28 +- - fs/bcachefs/movinggc.c | 233 ++-- - fs/bcachefs/movinggc.h | 11 +- - fs/bcachefs/{fs-common.c => namei.c} | 501 ++++++-- - fs/bcachefs/{fs-common.h => namei.h} | 38 +- - fs/bcachefs/nocow_locking.c | 4 +- - fs/bcachefs/nocow_locking.h | 2 +- - fs/bcachefs/opts.c | 314 +++-- - fs/bcachefs/opts.h | 111 +- - fs/bcachefs/printbuf.c | 19 + - fs/bcachefs/printbuf.h | 1 + - fs/bcachefs/progress.c | 61 + - fs/bcachefs/progress.h | 29 + - fs/bcachefs/quota.c | 2 +- - fs/bcachefs/rcu_pending.c | 3 +- - fs/bcachefs/rebalance.c | 292 ++++- - fs/bcachefs/rebalance.h | 8 +- - fs/bcachefs/rebalance_types.h | 5 + - fs/bcachefs/recovery.c | 163 ++- - fs/bcachefs/recovery.h | 3 +- - fs/bcachefs/recovery_passes.c | 590 +++++++--- - fs/bcachefs/recovery_passes.h | 26 +- - fs/bcachefs/recovery_passes_format.h | 104 ++ - fs/bcachefs/recovery_passes_types.h | 93 +- - fs/bcachefs/reflink.c | 59 +- - fs/bcachefs/sb-counters.c | 90 +- - fs/bcachefs/sb-counters.h | 4 + - fs/bcachefs/sb-counters_format.h | 33 +- - fs/bcachefs/sb-downgrade.c | 22 +- - fs/bcachefs/sb-errors_format.h | 30 +- - fs/bcachefs/sb-errors_types.h | 2 +- - fs/bcachefs/sb-members.c | 83 +- - fs/bcachefs/sb-members.h | 95 +- - fs/bcachefs/sb-members_format.h | 7 + - fs/bcachefs/sb-members_types.h | 1 + - fs/bcachefs/snapshot.c | 539 ++++++--- - fs/bcachefs/snapshot.h | 34 +- - fs/bcachefs/snapshot_format.h | 4 +- - fs/bcachefs/snapshot_types.h | 58 + - fs/bcachefs/str_hash.c | 143 ++- - fs/bcachefs/str_hash.h | 29 +- - fs/bcachefs/subvolume.c | 71 +- - fs/bcachefs/subvolume.h | 20 +- - fs/bcachefs/subvolume_types.h | 27 - - fs/bcachefs/super-io.c | 183 ++- - fs/bcachefs/super-io.h | 11 +- - fs/bcachefs/super.c | 971 +++++++++++----- - fs/bcachefs/super.h | 11 +- - fs/bcachefs/super_types.h | 8 +- - fs/bcachefs/sysfs.c | 344 ++++-- - fs/bcachefs/sysfs.h | 5 +- - fs/bcachefs/tests.c | 34 +- - fs/bcachefs/thread_with_file.c | 4 +- - fs/bcachefs/thread_with_file_types.h | 2 +- - fs/bcachefs/time_stats.c | 20 +- - fs/bcachefs/time_stats.h | 1 + - fs/bcachefs/trace.h | 159 +-- - fs/bcachefs/util.c | 274 ++++- - fs/bcachefs/util.h | 100 +- - fs/bcachefs/xattr.c | 31 +- - fs/bcachefs/xattr.h | 4 +- - fs/bcachefs/xattr_format.h | 8 +- - fs/dcache.c | 267 +++++ - fs/libfs.c | 1 + - fs/overlayfs/params.c | 20 +- - fs/overlayfs/util.c | 19 +- - fs/super.c | 1 + - fs/xfs/xfs_super.c | 3 +- - {fs/bcachefs => include/linux}/darray.h | 59 +- - include/linux/darray_types.h | 33 + - include/linux/dcache.h | 12 + - include/linux/fs.h | 6 + - include/linux/seq_buf.h | 4 + - include/linux/shrinker.h | 13 +- - include/linux/sort.h | 11 + - lib/Makefile | 2 +- - {fs/bcachefs => lib}/darray.c | 9 +- - lib/seq_buf.c | 10 + - lib/sort.c | 110 +- - mm/oom_kill.c | 23 - - mm/show_mem.c | 50 + - mm/shrinker.c | 95 +- - mm/shrinker_debug.c | 18 + - mm/slab.h | 6 +- - mm/slab_common.c | 52 +- - 206 files changed, 14908 insertions(+), 7898 deletions(-) - create mode 100644 Documentation/filesystems/bcachefs/casefolding.rst - create mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst - create mode 100644 fs/bcachefs/async_objs.c - create mode 100644 fs/bcachefs/async_objs.h - create mode 100644 fs/bcachefs/async_objs_types.h - create mode 100644 fs/bcachefs/enumerated_ref.c - create mode 100644 fs/bcachefs/enumerated_ref.h - create mode 100644 fs/bcachefs/enumerated_ref_types.h - create mode 100644 fs/bcachefs/fast_list.c - create mode 100644 fs/bcachefs/fast_list.h - rename fs/bcachefs/{fs-common.c => namei.c} (52%) - rename fs/bcachefs/{fs-common.h => namei.h} (52%) - create mode 100644 fs/bcachefs/progress.c - create mode 100644 fs/bcachefs/progress.h - create mode 100644 fs/bcachefs/recovery_passes_format.h - create mode 100644 fs/bcachefs/snapshot_types.h - rename {fs/bcachefs => include/linux}/darray.h (64%) - create mode 100644 include/linux/darray_types.h - rename {fs/bcachefs => lib}/darray.c (75%) - -diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst -index 026b12ae0d6a..a455f9cfd15c 100644 ---- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst -+++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst -@@ -1,8 +1,13 @@ --Submitting patches to bcachefs: --=============================== -+Submitting patches to bcachefs -+============================== -+ -+Here are suggestions for submitting patches to bcachefs subsystem. -+ -+Submission checklist -+-------------------- - - Patches must be tested before being submitted, either with the xfstests suite --[0], or the full bcachefs test suite in ktest [1], depending on what's being -+[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being - touched. Note that ktest wraps xfstests and will be an easier method to running - it for most users; it includes single-command wrappers for all the mainstream - in-kernel local filesystems. -@@ -26,21 +31,21 @@ considered out of date), but try not to deviate too much without reason. - Focus on writing code that reads well and is organized well; code should be - aesthetically pleasing. - --CI: --=== -+CI -+-- - - Instead of running your tests locally, when running the full test suite it's - prefereable to let a server farm do it in parallel, and then have the results - in a nice test dashboard (which can tell you which failures are new, and - presents results in a git log view, avoiding the need for most bisecting). - --That exists [2], and community members may request an account. If you work for -+That exists [2]_, and community members may request an account. If you work for - a big tech company, you'll need to help out with server costs to get access - - but the CI is not restricted to running bcachefs tests: it runs any ktest test - (which generally makes it easy to wrap other tests that can run in qemu). - --Other things to think about: --============================ -+Other things to think about -+--------------------------- - - - How will we debug this code? Is there sufficient introspection to diagnose - when something starts acting wonky on a user machine? -@@ -79,20 +84,22 @@ Other things to think about: - tested? (Automated tests exists but aren't in the CI, due to the hassle of - disk image management; coordinate to have them run.) - --Mailing list, IRC: --================== -+Mailing list, IRC -+----------------- - --Patches should hit the list [3], but much discussion and code review happens on --IRC as well [4]; many people appreciate the more conversational approach and --quicker feedback. -+Patches should hit the list [3]_, but much discussion and code review happens -+on IRC as well [4]_; many people appreciate the more conversational approach -+and quicker feedback. - - Additionally, we have a lively user community doing excellent QA work, which - exists primarily on IRC. Please make use of that resource; user feedback is - important for any nontrivial feature, and documenting it in commit messages - would be a good idea. - --[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git --[1]: https://evilpiepirate.org/git/ktest.git/ --[2]: https://evilpiepirate.org/~testdashboard/ci/ --[3]: linux-bcachefs@vger.kernel.org --[4]: irc.oftc.net#bcache, #bcachefs-dev -+.. rubric:: References -+ -+.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git -+.. [1] https://evilpiepirate.org/git/ktest.git/ -+.. [2] https://evilpiepirate.org/~testdashboard/ci/ -+.. [3] linux-bcachefs@vger.kernel.org -+.. [4] irc.oftc.net#bcache, #bcachefs-dev -diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst -new file mode 100644 -index 000000000000..871a38f557e8 ---- /dev/null -+++ b/Documentation/filesystems/bcachefs/casefolding.rst -@@ -0,0 +1,108 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+Casefolding -+=========== -+ -+bcachefs has support for case-insensitive file and directory -+lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`) -+casefolding attributes. -+ -+The main usecase for casefolding is compatibility with software written -+against other filesystems that rely on casefolded lookups -+(eg. NTFS and Wine/Proton). -+Taking advantage of file-system level casefolding can lead to great -+loading time gains in many applications and games. -+ -+Casefolding support requires a kernel with the `CONFIG_UNICODE` enabled. -+Once a directory has been flagged for casefolding, a feature bit -+is enabled on the superblock which marks the filesystem as using -+casefolding. -+When the feature bit for casefolding is enabled, it is no longer possible -+to mount that filesystem on kernels without `CONFIG_UNICODE` enabled. -+ -+On the lookup/query side: casefolding is implemented by allocating a new -+string of `BCH_NAME_MAX` length using the `utf8_casefold` function to -+casefold the query string. -+ -+On the dirent side: casefolding is implemented by ensuring the `bkey`'s -+hash is made from the casefolded string and storing the cached casefolded -+name with the regular name in the dirent. -+ -+The structure looks like this: -+ -+* Regular: [dirent data][regular name][nul][nul]... -+* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... -+ -+(Do note, the number of NULs here is merely for illustration; their count can -+vary per-key, and they may not even be present if the key is aligned to -+`sizeof(u64)`.) -+ -+This is efficient as it means that for all file lookups that require casefolding, -+it has identical performance to a regular lookup: -+a hash comparison and a `memcmp` of the name. -+ -+Rationale -+--------- -+ -+Several designs were considered for this system: -+One was to introduce a dirent_v2, however that would be painful especially as -+the hash system only has support for a single key type. This would also need -+`BCH_NAME_MAX` to change between versions, and a new feature bit. -+ -+Another option was to store without the two lengths, and just take the length of -+the regular name and casefolded name contiguously / 2 as the length. This would -+assume that the regular length == casefolded length, but that could potentially -+not be true, if the uppercase unicode glyph had a different UTF-8 encoding than -+the lowercase unicode glyph. -+It would be possible to disregard the casefold cache for those cases, but it was -+decided to simply encode the two string lengths in the key to avoid random -+performance issues if this edgecase was ever hit. -+ -+The option settled on was to use a free-bit in d_type to mark a dirent as having -+a casefold cache, and then treat the first 4 bytes the name block as lengths. -+You can see this in the `d_cf_name_block` member of union in `bch_dirent`. -+ -+The feature bit was used to allow casefolding support to be enabled for the majority -+of users, but some allow users who have no need for the feature to still use bcachefs as -+`CONFIG_UNICODE` can increase the kernel side a significant amount due to the tables used, -+which may be decider between using bcachefs for eg. embedded platforms. -+ -+Other filesystems like ext4 and f2fs have a super-block level option for casefolding -+encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose -+any encodings than a single UTF-8 version. When future encodings are desirable, -+they will be added trivially using the opts mechanism. -+ -+dentry/dcache considerations -+---------------------------- -+ -+Currently, in casefolded directories, bcachefs (like other filesystems) will not cache -+negative dentry's. -+ -+This is because currently doing so presents a problem in the following scenario: -+ -+ - Lookup file "blAH" in a casefolded directory -+ - Creation of file "BLAH" in a casefolded directory -+ - Lookup file "blAH" in a casefolded directory -+ -+This would fail if negative dentry's were cached. -+ -+This is slightly suboptimal, but could be fixed in future with some vfs work. -+ -+ -+References -+---------- -+ -+(from Peter Anvin, on the list) -+ -+It is worth noting that Microsoft has basically declared their -+"recommended" case folding (upcase) table to be permanently frozen (for -+new filesystem instances in the case where they use an on-disk -+translation table created at format time.) As far as I know they have -+never supported anything other than 1:1 conversion of BMP code points, -+nor normalization. -+ -+The exFAT specification enumerates the full recommended upcase table, -+although in a somewhat annoying format (basically a hex dump of -+compressed data): -+ -+https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification -diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst -new file mode 100644 -index 000000000000..59a332509dcd ---- /dev/null -+++ b/Documentation/filesystems/bcachefs/future/idle_work.rst -@@ -0,0 +1,78 @@ -+Idle/background work classes design doc: -+ -+Right now, our behaviour at idle isn't ideal, it was designed for servers that -+would be under sustained load, to keep pending work at a "medium" level, to -+let work build up so we can process it in more efficient batches, while also -+giving headroom for bursts in load. -+ -+But for desktops or mobile - scenarios where work is less sustained and power -+usage is more important - we want to operate differently, with a "rush to -+idle" so the system can go to sleep. We don't want to be dribbling out -+background work while the system should be idle. -+ -+The complicating factor is that there are a number of background tasks, which -+form a heirarchy (or a digraph, depending on how you divide it up) - one -+background task may generate work for another. -+ -+Thus proper idle detection needs to model this heirarchy. -+ -+- Foreground writes -+- Page cache writeback -+- Copygc, rebalance -+- Journal reclaim -+ -+When we implement idle detection and rush to idle, we need to be careful not -+to disturb too much the existing behaviour that works reasonably well when the -+system is under sustained load (or perhaps improve it in the case of -+rebalance, which currently does not actively attempt to let work batch up). -+ -+SUSTAINED LOAD REGIME -+--------------------- -+ -+When the system is under continuous load, we want these jobs to run -+continuously - this is perhaps best modelled with a P/D controller, where -+they'll be trying to keep a target value (i.e. fragmented disk space, -+available journal space) roughly in the middle of some range. -+ -+The goal under sustained load is to balance our ability to handle load spikes -+without running out of x resource (free disk space, free space in the -+journal), while also letting some work accumululate to be batched (or become -+unnecessary). -+ -+For example, we don't want to run copygc too aggressively, because then it -+will be evacuating buckets that would have become empty (been overwritten or -+deleted) anyways, and we don't want to wait until we're almost out of free -+space because then the system will behave unpredicably - suddenly we're doing -+a lot more work to service each write and the system becomes much slower. -+ -+IDLE REGIME -+----------- -+ -+When the system becomes idle, we should start flushing our pending work -+quicker so the system can go to sleep. -+ -+Note that the definition of "idle" depends on where in the heirarchy a task -+is - a task should start flushing work more quickly when the task above it has -+stopped generating new work. -+ -+e.g. rebalance should start flushing more quickly when page cache writeback is -+idle, and journal reclaim should only start flushing more quickly when both -+copygc and rebalance are idle. -+ -+It's important to let work accumulate when more work is still incoming and we -+still have room, because flushing is always more efficient if we let it batch -+up. New writes may overwrite data before rebalance moves it, and tasks may be -+generating more updates for the btree nodes that journal reclaim needs to flush. -+ -+On idle, how much work we do at each interval should be proportional to the -+length of time we have been idle for. If we're idle only for a short duration, -+we shouldn't flush everything right away; the system might wake up and start -+generating new work soon, and flushing immediately might end up doing a lot of -+work that would have been unnecessary if we'd allowed things to batch more. -+ -+To summarize, we will need: -+ -+ - A list of classes for background tasks that generate work, which will -+ include one "foreground" class. -+ - Tracking for each class - "Am I doing work, or have I gone to sleep?" -+ - And each class should check the class above it when deciding how much work to issue. -diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst -index 7db4d7ceab58..e5c4c2120b93 100644 ---- a/Documentation/filesystems/bcachefs/index.rst -+++ b/Documentation/filesystems/bcachefs/index.rst -@@ -4,10 +4,35 @@ - bcachefs Documentation - ====================== - -+Subsystem-specific development process notes -+-------------------------------------------- -+ -+Development notes specific to bcachefs. These are intended to supplement -+:doc:`general kernel development handbook `. -+ - .. toctree:: -- :maxdepth: 2 -+ :maxdepth: 1 - :numbered: - - CodingStyle - SubmittingPatches -+ -+Filesystem implementation -+------------------------- -+ -+Documentation for filesystem features and their implementation details. -+At this moment, only a few of these are described here. -+ -+.. toctree:: -+ :maxdepth: 1 -+ :numbered: -+ -+ casefolding - errorcodes -+ -+Future design -+------------- -+.. toctree:: -+ :maxdepth: 1 -+ -+ future/idle_work -diff --git a/MAINTAINERS b/MAINTAINERS -index 00e94bec401e..82d236baff32 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -6404,6 +6404,13 @@ F: net/ax25/ax25_out.c - F: net/ax25/ax25_timer.c - F: net/ax25/sysctl_net_ax25.c - -+DARRAY -+M: Kent Overstreet -+L: linux-bcachefs@vger.kernel.org -+S: Maintained -+F: include/linux/darray.h -+F: include/linux/darray_types.h -+ - DATA ACCESS MONITOR - M: SeongJae Park - L: damon@lists.linux.dev -diff --git a/block/bdev.c b/block/bdev.c -index 5aebcf437f17..d909d87f857c 100644 ---- a/block/bdev.c -+++ b/block/bdev.c -@@ -178,6 +178,8 @@ EXPORT_SYMBOL(set_blocksize); - - int sb_set_blocksize(struct super_block *sb, int size) - { -+ if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE) -+ return 0; - if (set_blocksize(sb->s_bdev_file, size)) - return 0; - /* If we get here, we know size is power of two -diff --git a/block/blk-core.c b/block/blk-core.c -index d6c4fa3943b5..7b1103eb877d 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -793,20 +793,21 @@ void submit_bio_noacct(struct bio *bio) - goto end_io; - } - -+ if (WARN_ON_ONCE((bio->bi_opf & REQ_PREFLUSH) && -+ bio_op(bio) != REQ_OP_WRITE && -+ bio_op(bio) != REQ_OP_ZONE_APPEND)) -+ goto end_io; -+ - /* - * Filter flush bio's early so that bio based drivers without flush - * support don't have to worry about them. - */ -- if (op_is_flush(bio->bi_opf)) { -- if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && -- bio_op(bio) != REQ_OP_ZONE_APPEND)) -+ if (op_is_flush(bio->bi_opf) && -+ !bdev_write_cache(bdev)) { -+ bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); -+ if (!bio_sectors(bio)) { -+ status = BLK_STS_OK; - goto end_io; -- if (!bdev_write_cache(bdev)) { -- bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); -- if (!bio_sectors(bio)) { -- status = BLK_STS_OK; -- goto end_io; -- } - } - } - -diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig -index fc7efd0a7525..ca165415b1d8 100644 ---- a/fs/bcachefs/Kconfig -+++ b/fs/bcachefs/Kconfig -@@ -15,10 +15,9 @@ config BCACHEFS_FS - select ZLIB_INFLATE - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS -- select CRYPTO -- select CRYPTO_SHA256 -- select CRYPTO_CHACHA20 -- select CRYPTO_POLY1305 -+ select CRYPTO_LIB_SHA256 -+ select CRYPTO_LIB_CHACHA -+ select CRYPTO_LIB_POLY1305 - select KEYS - select RAID6_PQ - select XOR_BLOCKS -@@ -26,6 +25,7 @@ config BCACHEFS_FS - select SRCU - select SYMBOLIC_ERRNAME - select MIN_HEAP -+ select XARRAY_MULTI - help - The bcachefs filesystem - a modern, copy on write filesystem, with - support for multiple devices, compression, checksumming, etc. -@@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS - Enable extra tracepoints for debugging btree_path operations; we don't - normally want these enabled because they happen at very high rates. - -+config BCACHEFS_TRANS_KMALLOC_TRACE -+ bool "Trace bch2_trans_kmalloc() calls" -+ depends on BCACHEFS_FS -+ -+config BCACHEFS_ASYNC_OBJECT_LISTS -+ bool "Keep async objects on fast_lists for debugfs visibility" -+ depends on BCACHEFS_FS && DEBUG_FS -+ - config MEAN_AND_VARIANCE_UNIT_TEST - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS - depends on KUNIT -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -index d2689388d5e8..d71621711cfa 100644 ---- a/fs/bcachefs/Makefile -+++ b/fs/bcachefs/Makefile -@@ -28,20 +28,20 @@ bcachefs-y := \ - checksum.o \ - clock.o \ - compress.o \ -- darray.o \ - data_update.o \ - debug.o \ - dirent.o \ - disk_accounting.o \ - disk_groups.o \ - ec.o \ -+ enumerated_ref.o \ - errcode.o \ - error.o \ - extents.o \ - extent_update.o \ - eytzinger.o \ -+ fast_list.o \ - fs.o \ -- fs-common.o \ - fs-ioctl.o \ - fs-io.o \ - fs-io-buffered.o \ -@@ -64,9 +64,11 @@ bcachefs-y := \ - migrate.o \ - move.o \ - movinggc.o \ -+ namei.o \ - nocow_locking.o \ - opts.o \ - printbuf.o \ -+ progress.o \ - quota.o \ - rebalance.o \ - rcu_pending.o \ -@@ -96,6 +98,8 @@ bcachefs-y := \ - varint.o \ - xattr.o - -+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o -+ - obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o - - # Silence "note: xyz changed in GCC X.X" messages -diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c -index 99487727ae64..d03adc36100e 100644 ---- a/fs/bcachefs/acl.c -+++ b/fs/bcachefs/acl.c -@@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct posix_acl *acl = NULL; - - if (rcu) -@@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, - { - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl; - umode_t mode; -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -index 3ea809990ef1..a38b9c6c891e 100644 ---- a/fs/bcachefs/alloc_background.c -+++ b/fs/bcachefs/alloc_background.c -@@ -17,6 +17,7 @@ - #include "debug.h" - #include "disk_accounting.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "lru.h" - #include "recovery.h" -@@ -232,7 +233,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), -- c, alloc_v2_unpack_error, -+ c, alloc_v3_unpack_error, - "unpack error"); - fsck_err: - return ret; -@@ -308,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - "data type inconsistency"); - - bkey_fsck_err_on(!a.io_time[READ] && -- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, -+ !(c->recovery.passes_to_run & -+ BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), - c, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time == 0"); - break; -@@ -478,12 +480,27 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, - enum btree_iter_update_trigger_flags flags) - { - struct btree_iter iter; -- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); -- int ret = PTR_ERR_OR_ZERO(a); -- if (ret) -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, -+ BTREE_ITER_with_updates| -+ BTREE_ITER_cached| -+ BTREE_ITER_intent); -+ int ret = bkey_err(k); -+ if (unlikely(ret)) - return ERR_PTR(ret); - -- ret = bch2_trans_update(trans, &iter, &a->k_i, flags); -+ if ((void *) k.v >= trans->mem && -+ (void *) k.v < trans->mem + trans->mem_top) { -+ bch2_trans_iter_exit(trans, &iter); -+ return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); -+ } -+ -+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); -+ if (IS_ERR(a)) { -+ bch2_trans_iter_exit(trans, &iter); -+ return a; -+ } -+ -+ ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); - bch2_trans_iter_exit(trans, &iter); - return unlikely(ret) ? ERR_PTR(ret) : a; - } -@@ -589,6 +606,8 @@ int bch2_bucket_gens_init(struct bch_fs *c) - - int bch2_alloc_read(struct bch_fs *c) - { -+ down_read(&c->state_lock); -+ - struct btree_trans *trans = bch2_trans_get(c); - struct bch_dev *ca = NULL; - int ret; -@@ -608,7 +627,7 @@ int bch2_alloc_read(struct bch_fs *c) - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { -- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - -@@ -629,17 +648,17 @@ int bch2_alloc_read(struct bch_fs *c) - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { -- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - if (k.k->p.offset < ca->mi.first_bucket) { -- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); - continue; - } - - if (k.k->p.offset >= ca->mi.nbuckets) { -- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - -@@ -652,6 +671,7 @@ int bch2_alloc_read(struct bch_fs *c) - bch2_dev_put(ca); - bch2_trans_put(trans); - -+ up_read(&c->state_lock); - bch_err_fn(c, ret); - return ret; - } -@@ -673,8 +693,7 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, - bch2_bkey_val_to_text(&buf, c, alloc_k); - - int ret = __bch2_fsck_err(NULL, trans, flags, err_id, -- "bucket incorrectly %sset in %s btree\n" -- " %s", -+ "bucket incorrectly %sset in %s btree\n%s", - set ? "" : "un", - bch2_btree_id_str(btree), - buf.buf); -@@ -777,14 +796,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s - s64 delta_sectors, - s64 delta_fragmented, unsigned flags) - { -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_dev_data_type, -- .dev_data_type.dev = ca->dev_idx, -- .dev_data_type.data_type = data_type, -- }; - s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - -- return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); -+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, -+ d, dev_data_type, -+ .dev = ca->dev_idx, -+ .data_type = data_type); - } - - int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, -@@ -837,7 +854,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, - - struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); - if (!ca) -- return -EIO; -+ return -BCH_ERR_trigger_alloc; - - struct bch_alloc_v4 old_a_convert; - const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); -@@ -871,6 +888,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, - if (data_type_is_empty(new_a->data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(new_a) && - !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { -+ if (new_a->oldest_gen == new_a->gen && -+ !bch2_bucket_sectors_total(*new_a)) -+ new_a->oldest_gen++; - new_a->gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); - alloc_data_type_set(new_a, new_a->data_type); -@@ -889,26 +909,20 @@ int bch2_trigger_alloc(struct btree_trans *trans, - !new_a->io_time[READ]) - new_a->io_time[READ] = bch2_current_io_time(c, READ); - -- u64 old_lru = alloc_lru_idx_read(*old_a); -- u64 new_lru = alloc_lru_idx_read(*new_a); -- if (old_lru != new_lru) { -- ret = bch2_lru_change(trans, new.k->p.inode, -- bucket_to_u64(new.k->p), -- old_lru, new_lru); -- if (ret) -- goto err; -- } -+ ret = bch2_lru_change(trans, new.k->p.inode, -+ bucket_to_u64(new.k->p), -+ alloc_lru_idx_read(*old_a), -+ alloc_lru_idx_read(*new_a)); -+ if (ret) -+ goto err; - -- old_lru = alloc_lru_idx_fragmentation(*old_a, ca); -- new_lru = alloc_lru_idx_fragmentation(*new_a, ca); -- if (old_lru != new_lru) { -- ret = bch2_lru_change(trans, -- BCH_LRU_FRAGMENTATION_START, -- bucket_to_u64(new.k->p), -- old_lru, new_lru); -- if (ret) -- goto err; -- } -+ ret = bch2_lru_change(trans, -+ BCH_LRU_BUCKET_FRAGMENTATION, -+ bucket_to_u64(new.k->p), -+ alloc_lru_idx_fragmentation(*old_a, ca), -+ alloc_lru_idx_fragmentation(*new_a, ca)); -+ if (ret) -+ goto err; - - if (old_a->gen != new_a->gen) { - ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); -@@ -916,15 +930,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, - goto err; - } - -- if ((flags & BTREE_TRIGGER_bucket_invalidate) && -- old_a->cached_sectors) { -- ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, -- -((s64) old_a->cached_sectors), -- flags & BTREE_TRIGGER_gc); -- if (ret) -- goto err; -- } -- - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); - if (ret) - goto err; -@@ -1032,9 +1037,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, - bch2_dev_put(ca); - return ret; - invalid_bucket: -- bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", -+ bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", - (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); -- ret = -EIO; -+ ret = -BCH_ERR_trigger_alloc; - goto err; - } - -@@ -1042,9 +1047,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, - * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for - * extents style btrees, but works on non-extents btrees: - */ --static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) -+static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end, struct bkey *hole) - { -- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - - if (bkey_err(k)) - return k; -@@ -1055,9 +1061,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos - struct btree_iter iter2; - struct bpos next; - -- bch2_trans_copy_iter(&iter2, iter); -+ bch2_trans_copy_iter(trans, &iter2, iter); - -- struct btree_path *path = btree_iter_path(iter->trans, iter); -+ struct btree_path *path = btree_iter_path(trans, iter); - if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); - -@@ -1067,9 +1073,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos - * btree node min/max is a closed interval, upto takes a half - * open interval: - */ -- k = bch2_btree_iter_peek_max(&iter2, end); -+ k = bch2_btree_iter_peek_max(trans, &iter2, end); - next = iter2.pos; -- bch2_trans_iter_exit(iter->trans, &iter2); -+ bch2_trans_iter_exit(trans, &iter2); - - BUG_ON(next.offset >= iter->pos.offset + U32_MAX); - -@@ -1110,13 +1116,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck - return *ca != NULL; - } - --static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, -- struct bch_dev **ca, struct bkey *hole) -+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_dev **ca, struct bkey *hole) - { -- struct bch_fs *c = iter->trans->c; -+ struct bch_fs *c = trans->c; - struct bkey_s_c k; - again: -- k = bch2_get_key_or_hole(iter, POS_MAX, hole); -+ k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); - if (bkey_err(k)) - return k; - -@@ -1129,7 +1136,7 @@ static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, - if (!next_bucket(c, ca, &hole_start)) - return bkey_s_c_null; - -- bch2_btree_iter_set_pos(iter, hole_start); -+ bch2_btree_iter_set_pos(trans, iter, hole_start); - goto again; - } - -@@ -1170,8 +1177,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - -- bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); -- k = bch2_btree_iter_peek_slot(discard_iter); -+ bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); -+ k = bch2_btree_iter_peek_slot(trans, discard_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1184,8 +1191,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, - goto err; - } - -- bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); -- k = bch2_btree_iter_peek_slot(freespace_iter); -+ bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); -+ k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1198,16 +1205,15 @@ int bch2_check_alloc_key(struct btree_trans *trans, - goto err; - } - -- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); -- k = bch2_btree_iter_peek_slot(bucket_gens_iter); -+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); -+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), - trans, bucket_gens_key_wrong, -- "incorrect gen in bucket_gens btree (got %u should be %u)\n" -- " %s", -+ "incorrect gen in bucket_gens btree (got %u should be %u)\n%s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -@@ -1253,9 +1259,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - if (!ca->mi.freespace_initialized) - return 0; - -- bch2_btree_iter_set_pos(freespace_iter, start); -+ bch2_btree_iter_set_pos(trans, freespace_iter, start); - -- k = bch2_btree_iter_peek_slot(freespace_iter); -+ k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1265,7 +1271,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - if (fsck_err_on(k.k->type != KEY_TYPE_set, - trans, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" -- " device %llu buckets %llu-%llu", -+ "device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset)) { -@@ -1304,9 +1310,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - unsigned i, gens_offset, gens_end_offset; - int ret; - -- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); -+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - -- k = bch2_btree_iter_peek_slot(bucket_gens_iter); -+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1383,7 +1389,7 @@ static void check_discard_freespace_key_work(struct work_struct *work) - container_of(work, struct check_discard_freespace_key_async, work); - - bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); -- bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); -+ enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); - kfree(w); - } - -@@ -1424,7 +1430,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a))) { - if (fsck_err(trans, need_discard_freespace_key_bad, -- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", -+ "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, -@@ -1439,7 +1445,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite - *gen = a->gen; - out: - fsck_err: -- bch2_set_btree_iter_dontneed(&alloc_iter); -+ bch2_set_btree_iter_dontneed(trans, &alloc_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -@@ -1460,7 +1466,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite - if (!w) - goto out; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { - kfree(w); - goto out; - } -@@ -1505,7 +1511,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, - struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); - if (!ca) { - if (fsck_err(trans, bucket_gens_to_invalid_dev, -- "bucket_gens key for invalid device:\n %s", -+ "bucket_gens key for invalid device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; -@@ -1514,7 +1520,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, - if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, - trans, bucket_gens_to_invalid_buckets, -- "bucket_gens key for invalid buckets:\n %s", -+ "bucket_gens key for invalid buckets:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; -@@ -1576,7 +1582,7 @@ int bch2_check_alloc_info(struct bch_fs *c) - - bch2_trans_begin(trans); - -- k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); -+ k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; -@@ -1614,7 +1620,7 @@ int bch2_check_alloc_info(struct bch_fs *c) - if (ret) - goto bkey_err; - -- bch2_btree_iter_set_pos(&iter, next); -+ bch2_btree_iter_set_pos(trans, &iter, next); - bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; -@@ -1642,7 +1648,7 @@ int bch2_check_alloc_info(struct bch_fs *c) - BTREE_ITER_prefetch); - while (1) { - bch2_trans_begin(trans); -- k = bch2_btree_iter_peek(&iter); -+ k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - -@@ -1661,7 +1667,7 @@ int bch2_check_alloc_info(struct bch_fs *c) - break; - } - -- bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); -+ bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); - } - bch2_trans_iter_exit(trans, &iter); - if (ret) -@@ -1689,7 +1695,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct printbuf buf = PRINTBUF; - int ret; - -- alloc_k = bch2_btree_iter_peek(alloc_iter); -+ alloc_k = bch2_btree_iter_peek(trans, alloc_iter); - if (!alloc_k.k) - return 0; - -@@ -1705,7 +1711,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - if (lru_idx) { -- ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, -+ ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, -+ bucket_to_u64(alloc_k.k->p), - lru_idx, alloc_k, last_flushed); - if (ret) - goto err; -@@ -1716,8 +1723,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - - if (fsck_err_on(!a->io_time[READ], - trans, alloc_key_cached_but_read_time_zero, -- "cached bucket with read_time 0\n" -- " %s", -+ "cached bucket with read_time 0\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_alloc_v4 *a_mut = -@@ -1735,7 +1741,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - a = &a_mut->v; - } - -- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], -+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, -+ bucket_to_u64(alloc_k.k->p), -+ a->io_time[READ], - alloc_k, last_flushed); - if (ret) - goto err; -@@ -1757,7 +1765,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); -+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: -+ bch2_check_stripe_to_lru_refs(c); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); -@@ -1814,7 +1823,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, - { - struct bch_fs *c = trans->c; - struct bpos pos = need_discard_iter->pos; -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - struct printbuf buf = PRINTBUF; -@@ -1868,7 +1877,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, - s->discarded++; - *discard_pos_done = iter.pos; - -- if (ca->mi.discard && !c->opts.nochanges) { -+ if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree -@@ -1897,7 +1906,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, - if (ret) - goto out; - -- count_event(c, bucket_discard); -+ if (!fastpath) -+ count_event(c, bucket_discard); -+ else -+ count_event(c, bucket_discard_fast); - out: - fsck_err: - if (discard_locked) -@@ -1935,26 +1947,26 @@ static void bch2_do_discards_work(struct work_struct *work) - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); - -- percpu_ref_put(&ca->io_ref); -- bch2_write_ref_put(c, BCH_WRITE_REF_discard); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); - } - - void bch2_dev_do_discards(struct bch_dev *ca) - { - struct bch_fs *c = ca->fs; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) - return; - -- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) -+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) - goto put_write_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_work)) - return; - -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); - put_write_ref: -- bch2_write_ref_put(c, BCH_WRITE_REF_discard); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); - } - - void bch2_do_discards(struct bch_fs *c) -@@ -2030,8 +2042,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) - trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - - bch2_trans_put(trans); -- percpu_ref_put(&ca->io_ref); -- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); - } - - static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) -@@ -2041,30 +2053,88 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) - if (discard_in_flight_add(ca, bucket, false)) - return; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) - return; - -- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) -+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) - return; - -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); - put_ref: -- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -+} -+ -+static int invalidate_one_bp(struct btree_trans *trans, -+ struct bch_dev *ca, -+ struct bkey_s_c_backpointer bp, -+ struct bkey_buf *last_flushed) -+{ -+ struct btree_iter extent_iter; -+ struct bkey_s_c extent_k = -+ bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); -+ int ret = bkey_err(extent_k); -+ if (ret) -+ return ret; -+ -+ if (!extent_k.k) -+ return 0; -+ -+ struct bkey_i *n = -+ bch2_bkey_make_mut(trans, &extent_iter, &extent_k, -+ BTREE_UPDATE_internal_snapshot_node); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); -+err: -+ bch2_trans_iter_exit(trans, &extent_iter); -+ return ret; -+} -+ -+static int invalidate_one_bucket_by_bps(struct btree_trans *trans, -+ struct bch_dev *ca, -+ struct bpos bucket, -+ u8 gen, -+ struct bkey_buf *last_flushed) -+{ -+ struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); -+ struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); -+ -+ return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, -+ bp_start, bp_end, 0, k, -+ NULL, NULL, -+ BCH_WATERMARK_btree| -+ BCH_TRANS_COMMIT_no_enospc, ({ -+ if (k.k->type != KEY_TYPE_backpointer) -+ continue; -+ -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -+ -+ if (bp.v->bucket_gen != gen) -+ continue; -+ -+ /* filter out bps with gens that don't match */ -+ -+ invalidate_one_bp(trans, ca, bp, last_flushed); -+ })); - } - -+noinline_for_stack - static int invalidate_one_bucket(struct btree_trans *trans, -+ struct bch_dev *ca, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, -+ struct bkey_buf *last_flushed, - s64 *nr_to_invalidate) - { - struct bch_fs *c = trans->c; -- struct bkey_i_alloc_v4 *a = NULL; - struct printbuf buf = PRINTBUF; - struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); -- unsigned cached_sectors; -+ struct btree_iter alloc_iter = {}; - int ret = 0; - - if (*nr_to_invalidate <= 0) -@@ -2081,35 +2151,40 @@ static int invalidate_one_bucket(struct btree_trans *trans, - if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) - return 0; - -- a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); -- ret = PTR_ERR_OR_ZERO(a); -+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, -+ BTREE_ID_alloc, bucket, -+ BTREE_ITER_cached); -+ ret = bkey_err(alloc_k); - if (ret) -- goto out; -+ return ret; -+ -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - - /* We expect harmless races here due to the btree write buffer: */ -- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) -+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) - goto out; - -- BUG_ON(a->v.data_type != BCH_DATA_cached); -- BUG_ON(a->v.dirty_sectors); -- -- if (!a->v.cached_sectors) -- bch_err(c, "invalidating empty bucket, confused"); -+ /* -+ * Impossible since alloc_lru_idx_read() only returns nonzero if the -+ * bucket is supposed to be on the cached bucket LRU (i.e. -+ * BCH_DATA_cached) -+ * -+ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 -+ */ -+ BUG_ON(a->data_type != BCH_DATA_cached); -+ BUG_ON(a->dirty_sectors); - -- cached_sectors = a->v.cached_sectors; -+ if (!a->cached_sectors) { -+ bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, -+ true, last_flushed); -+ goto out; -+ } - -- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); -- a->v.gen++; -- a->v.data_type = 0; -- a->v.dirty_sectors = 0; -- a->v.stripe_sectors = 0; -- a->v.cached_sectors = 0; -- a->v.io_time[READ] = bch2_current_io_time(c, READ); -- a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); -+ unsigned cached_sectors = a->cached_sectors; -+ u8 gen = a->gen; - -- ret = bch2_trans_commit(trans, NULL, NULL, -- BCH_WATERMARK_btree| -- BCH_TRANS_COMMIT_no_enospc); -+ ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); - if (ret) - goto out; - -@@ -2117,6 +2192,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, - --*nr_to_invalidate; - out: - fsck_err: -+ bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; - } -@@ -2126,9 +2202,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter - { - struct bkey_s_c k; - again: -- k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); -+ k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); - if (!k.k && !*wrapped) { -- bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); -+ bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); - *wrapped = true; - goto again; - } -@@ -2143,6 +2219,10 @@ static void bch2_do_invalidates_work(struct work_struct *work) - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - -+ struct bkey_buf last_flushed; -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ - ret = bch2_btree_write_buffer_tryflush(trans); - if (ret) - goto err; -@@ -2167,38 +2247,39 @@ static void bch2_do_invalidates_work(struct work_struct *work) - if (!k.k) - break; - -- ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); -+ ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); - restart_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - } - bch2_trans_iter_exit(trans, &iter); - err: - bch2_trans_put(trans); -- percpu_ref_put(&ca->io_ref); -- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -+ bch2_bkey_buf_exit(&last_flushed, c); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); - } - - void bch2_dev_do_invalidates(struct bch_dev *ca) - { - struct bch_fs *c = ca->fs; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) - return; - -- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) -+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->invalidate_work)) - return; - -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); - put_ref: -- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); - } - - void bch2_do_invalidates(struct bch_fs *c) -@@ -2243,7 +2324,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - break; - } - -- k = bch2_get_key_or_hole(&iter, end, &hole); -+ k = bch2_get_key_or_hole(trans, &iter, end, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; -@@ -2262,7 +2343,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - if (ret) - goto bkey_err; - -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - } else { - struct bkey_i *freespace; - -@@ -2282,7 +2363,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - if (ret) - goto bkey_err; - -- bch2_btree_iter_set_pos(&iter, k.k->p); -+ bch2_btree_iter_set_pos(trans, &iter, k.k->p); - } - bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -2309,14 +2390,16 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - - int bch2_fs_freespace_init(struct bch_fs *c) - { -- int ret = 0; -- bool doing_init = false; -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) -+ return 0; -+ - - /* - * We can crash during the device add path, so we need to check this on - * every mount: - */ - -+ bool doing_init = false; - for_each_member_device(c, ca) { - if (ca->mi.freespace_initialized) - continue; -@@ -2326,7 +2409,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) - doing_init = true; - } - -- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); -+ int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - if (ret) { - bch2_dev_put(ca); - bch_err_fn(c, ret); -@@ -2356,8 +2439,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) - * We clear the LRU and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ -- ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: -- bch2_btree_delete_range(c, BTREE_ID_lru, start, end, -+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_norun, NULL) ?: -@@ -2420,15 +2502,15 @@ void bch2_recalc_capacity(struct bch_fs *c) - - lockdep_assert_held(&c->state_lock); - -- for_each_online_member(c, ca) { -- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; -- -- ra_pages += bdi->ra_pages; -- } -+ rcu_read_lock(); -+ for_each_member_device_rcu(c, ca, NULL) { -+ struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); -+ if (bdev) -+ ra_pages += bdev->bd_disk->bdi->ra_pages; - -- bch2_set_ra_pages(c, ra_pages); -+ if (ca->mi.state != BCH_MEMBER_STATE_rw) -+ continue; - -- for_each_rw_member(c, ca) { - u64 dev_reserve = 0; - - /* -@@ -2465,6 +2547,9 @@ void bch2_recalc_capacity(struct bch_fs *c) - bucket_size_max = max_t(unsigned, bucket_size_max, - ca->mi.bucket_size); - } -+ rcu_read_unlock(); -+ -+ bch2_set_ra_pages(c, ra_pages); - - gc_reserve = c->opts.gc_reserve_bytes - ? c->opts.gc_reserve_bytes >> 9 -@@ -2487,27 +2572,41 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) - { - u64 ret = U64_MAX; - -- for_each_rw_member(c, ca) -+ rcu_read_lock(); -+ for_each_rw_member_rcu(c, ca) - ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); -+ rcu_read_unlock(); - return ret; - } - - static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) - { - struct open_bucket *ob; -- bool ret = false; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { -- spin_lock(&ob->lock); -- if (ob->valid && !ob->on_partial_list && -- ob->dev == ca->dev_idx) -- ret = true; -- spin_unlock(&ob->lock); -+ scoped_guard(spinlock, &ob->lock) { -+ if (ob->valid && !ob->on_partial_list && -+ ob->dev == ca->dev_idx) -+ return true; -+ } - } - -- return ret; -+ return false; -+} -+ -+void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) -+{ -+ /* BCH_DATA_free == all rw devs */ -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ if (rw && -+ (i == BCH_DATA_free || -+ (ca->mi.data_allowed & BIT(i)))) -+ set_bit(ca->dev_idx, c->rw_devs[i].d); -+ else -+ clear_bit(ca->dev_idx, c->rw_devs[i].d); - } - - /* device goes ro: */ -@@ -2516,9 +2615,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) - lockdep_assert_held(&c->state_lock); - - /* First, remove device from allocation groups: */ -- -- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -- clear_bit(ca->dev_idx, c->rw_devs[i].d); -+ bch2_dev_allocator_set_rw(c, ca, false); - - c->rw_devs_change_count++; - -@@ -2552,10 +2649,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) - { - lockdep_assert_held(&c->state_lock); - -- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -- if (ca->mi.data_allowed & (1 << i)) -- set_bit(ca->dev_idx, c->rw_devs[i].d); -- -+ bch2_dev_allocator_set_rw(c, ca, true); - c->rw_devs_change_count++; - } - -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -index de25ba4ee94b..4f94c6a661bf 100644 ---- a/fs/bcachefs/alloc_background.h -+++ b/fs/bcachefs/alloc_background.h -@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - if (a.stripe) - return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; - if (bch2_bucket_sectors_dirty(a)) -- return data_type; -+ return bucket_data_type(data_type); - if (a.cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) -@@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, - { - u64 want_free = ca->mi.nbuckets >> 7; - u64 free = max_t(s64, 0, -- u.d[BCH_DATA_free].buckets -- + u.d[BCH_DATA_need_discard].buckets -+ u.buckets[BCH_DATA_free] -+ + u.buckets[BCH_DATA_need_discard] - - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - -- return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); -+ return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); - } - - void bch2_dev_do_invalidates(struct bch_dev *); -@@ -350,6 +350,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); - void bch2_recalc_capacity(struct bch_fs *); - u64 bch2_min_rw_member_capacity(struct bch_fs *); - -+void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); - void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); - void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); - -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -index 5a781fb4c794..1a52c12c51ae 100644 ---- a/fs/bcachefs/alloc_foreground.c -+++ b/fs/bcachefs/alloc_foreground.c -@@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) - - void bch2_open_bucket_write_error(struct bch_fs *c, - struct open_buckets *obs, -- unsigned dev) -+ unsigned dev, int err) - { - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->dev == dev && ob->ec) -- bch2_ec_bucket_cancel(c, ob); -+ bch2_ec_bucket_cancel(c, ob, err); - } - - static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -@@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) - - static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) - { -- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) -+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) - return false; - - return bch2_is_superblock_bucket(ca, b); -@@ -179,29 +179,12 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) - closure_wake_up(&c->freelist_wait); - } - --static inline unsigned open_buckets_reserved(enum bch_watermark watermark) --{ -- switch (watermark) { -- case BCH_WATERMARK_interior_updates: -- return 0; -- case BCH_WATERMARK_reclaim: -- return OPEN_BUCKETS_COUNT / 6; -- case BCH_WATERMARK_btree: -- case BCH_WATERMARK_btree_copygc: -- return OPEN_BUCKETS_COUNT / 4; -- case BCH_WATERMARK_copygc: -- return OPEN_BUCKETS_COUNT / 3; -- default: -- return OPEN_BUCKETS_COUNT / 2; -- } --} -- - static inline bool may_alloc_bucket(struct bch_fs *c, -- struct bpos bucket, -- struct bucket_alloc_state *s) -+ struct alloc_request *req, -+ struct bpos bucket) - { - if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { -- s->skipped_open++; -+ req->counters.skipped_open++; - return false; - } - -@@ -210,36 +193,37 @@ static inline bool may_alloc_bucket(struct bch_fs *c, - bucket.inode, bucket.offset); - if (journal_seq_ready > c->journal.flushed_seq_ondisk) { - if (journal_seq_ready > c->journal.flushing_seq) -- s->need_journal_commit++; -- s->skipped_need_journal_commit++; -+ req->counters.need_journal_commit++; -+ req->counters.skipped_need_journal_commit++; - return false; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { -- s->skipped_nocow++; -+ req->counters.skipped_nocow++; - return false; - } - - return true; - } - --static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, -+ struct alloc_request *req, - u64 bucket, u8 gen, -- enum bch_watermark watermark, -- struct bucket_alloc_state *s, - struct closure *cl) - { -+ struct bch_dev *ca = req->ca; -+ - if (unlikely(is_superblock_bucket(c, ca, bucket))) - return NULL; - - if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { -- s->skipped_nouse++; -+ req->counters.skipped_nouse++; - return NULL; - } - - spin_lock(&c->freelist_lock); - -- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { -+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - -@@ -251,7 +235,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * - /* Recheck under lock: */ - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - spin_unlock(&c->freelist_lock); -- s->skipped_open++; -+ req->counters.skipped_open++; - return NULL; - } - -@@ -275,16 +259,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * - return ob; - } - --static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, -- enum bch_watermark watermark, -- struct bucket_alloc_state *s, -+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, -+ struct alloc_request *req, - struct btree_iter *freespace_iter, - struct closure *cl) - { - struct bch_fs *c = trans->c; - u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - -- if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) -+ if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) - return NULL; - - u8 gen; -@@ -294,7 +277,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc - if (ret) - return NULL; - -- return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); -+ return __try_alloc_bucket(c, req, b, gen, cl); - } - - /* -@@ -302,17 +285,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc - */ - static noinline struct open_bucket * - bch2_bucket_alloc_early(struct btree_trans *trans, -- struct bch_dev *ca, -- enum bch_watermark watermark, -- struct bucket_alloc_state *s, -+ struct alloc_request *req, - struct closure *cl) - { - struct bch_fs *c = trans->c; -+ struct bch_dev *ca = req->ca; - struct btree_iter iter, citer; - struct bkey_s_c k, ck; - struct open_bucket *ob = NULL; - u64 first_bucket = ca->mi.first_bucket; -- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; -+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max(first_bucket, *dev_alloc_cursor); - u64 alloc_cursor = alloc_start; - int ret; -@@ -334,19 +316,19 @@ bch2_bucket_alloc_early(struct btree_trans *trans, - if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) - break; - -- if (s->btree_bitmap != BTREE_BITMAP_ANY && -- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, -+ if (req->btree_bitmap != BTREE_BITMAP_ANY && -+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { -- if (s->btree_bitmap == BTREE_BITMAP_YES && -+ if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - break; - - bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket) + 1, - 1ULL << ca->mi.btree_bitmap_shift)); -- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); -- s->buckets_seen++; -- s->skipped_mi_btree_bitmap++; -+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); -+ req->counters.buckets_seen++; -+ req->counters.skipped_mi_btree_bitmap++; - continue; - } - -@@ -365,14 +347,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans, - if (a->data_type != BCH_DATA_free) - goto next; - -- s->buckets_seen++; -+ req->counters.buckets_seen++; - -- ob = may_alloc_bucket(c, k.k->p, s) -- ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, -- watermark, s, cl) -+ ob = may_alloc_bucket(c, req, k.k->p) -+ ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) - : NULL; - next: -- bch2_set_btree_iter_dontneed(&citer); -+ bch2_set_btree_iter_dontneed(trans, &citer); - bch2_trans_iter_exit(trans, &citer); - if (ob) - break; -@@ -395,15 +376,14 @@ bch2_bucket_alloc_early(struct btree_trans *trans, - } - - static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, -- struct bch_dev *ca, -- enum bch_watermark watermark, -- struct bucket_alloc_state *s, -- struct closure *cl) -+ struct alloc_request *req, -+ struct closure *cl) - { -+ struct bch_dev *ca = req->ca; - struct btree_iter iter; - struct bkey_s_c k; - struct open_bucket *ob = NULL; -- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; -+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); - u64 alloc_cursor = alloc_start; - int ret; -@@ -419,13 +399,13 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - iter.k.size = iter.k.p.offset - iter.pos.offset; - - while (iter.k.size) { -- s->buckets_seen++; -+ req->counters.buckets_seen++; - - u64 bucket = iter.pos.offset & ~(~0ULL << 56); -- if (s->btree_bitmap != BTREE_BITMAP_ANY && -- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, -+ if (req->btree_bitmap != BTREE_BITMAP_ANY && -+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { -- if (s->btree_bitmap == BTREE_BITMAP_YES && -+ if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - goto fail; - -@@ -434,16 +414,16 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - 1ULL << ca->mi.btree_bitmap_shift)); - alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - -- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); -- s->skipped_mi_btree_bitmap++; -+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); -+ req->counters.skipped_mi_btree_bitmap++; - goto next; - } - -- ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); -+ ob = try_alloc_bucket(trans, req, &iter, cl); - if (ob) { - if (!IS_ERR(ob)) - *dev_alloc_cursor = iter.pos.offset; -- bch2_set_btree_iter_dontneed(&iter); -+ bch2_set_btree_iter_dontneed(trans, &iter); - break; - } - -@@ -470,33 +450,30 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - return ob; - } - --static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, -- enum bch_watermark watermark, -- enum bch_data_type data_type, -+static noinline void trace_bucket_alloc2(struct bch_fs *c, -+ struct alloc_request *req, - struct closure *cl, -- struct bch_dev_usage *usage, -- struct bucket_alloc_state *s, - struct open_bucket *ob) - { - struct printbuf buf = PRINTBUF; - - printbuf_tabstop_push(&buf, 24); - -- prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); -- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); -- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); -+ prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); -+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); -+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); - prt_printf(&buf, "blocking\t%u\n", cl != NULL); -- prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); -- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); -- prt_printf(&buf, "copygc_wait\t%lu/%lli\n", -+ prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); -+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); -+ prt_printf(&buf, "copygc_wait\t%llu/%lli\n", - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); -- prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); -- prt_printf(&buf, "open\t%llu\n", s->skipped_open); -- prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); -- prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); -- prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); -- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); -+ prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); -+ prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); -+ prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); -+ prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); -+ prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); -+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); - - if (!IS_ERR(ob)) { - prt_printf(&buf, "allocated\t%llu\n", ob->bucket); -@@ -512,47 +489,42 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - /** - * bch2_bucket_alloc_trans - allocate a single bucket from a specific device - * @trans: transaction object -- * @ca: device to allocate from -- * @watermark: how important is this allocation? -- * @data_type: BCH_DATA_journal, btree, user... -+ * @req: state for the entire allocation - * @cl: if not NULL, closure to be used to wait if buckets not available - * @nowait: if true, do not wait for buckets to become available -- * @usage: for secondarily also returning the current device usage - * - * Returns: an open_bucket on success, or an ERR_PTR() on failure. - */ - static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, -- struct bch_dev *ca, -- enum bch_watermark watermark, -- enum bch_data_type data_type, -- struct closure *cl, -- bool nowait, -- struct bch_dev_usage *usage) -+ struct alloc_request *req, -+ struct closure *cl, -+ bool nowait) - { - struct bch_fs *c = trans->c; -+ struct bch_dev *ca = req->ca; - struct open_bucket *ob = NULL; - bool freespace = READ_ONCE(ca->mi.freespace_initialized); - u64 avail; -- struct bucket_alloc_state s = { -- .btree_bitmap = data_type == BCH_DATA_btree, -- }; - bool waiting = nowait; -+ -+ req->btree_bitmap = req->data_type == BCH_DATA_btree; -+ memset(&req->counters, 0, sizeof(req->counters)); - again: -- bch2_dev_usage_read_fast(ca, usage); -- avail = dev_buckets_free(ca, *usage, watermark); -+ bch2_dev_usage_read_fast(ca, &req->usage); -+ avail = dev_buckets_free(ca, req->usage, req->watermark); - -- if (usage->d[BCH_DATA_need_discard].buckets > avail) -+ if (req->usage.buckets[BCH_DATA_need_discard] > avail) - bch2_dev_do_discards(ca); - -- if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) -+ if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) - bch2_gc_gens_async(c); - -- if (should_invalidate_buckets(ca, *usage)) -+ if (should_invalidate_buckets(ca, req->usage)) - bch2_dev_do_invalidates(ca); - - if (!avail) { -- if (watermark > BCH_WATERMARK_normal && -- c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) -+ if (req->watermark > BCH_WATERMARK_normal && -+ c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - goto alloc; - - if (cl && !waiting) { -@@ -571,18 +543,18 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - closure_wake_up(&c->freelist_wait); - alloc: - ob = likely(freespace) -- ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) -- : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); -+ ? bch2_bucket_alloc_freelist(trans, req, cl) -+ : bch2_bucket_alloc_early(trans, req, cl); - -- if (s.need_journal_commit * 2 > avail) -+ if (req->counters.need_journal_commit * 2 > avail) - bch2_journal_flush_async(&c->journal, NULL); - -- if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { -- s.btree_bitmap = BTREE_BITMAP_ANY; -+ if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { -+ req->btree_bitmap = BTREE_BITMAP_ANY; - goto alloc; - } - -- if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { -+ if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { - freespace = false; - goto alloc; - } -@@ -591,7 +563,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - ob = ERR_PTR(-BCH_ERR_no_buckets_found); - - if (!IS_ERR(ob)) -- ob->data_type = data_type; -+ ob->data_type = req->data_type; - - if (!IS_ERR(ob)) - count_event(c, bucket_alloc); -@@ -601,7 +573,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - if (!IS_ERR(ob) - ? trace_bucket_alloc_enabled() - : trace_bucket_alloc_fail_enabled()) -- trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); -+ trace_bucket_alloc2(c, req, cl, ob); - - return ob; - } -@@ -611,20 +583,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum bch_data_type data_type, - struct closure *cl) - { -- struct bch_dev_usage usage; - struct open_bucket *ob; -+ struct alloc_request req = { -+ .watermark = watermark, -+ .data_type = data_type, -+ .ca = ca, -+ }; - - bch2_trans_do(c, -- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, -- data_type, cl, false, &usage))); -+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); - return ob; - } - - static int __dev_stripe_cmp(struct dev_stripe_state *stripe, - unsigned l, unsigned r) - { -- return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - -- (stripe->next_alloc[l] < stripe->next_alloc[r])); -+ return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]); - } - - #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -@@ -643,25 +617,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, - return ret; - } - -+static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ -+static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */ -+static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */ -+ -+static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe) -+{ -+ /* -+ * Avoid underflowing clock hands if at all possible, if clock hands go -+ * to 0 then we lose information - clock hands can be in a wide range if -+ * we have devices we rarely try to allocate from, if we generally -+ * allocate from a specified target but only sometimes have to fall back -+ * to the whole filesystem. -+ */ -+ u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */ -+ u64 scale_min = 0; /* minumum we must subtract to avoid overflow */ -+ -+ for (u64 *v = stripe->next_alloc; -+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) { -+ if (*v) -+ scale_max = min(scale_max, *v); -+ if (*v > stripe_clock_hand_max) -+ scale_min = max(scale_min, *v - stripe_clock_hand_max); -+ } -+ -+ u64 scale = max(scale_min, scale_max); -+ -+ for (u64 *v = stripe->next_alloc; -+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) -+ *v = *v < scale ? 0 : *v - scale; -+} -+ - static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, - struct dev_stripe_state *stripe, - struct bch_dev_usage *usage) - { -+ /* -+ * Stripe state has a per device clock hand: we allocate from the device -+ * with the smallest clock hand. -+ * -+ * When we allocate, we don't do a simple increment; we add the inverse -+ * of the device's free space. This results in round robin behavior that -+ * biases in favor of the device(s) with more free space. -+ */ -+ - u64 *v = stripe->next_alloc + ca->dev_idx; -- u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); -+ u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); - u64 free_space_inv = free_space -- ? div64_u64(1ULL << 48, free_space) -- : 1ULL << 48; -- u64 scale = *v / 4; -+ ? div64_u64(stripe_clock_hand_inv, free_space) -+ : stripe_clock_hand_inv; - -- if (*v + free_space_inv >= *v) -- *v += free_space_inv; -- else -- *v = U64_MAX; -+ /* Saturating add, avoid overflow: */ -+ u64 sum = *v + free_space_inv; -+ *v = sum >= *v ? sum : U64_MAX; - -- for (v = stripe->next_alloc; -- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) -- *v = *v < scale ? 0 : *v - scale; -+ if (unlikely(*v > stripe_clock_hand_rescale)) -+ bch2_stripe_state_rescale(stripe); - } - - void bch2_dev_stripe_increment(struct bch_dev *ca, -@@ -674,24 +685,20 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, - } - - static int add_new_bucket(struct bch_fs *c, -- struct open_buckets *ptrs, -- struct bch_devs_mask *devs_may_alloc, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, -- struct open_bucket *ob) -+ struct alloc_request *req, -+ struct open_bucket *ob) - { - unsigned durability = ob_dev(c, ob)->mi.durability; - -- BUG_ON(*nr_effective >= nr_replicas); -+ BUG_ON(req->nr_effective >= req->nr_replicas); - -- __clear_bit(ob->dev, devs_may_alloc->d); -- *nr_effective += durability; -- *have_cache |= !durability; -+ __clear_bit(ob->dev, req->devs_may_alloc.d); -+ req->nr_effective += durability; -+ req->have_cache |= !durability; - -- ob_push(c, ptrs, ob); -+ ob_push(c, &req->ptrs, ob); - -- if (*nr_effective >= nr_replicas) -+ if (req->nr_effective >= req->nr_replicas) - return 1; - if (ob->ec) - return 1; -@@ -699,39 +706,31 @@ static int add_new_bucket(struct bch_fs *c, - } - - int bch2_bucket_alloc_set_trans(struct btree_trans *trans, -- struct open_buckets *ptrs, -- struct dev_stripe_state *stripe, -- struct bch_devs_mask *devs_may_alloc, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, -- enum bch_write_flags flags, -- enum bch_data_type data_type, -- enum bch_watermark watermark, -- struct closure *cl) -+ struct alloc_request *req, -+ struct dev_stripe_state *stripe, -+ struct closure *cl) - { - struct bch_fs *c = trans->c; - int ret = -BCH_ERR_insufficient_devices; - -- BUG_ON(*nr_effective >= nr_replicas); -+ BUG_ON(req->nr_effective >= req->nr_replicas); - -- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); -+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); - darray_for_each(devs_sorted, i) { -- struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); -- if (!ca) -+ req->ca = bch2_dev_tryget_noerror(c, *i); -+ if (!req->ca) - continue; - -- if (!ca->mi.durability && *have_cache) { -- bch2_dev_put(ca); -+ if (!req->ca->mi.durability && req->have_cache) { -+ bch2_dev_put(req->ca); - continue; - } - -- struct bch_dev_usage usage; -- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, -- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); -+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, -+ req->flags & BCH_WRITE_alloc_nowait); - if (!IS_ERR(ob)) -- bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -- bch2_dev_put(ca); -+ bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); -+ bch2_dev_put(req->ca); - - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); -@@ -740,9 +739,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - continue; - } - -- if (add_new_bucket(c, ptrs, devs_may_alloc, -- nr_replicas, nr_effective, -- have_cache, ob)) { -+ if (add_new_bucket(c, req, ob)) { - ret = 0; - break; - } -@@ -760,34 +757,27 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - */ - - static int bucket_alloc_from_stripe(struct btree_trans *trans, -- struct open_buckets *ptrs, -- struct write_point *wp, -- struct bch_devs_mask *devs_may_alloc, -- u16 target, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, -- enum bch_watermark watermark, -- enum bch_write_flags flags, -- struct closure *cl) -+ struct alloc_request *req, -+ struct closure *cl) - { - struct bch_fs *c = trans->c; - int ret = 0; - -- if (nr_replicas < 2) -+ if (req->nr_replicas < 2) - return 0; - -- if (ec_open_bucket(c, ptrs)) -+ if (ec_open_bucket(c, &req->ptrs)) - return 0; - - struct ec_stripe_head *h = -- bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); -+ bch2_ec_stripe_head_get(trans, req, 0, cl); - if (IS_ERR(h)) - return PTR_ERR(h); - if (!h) - return 0; - -- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -+ struct dev_alloc_list devs_sorted = -+ bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); - darray_for_each(devs_sorted, i) - for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { - if (!h->s->blocks[ec_idx]) -@@ -799,9 +789,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - -- ret = add_new_bucket(c, ptrs, devs_may_alloc, -- nr_replicas, nr_effective, -- have_cache, ob); -+ ret = add_new_bucket(c, req, ob); - goto out; - } - } -@@ -813,65 +801,49 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, - /* Sector allocator */ - - static bool want_bucket(struct bch_fs *c, -- struct write_point *wp, -- struct bch_devs_mask *devs_may_alloc, -- bool *have_cache, bool ec, -+ struct alloc_request *req, - struct open_bucket *ob) - { - struct bch_dev *ca = ob_dev(c, ob); - -- if (!test_bit(ob->dev, devs_may_alloc->d)) -+ if (!test_bit(ob->dev, req->devs_may_alloc.d)) - return false; - -- if (ob->data_type != wp->data_type) -+ if (ob->data_type != req->wp->data_type) - return false; - - if (!ca->mi.durability && -- (wp->data_type == BCH_DATA_btree || ec || *have_cache)) -+ (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) - return false; - -- if (ec != (ob->ec != NULL)) -+ if (req->ec != (ob->ec != NULL)) - return false; - - return true; - } - - static int bucket_alloc_set_writepoint(struct bch_fs *c, -- struct open_buckets *ptrs, -- struct write_point *wp, -- struct bch_devs_mask *devs_may_alloc, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, -- bool ec) -+ struct alloc_request *req) - { -- struct open_buckets ptrs_skip = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - int ret = 0; - -- open_bucket_for_each(c, &wp->ptrs, ob, i) { -- if (!ret && want_bucket(c, wp, devs_may_alloc, -- have_cache, ec, ob)) -- ret = add_new_bucket(c, ptrs, devs_may_alloc, -- nr_replicas, nr_effective, -- have_cache, ob); -+ req->scratch_ptrs.nr = 0; -+ -+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) { -+ if (!ret && want_bucket(c, req, ob)) -+ ret = add_new_bucket(c, req, ob); - else -- ob_push(c, &ptrs_skip, ob); -+ ob_push(c, &req->scratch_ptrs, ob); - } -- wp->ptrs = ptrs_skip; -+ req->wp->ptrs = req->scratch_ptrs; - - return ret; - } - - static int bucket_alloc_set_partial(struct bch_fs *c, -- struct open_buckets *ptrs, -- struct write_point *wp, -- struct bch_devs_mask *devs_may_alloc, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, bool ec, -- enum bch_watermark watermark) -+ struct alloc_request *req) - { - int i, ret = 0; - -@@ -886,13 +858,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c, - for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { - struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - -- if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { -+ if (want_bucket(c, req, ob)) { - struct bch_dev *ca = ob_dev(c, ob); -- struct bch_dev_usage usage; - u64 avail; - -- bch2_dev_usage_read_fast(ca, &usage); -- avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; -+ bch2_dev_usage_read_fast(ca, &req->usage); -+ avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; - if (!avail) - continue; - -@@ -905,9 +876,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); - -- ret = add_new_bucket(c, ptrs, devs_may_alloc, -- nr_replicas, nr_effective, -- have_cache, ob); -+ ret = add_new_bucket(c, req, ob); - if (ret) - break; - } -@@ -918,61 +887,41 @@ static int bucket_alloc_set_partial(struct bch_fs *c, - } - - static int __open_bucket_add_buckets(struct btree_trans *trans, -- struct open_buckets *ptrs, -- struct write_point *wp, -- struct bch_devs_list *devs_have, -- u16 target, -- bool erasure_code, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, -- enum bch_watermark watermark, -- enum bch_write_flags flags, -- struct closure *_cl) -+ struct alloc_request *req, -+ struct closure *_cl) - { - struct bch_fs *c = trans->c; -- struct bch_devs_mask devs; - struct open_bucket *ob; - struct closure *cl = NULL; - unsigned i; - int ret; - -- devs = target_rw_devs(c, wp->data_type, target); -+ req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); - - /* Don't allocate from devices we already have pointers to: */ -- darray_for_each(*devs_have, i) -- __clear_bit(*i, devs.d); -+ darray_for_each(*req->devs_have, i) -+ __clear_bit(*i, req->devs_may_alloc.d); - -- open_bucket_for_each(c, ptrs, ob, i) -- __clear_bit(ob->dev, devs.d); -+ open_bucket_for_each(c, &req->ptrs, ob, i) -+ __clear_bit(ob->dev, req->devs_may_alloc.d); - -- ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, -- nr_replicas, nr_effective, -- have_cache, erasure_code); -+ ret = bucket_alloc_set_writepoint(c, req); - if (ret) - return ret; - -- ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, -- nr_replicas, nr_effective, -- have_cache, erasure_code, watermark); -+ ret = bucket_alloc_set_partial(c, req); - if (ret) - return ret; - -- if (erasure_code) { -- ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, -- target, -- nr_replicas, nr_effective, -- have_cache, -- watermark, flags, _cl); -+ if (req->ec) { -+ ret = bucket_alloc_from_stripe(trans, req, _cl); - } else { - retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ -- ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, -- nr_replicas, nr_effective, have_cache, -- flags, wp->data_type, watermark, cl); -+ ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && -@@ -986,38 +935,27 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, - } - - static int open_bucket_add_buckets(struct btree_trans *trans, -- struct open_buckets *ptrs, -- struct write_point *wp, -- struct bch_devs_list *devs_have, -- u16 target, -- unsigned erasure_code, -- unsigned nr_replicas, -- unsigned *nr_effective, -- bool *have_cache, -- enum bch_watermark watermark, -- enum bch_write_flags flags, -- struct closure *cl) -+ struct alloc_request *req, -+ struct closure *cl) - { - int ret; - -- if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { -- ret = __open_bucket_add_buckets(trans, ptrs, wp, -- devs_have, target, erasure_code, -- nr_replicas, nr_effective, have_cache, -- watermark, flags, cl); -+ if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { -+ ret = __open_bucket_add_buckets(trans, req, cl); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_operation_blocked) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; -- if (*nr_effective >= nr_replicas) -+ if (req->nr_effective >= req->nr_replicas) - return 0; - } - -- ret = __open_bucket_add_buckets(trans, ptrs, wp, -- devs_have, target, false, -- nr_replicas, nr_effective, have_cache, -- watermark, flags, cl); -+ bool ec = false; -+ swap(ec, req->ec); -+ ret = __open_bucket_add_buckets(trans, req, cl); -+ swap(ec, req->ec); -+ - return ret < 0 ? ret : 0; - } - -@@ -1270,26 +1208,26 @@ static struct write_point *writepoint_find(struct btree_trans *trans, - - static noinline void - deallocate_extra_replicas(struct bch_fs *c, -- struct open_buckets *ptrs, -- struct open_buckets *ptrs_no_use, -- unsigned extra_replicas) -+ struct alloc_request *req) - { -- struct open_buckets ptrs2 = { 0 }; - struct open_bucket *ob; -+ unsigned extra_replicas = req->nr_effective - req->nr_replicas; - unsigned i; - -- open_bucket_for_each(c, ptrs, ob, i) { -+ req->scratch_ptrs.nr = 0; -+ -+ open_bucket_for_each(c, &req->ptrs, ob, i) { - unsigned d = ob_dev(c, ob)->mi.durability; - - if (d && d <= extra_replicas) { - extra_replicas -= d; -- ob_push(c, ptrs_no_use, ob); -+ ob_push(c, &req->wp->ptrs, ob); - } else { -- ob_push(c, &ptrs2, ob); -+ ob_push(c, &req->scratch_ptrs, ob); - } - } - -- *ptrs = ptrs2; -+ req->ptrs = req->scratch_ptrs; - } - - /* -@@ -1308,51 +1246,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - struct write_point **wp_ret) - { - struct bch_fs *c = trans->c; -- struct write_point *wp; - struct open_bucket *ob; -- struct open_buckets ptrs; -- unsigned nr_effective, write_points_nr; -- bool have_cache; -- int ret; -+ unsigned write_points_nr; - int i; - -+ struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); -+ int ret = PTR_ERR_OR_ZERO(req); -+ if (unlikely(ret)) -+ return ret; -+ - if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) - erasure_code = false; - -+ req->nr_replicas = nr_replicas; -+ req->target = target; -+ req->ec = erasure_code; -+ req->watermark = watermark; -+ req->flags = flags; -+ req->devs_have = devs_have; -+ - BUG_ON(!nr_replicas || !nr_replicas_required); - retry: -- ptrs.nr = 0; -- nr_effective = 0; -- write_points_nr = c->write_points_nr; -- have_cache = false; -+ req->ptrs.nr = 0; -+ req->nr_effective = 0; -+ req->have_cache = false; -+ write_points_nr = c->write_points_nr; -+ -+ *wp_ret = req->wp = writepoint_find(trans, write_point.v); - -- *wp_ret = wp = writepoint_find(trans, write_point.v); -+ req->data_type = req->wp->data_type; - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - /* metadata may not allocate on cache devices: */ -- if (wp->data_type != BCH_DATA_user) -- have_cache = true; -- -- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -- target, erasure_code, -- nr_replicas, &nr_effective, -- &have_cache, watermark, -- flags, NULL); -+ if (req->data_type != BCH_DATA_user) -+ req->have_cache = true; -+ -+ if (target && !(flags & BCH_WRITE_only_specified_devs)) { -+ ret = open_bucket_add_buckets(trans, req, NULL); - if (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto alloc_done; - - /* Don't retry from all devices if we're out of open buckets: */ - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { -- int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -- target, erasure_code, -- nr_replicas, &nr_effective, -- &have_cache, watermark, -- flags, cl); -+ int ret2 = open_bucket_add_buckets(trans, req, cl); - if (!ret2 || - bch2_err_matches(ret2, BCH_ERR_transaction_restart) || - bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { -@@ -1365,59 +1305,74 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - * Only try to allocate cache (durability = 0 devices) from the - * specified target: - */ -- have_cache = true; -+ req->have_cache = true; -+ req->target = 0; - -- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -- 0, erasure_code, -- nr_replicas, &nr_effective, -- &have_cache, watermark, -- flags, cl); -+ ret = open_bucket_add_buckets(trans, req, cl); - } else { -- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -- target, erasure_code, -- nr_replicas, &nr_effective, -- &have_cache, watermark, -- flags, cl); -+ ret = open_bucket_add_buckets(trans, req, cl); - } - alloc_done: -- BUG_ON(!ret && nr_effective < nr_replicas); -+ BUG_ON(!ret && req->nr_effective < req->nr_replicas); - -- if (erasure_code && !ec_open_bucket(c, &ptrs)) -+ if (erasure_code && !ec_open_bucket(c, &req->ptrs)) - pr_debug("failed to get ec bucket: ret %u", ret); - - if (ret == -BCH_ERR_insufficient_devices && -- nr_effective >= nr_replicas_required) -+ req->nr_effective >= nr_replicas_required) - ret = 0; - - if (ret) - goto err; - -- if (nr_effective > nr_replicas) -- deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); -+ if (req->nr_effective > req->nr_replicas) -+ deallocate_extra_replicas(c, req); - - /* Free buckets we didn't use: */ -- open_bucket_for_each(c, &wp->ptrs, ob, i) -+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - -- wp->ptrs = ptrs; -+ req->wp->ptrs = req->ptrs; - -- wp->sectors_free = UINT_MAX; -+ req->wp->sectors_free = UINT_MAX; - -- open_bucket_for_each(c, &wp->ptrs, ob, i) -- wp->sectors_free = min(wp->sectors_free, ob->sectors_free); -+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) { -+ /* -+ * Ensure proper write alignment - either due to misaligned -+ * bucket sizes (from buggy bcachefs-tools), or writes that mix -+ * logical/physical alignment: -+ */ -+ struct bch_dev *ca = ob_dev(c, ob); -+ u64 offset = bucket_to_sector(ca, ob->bucket) + -+ ca->mi.bucket_size - -+ ob->sectors_free; -+ unsigned align = round_up(offset, block_sectors(c)) - offset; - -- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); -+ ob->sectors_free = max_t(int, 0, ob->sectors_free - align); -+ -+ req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); -+ } -+ -+ req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); -+ -+ /* Did alignment use up space in an open_bucket? */ -+ if (unlikely(!req->wp->sectors_free)) { -+ bch2_alloc_sectors_done(c, req->wp); -+ goto retry; -+ } -+ -+ BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); - - return 0; - err: -- open_bucket_for_each(c, &wp->ptrs, ob, i) -- if (ptrs.nr < ARRAY_SIZE(ptrs.v)) -- ob_push(c, &ptrs, ob); -+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) -+ if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) -+ ob_push(c, &req->ptrs, ob); - else - open_bucket_free_unused(c, ob); -- wp->ptrs = ptrs; -+ req->wp->ptrs = req->ptrs; - -- mutex_unlock(&wp->lock); -+ mutex_unlock(&req->wp->lock); - - if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && - try_decrease_writepoints(trans, write_points_nr)) -@@ -1426,27 +1381,13 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; - -- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && -+ if (cl && !(flags & BCH_WRITE_alloc_nowait) && - bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; - - return ret; - } - --struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) --{ -- struct bch_dev *ca = ob_dev(c, ob); -- -- return (struct bch_extent_ptr) { -- .type = 1 << BCH_EXTENT_ENTRY_ptr, -- .gen = ob->gen, -- .dev = ob->dev, -- .offset = bucket_to_sector(ca, ob->bucket) + -- ca->mi.bucket_size - -- ob->sectors_free, -- }; --} -- - void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors, - bool cached) -@@ -1576,8 +1517,10 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, - struct open_bucket *ob; - unsigned i; - -+ mutex_lock(&wp->lock); -+ - prt_printf(out, "%lu: ", wp->write_point); -- prt_human_readable_u64(out, wp->sectors_allocated); -+ prt_human_readable_u64(out, wp->sectors_allocated << 9); - - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); -@@ -1593,6 +1536,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, - open_bucket_for_each(c, &wp->ptrs, ob, i) - bch2_open_bucket_to_text(out, c, ob); - printbuf_indent_sub(out, 2); -+ -+ mutex_unlock(&wp->lock); - } - - void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) -@@ -1650,7 +1595,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) - void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) - { - struct bch_fs *c = ca->fs; -- struct bch_dev_usage stats = bch2_dev_usage_read(ca); -+ struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); -@@ -1673,7 +1618,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) - printbuf_tabstop_push(out, 16); - - prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); -- prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); -+ prt_printf(out, "buckets to invalidate\t%llu\r\n", -+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); - } - - static noinline void bch2_print_allocator_stuck(struct bch_fs *c) -@@ -1689,7 +1635,12 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - -- for_each_online_member(c, ca) { -+ bch2_printbuf_make_room(&buf, 4096); -+ -+ rcu_read_lock(); -+ buf.atomic++; -+ -+ for_each_online_member_rcu(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); -@@ -1697,6 +1648,9 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) - prt_newline(&buf); - } - -+ --buf.atomic; -+ rcu_read_unlock(); -+ - prt_printf(&buf, "Copygc debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_copygc_wait_to_text(&buf, c); -@@ -1708,7 +1662,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) - bch2_journal_debug_to_text(&buf, &c->journal); - printbuf_indent_sub(&buf, 2); - -- bch2_print_string_as_lines(KERN_ERR, buf.buf); -+ bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -index f25481a0d1a0..2e01c7b61ed1 100644 ---- a/fs/bcachefs/alloc_foreground.h -+++ b/fs/bcachefs/alloc_foreground.h -@@ -3,8 +3,10 @@ - #define _BCACHEFS_ALLOC_FOREGROUND_H - - #include "bcachefs.h" -+#include "buckets.h" - #include "alloc_types.h" - #include "extents.h" -+#include "io_write_types.h" - #include "sb-members.h" - - #include -@@ -23,6 +25,52 @@ struct dev_alloc_list { - u8 data[BCH_SB_MEMBERS_MAX]; - }; - -+struct alloc_request { -+ unsigned nr_replicas; -+ unsigned target; -+ bool ec; -+ enum bch_watermark watermark; -+ enum bch_write_flags flags; -+ enum bch_data_type data_type; -+ struct bch_devs_list *devs_have; -+ struct write_point *wp; -+ -+ /* These fields are used primarily by open_bucket_add_buckets */ -+ struct open_buckets ptrs; -+ unsigned nr_effective; /* sum of @ptrs durability */ -+ bool have_cache; /* have we allocated from a 0 durability dev */ -+ struct bch_devs_mask devs_may_alloc; -+ -+ /* bch2_bucket_alloc_set_trans(): */ -+ struct bch_dev_usage usage; -+ -+ /* bch2_bucket_alloc_trans(): */ -+ struct bch_dev *ca; -+ -+ enum { -+ BTREE_BITMAP_NO, -+ BTREE_BITMAP_YES, -+ BTREE_BITMAP_ANY, -+ } btree_bitmap; -+ -+ struct { -+ u64 buckets_seen; -+ u64 skipped_open; -+ u64 skipped_need_journal_commit; -+ u64 need_journal_commit; -+ u64 skipped_nocow; -+ u64 skipped_nouse; -+ u64 skipped_mi_btree_bitmap; -+ } counters; -+ -+ unsigned scratch_nr_replicas; -+ unsigned scratch_nr_effective; -+ bool scratch_have_cache; -+ enum bch_data_type scratch_data_type; -+ struct open_buckets scratch_ptrs; -+ struct bch_devs_mask scratch_devs_may_alloc; -+}; -+ - struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *); -@@ -33,6 +81,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) - return bch2_dev_have_ref(c, ob->dev); - } - -+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) -+{ -+ switch (watermark) { -+ case BCH_WATERMARK_interior_updates: -+ return 0; -+ case BCH_WATERMARK_reclaim: -+ return OPEN_BUCKETS_COUNT / 6; -+ case BCH_WATERMARK_btree: -+ case BCH_WATERMARK_btree_copygc: -+ return OPEN_BUCKETS_COUNT / 4; -+ case BCH_WATERMARK_copygc: -+ return OPEN_BUCKETS_COUNT / 3; -+ default: -+ return OPEN_BUCKETS_COUNT / 2; -+ } -+} -+ - struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, enum bch_data_type, - struct closure *); -@@ -65,7 +130,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, - } - - void bch2_open_bucket_write_error(struct bch_fs *, -- struct open_buckets *, unsigned); -+ struct open_buckets *, unsigned, int); - - void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); - -@@ -93,7 +158,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) -- ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); -+ ob_push(c, ob->sectors_free < block_sectors(c) -+ ? &ptrs -+ : &keep, ob); - wp->ptrs = keep; - - mutex_unlock(&wp->lock); -@@ -154,11 +221,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 - } - - enum bch_write_flags; --int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, -- struct dev_stripe_state *, struct bch_devs_mask *, -- unsigned, unsigned *, bool *, enum bch_write_flags, -- enum bch_data_type, enum bch_watermark, -- struct closure *); -+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, -+ struct dev_stripe_state *, struct closure *); - - int bch2_alloc_sectors_start_trans(struct btree_trans *, - unsigned, unsigned, -@@ -170,7 +234,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, - struct closure *, - struct write_point **); - --struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); -+static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = ob_dev(c, ob); -+ -+ return (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = ob->gen, -+ .dev = ob->dev, -+ .offset = bucket_to_sector(ca, ob->bucket) + -+ ca->mi.bucket_size - -+ ob->sectors_free, -+ }; -+} - - /* - * Append pointers to the space we just allocated to @k, and mark @sectors space -diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -index 4aa8ee026cb8..e7becdf22cba 100644 ---- a/fs/bcachefs/alloc_types.h -+++ b/fs/bcachefs/alloc_types.h -@@ -8,22 +8,6 @@ - #include "clock_types.h" - #include "fifo.h" - --struct bucket_alloc_state { -- enum { -- BTREE_BITMAP_NO, -- BTREE_BITMAP_YES, -- BTREE_BITMAP_ANY, -- } btree_bitmap; -- -- u64 buckets_seen; -- u64 skipped_open; -- u64 skipped_need_journal_commit; -- u64 need_journal_commit; -- u64 skipped_nocow; -- u64 skipped_nouse; -- u64 skipped_mi_btree_bitmap; --}; -- - #define BCH_WATERMARKS() \ - x(stripe) \ - x(normal) \ -@@ -90,6 +74,7 @@ struct dev_stripe_state { - x(stopped) \ - x(waiting_io) \ - x(waiting_work) \ -+ x(runnable) \ - x(running) - - enum write_point_state { -@@ -125,6 +110,7 @@ struct write_point { - enum write_point_state state; - u64 last_state_change; - u64 time[WRITE_POINT_STATE_NR]; -+ u64 last_runtime; - } __aligned(SMP_CACHE_BYTES); - }; - -diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c -new file mode 100644 -index 000000000000..a7cd1f0f0964 ---- /dev/null -+++ b/fs/bcachefs/async_objs.c -@@ -0,0 +1,132 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Async obj debugging: keep asynchronous objects on (very fast) lists, make -+ * them visibile in debugfs: -+ */ -+ -+#include "bcachefs.h" -+#include "async_objs.h" -+#include "btree_io.h" -+#include "debug.h" -+#include "io_read.h" -+#include "io_write.h" -+ -+#include -+ -+static void promote_obj_to_text(struct printbuf *out, void *obj) -+{ -+ bch2_promote_op_to_text(out, obj); -+} -+ -+static void rbio_obj_to_text(struct printbuf *out, void *obj) -+{ -+ bch2_read_bio_to_text(out, obj); -+} -+ -+static void write_op_obj_to_text(struct printbuf *out, void *obj) -+{ -+ bch2_write_op_to_text(out, obj); -+} -+ -+static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) -+{ -+ struct btree_read_bio *rbio = obj; -+ bch2_btree_read_bio_to_text(out, rbio); -+} -+ -+static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) -+{ -+ struct btree_write_bio *wbio = obj; -+ bch2_bio_to_text(out, &wbio->wbio.bio); -+} -+ -+static int bch2_async_obj_list_open(struct inode *inode, struct file *file) -+{ -+ struct async_obj_list *list = inode->i_private; -+ struct dump_iter *i; -+ -+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); -+ if (!i) -+ return -ENOMEM; -+ -+ file->private_data = i; -+ i->from = POS_MIN; -+ i->iter = 0; -+ i->c = container_of(list, struct bch_fs, async_objs[list->idx]); -+ i->list = list; -+ i->buf = PRINTBUF; -+ return 0; -+} -+ -+static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct async_obj_list *list = i->list; -+ ssize_t ret = 0; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ struct genradix_iter iter; -+ void *obj; -+ fast_list_for_each_from(&list->list, iter, obj, i->iter) { -+ ret = bch2_debugfs_flush_buf(i); -+ if (ret) -+ return ret; -+ -+ if (!i->size) -+ break; -+ -+ list->obj_to_text(&i->buf, obj); -+ } -+ -+ if (i->buf.allocation_failure) -+ ret = -ENOMEM; -+ else -+ i->iter = iter.pos; -+ -+ if (!ret) -+ ret = bch2_debugfs_flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations async_obj_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_async_obj_list_open, -+ .release = bch2_dump_release, -+ .read = bch2_async_obj_list_read, -+}; -+ -+void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) -+{ -+ c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); -+ -+#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ -+ &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); -+ BCH_ASYNC_OBJ_LISTS() -+#undef x -+} -+ -+void bch2_fs_async_obj_exit(struct bch_fs *c) -+{ -+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) -+ fast_list_exit(&c->async_objs[i].list); -+} -+ -+int bch2_fs_async_obj_init(struct bch_fs *c) -+{ -+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { -+ if (fast_list_init(&c->async_objs[i].list)) -+ return -BCH_ERR_ENOMEM_async_obj_init; -+ c->async_objs[i].idx = i; -+ } -+ -+#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; -+ BCH_ASYNC_OBJ_LISTS() -+#undef x -+ -+ return 0; -+} -diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h -new file mode 100644 -index 000000000000..cd6489b8cf76 ---- /dev/null -+++ b/fs/bcachefs/async_objs.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ASYNC_OBJS_H -+#define _BCACHEFS_ASYNC_OBJS_H -+ -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+static inline void __async_object_list_del(struct fast_list *head, unsigned idx) -+{ -+ fast_list_remove(head, idx); -+} -+ -+static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) -+{ -+ int ret = fast_list_add(head, obj); -+ *idx = ret > 0 ? ret : 0; -+ return ret < 0 ? ret : 0; -+} -+ -+#define async_object_list_del(_c, _list, idx) \ -+ __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) -+ -+#define async_object_list_add(_c, _list, obj, idx) \ -+ __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) -+ -+void bch2_fs_async_obj_debugfs_init(struct bch_fs *); -+void bch2_fs_async_obj_exit(struct bch_fs *); -+int bch2_fs_async_obj_init(struct bch_fs *); -+ -+#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ -+ -+#define async_object_list_del(_c, _n, idx) do {} while (0) -+ -+static inline int __async_object_list_add(void) -+{ -+ return 0; -+} -+#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() -+ -+static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} -+static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} -+static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } -+ -+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ -+ -+#endif /* _BCACHEFS_ASYNC_OBJS_H */ -diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h -new file mode 100644 -index 000000000000..8d713c0f5841 ---- /dev/null -+++ b/fs/bcachefs/async_objs_types.h -@@ -0,0 +1,25 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H -+#define _BCACHEFS_ASYNC_OBJS_TYPES_H -+ -+#define BCH_ASYNC_OBJ_LISTS() \ -+ x(promote) \ -+ x(rbio) \ -+ x(write_op) \ -+ x(btree_read_bio) \ -+ x(btree_write_bio) -+ -+enum bch_async_obj_lists { -+#define x(n) BCH_ASYNC_OBJ_LIST_##n, -+ BCH_ASYNC_OBJ_LISTS() -+#undef x -+ BCH_ASYNC_OBJ_NR -+}; -+ -+struct async_obj_list { -+ struct fast_list list; -+ void (*obj_to_text)(struct printbuf *, void *); -+ unsigned idx; -+}; -+ -+#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ -diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c -index ebeb6a5ff9d2..cde7dd115267 100644 ---- a/fs/bcachefs/backpointers.c -+++ b/fs/bcachefs/backpointers.c -@@ -11,9 +11,21 @@ - #include "checksum.h" - #include "disk_accounting.h" - #include "error.h" -+#include "progress.h" -+#include "recovery_passes.h" - - #include - -+static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); -+ -+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -+{ -+ return (struct bbpos) { -+ .btree = bp.btree_id, -+ .pos = bp.pos, -+ }; -+} -+ - int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) - { -@@ -49,6 +61,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke - } - - bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); -+ prt_str(out, " data_type="); -+ bch2_prt_data_type(out, bp.v->data_type); - prt_printf(out, " suboffset=%u len=%u gen=%u pos=", - (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len, -@@ -93,6 +107,9 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, - { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; -+ bool will_check = c->recovery.passes_to_run & -+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); -+ int ret = 0; - - if (insert) { - prt_printf(&buf, "existing backpointer found when inserting "); -@@ -106,9 +123,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); -- -- bch_err(c, "%s", buf.buf); -- } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { -+ } else if (!will_check) { - prt_printf(&buf, "backpointer not found when deleting\n"); - printbuf_indent_add(&buf, 2); - -@@ -122,17 +137,14 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); -- -- bch_err(c, "%s", buf.buf); - } - -- printbuf_exit(&buf); -+ if (!will_check && __bch2_inconsistent_error(c, &buf)) -+ ret = -BCH_ERR_erofs_unfixed_errors; - -- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { -- return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; -- } else { -- return 0; -- } -+ bch_err(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ return ret; - } - - int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, -@@ -172,7 +184,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - - static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) - { -- return (likely(!bch2_backpointers_no_use_write_buffer) -+ return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) - : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -@@ -182,7 +194,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, - struct bkey_s_c visiting_k, - struct bkey_buf *last_flushed) - { -- return likely(!bch2_backpointers_no_use_write_buffer) -+ return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) - : 0; - } -@@ -190,7 +202,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, - static int backpointer_target_not_found(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct bkey_s_c target_k, -- struct bkey_buf *last_flushed) -+ struct bkey_buf *last_flushed, -+ bool commit) - { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; -@@ -207,11 +220,11 @@ static int backpointer_target_not_found(struct btree_trans *trans, - if (ret) - return ret; - -- prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", -+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n", - bp.v->level ? "btree node" : "extent"); - bch2_bkey_val_to_text(&buf, c, bp.s_c); - -- prt_printf(&buf, "\n "); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, target_k); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); -@@ -219,63 +232,45 @@ static int backpointer_target_not_found(struct btree_trans *trans, - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) - if (p.ptr.dev == bp.k->p.inode) { -- prt_printf(&buf, "\n "); -+ prt_newline(&buf); - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); - } - - if (fsck_err(trans, backpointer_to_missing_ptr, -- "%s", buf.buf)) -+ "%s", buf.buf)) { - ret = bch2_backpointer_del(trans, bp.k->p); -+ if (ret || !commit) -+ goto out; -+ -+ /* -+ * Normally, on transaction commit from inside a transaction, -+ * we'll return -BCH_ERR_transaction_restart_nested, since a -+ * transaction commit invalidates pointers given out by peek(). -+ * -+ * However, since we're updating a write buffer btree, if we -+ * return a transaction restart and loop we won't see that the -+ * backpointer has been deleted without an additional write -+ * buffer flush - and those are expensive. -+ * -+ * So we're relying on the caller immediately advancing to the -+ * next backpointer and starting a new transaction immediately -+ * after backpointer_get_key() returns NULL: -+ */ -+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -+ } -+out: - fsck_err: - printbuf_exit(&buf); - return ret; - } - --struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, -- struct bkey_s_c_backpointer bp, -- struct btree_iter *iter, -- unsigned iter_flags, -- struct bkey_buf *last_flushed) --{ -- struct bch_fs *c = trans->c; -- -- if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) -- return bkey_s_c_null; -- -- if (likely(!bp.v->level)) { -- bch2_trans_node_iter_init(trans, iter, -- bp.v->btree_id, -- bp.v->pos, -- 0, 0, -- iter_flags); -- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -- if (bkey_err(k)) { -- bch2_trans_iter_exit(trans, iter); -- return k; -- } -- -- if (k.k && -- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) -- return k; -- -- bch2_trans_iter_exit(trans, iter); -- int ret = backpointer_target_not_found(trans, bp, k, last_flushed); -- return ret ? bkey_s_c_err(ret) : bkey_s_c_null; -- } else { -- struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); -- if (IS_ERR_OR_NULL(b)) -- return ((struct bkey_s_c) { .k = ERR_CAST(b) }); -- -- return bkey_i_to_s_c(&b->key); -- } --} -- --struct btree *bch2_backpointer_get_node(struct btree_trans *trans, -- struct bkey_s_c_backpointer bp, -- struct btree_iter *iter, -- struct bkey_buf *last_flushed) -+static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, -+ struct btree_iter *iter, -+ struct bkey_buf *last_flushed, -+ bool commit) - { - struct bch_fs *c = trans->c; - -@@ -287,7 +282,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - 0, - bp.v->level - 1, - 0); -- struct btree *b = bch2_btree_iter_peek_node(iter); -+ struct btree *b = bch2_btree_iter_peek_node(trans, iter); - if (IS_ERR_OR_NULL(b)) - goto err; - -@@ -300,7 +295,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); - } else { -- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); -+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), -+ last_flushed, commit); - b = ret ? ERR_PTR(ret) : NULL; - } - err: -@@ -308,6 +304,79 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - return b; - } - -+static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, -+ struct btree_iter *iter, -+ unsigned iter_flags, -+ struct bkey_buf *last_flushed, -+ bool commit) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) -+ return bkey_s_c_null; -+ -+ bch2_trans_node_iter_init(trans, iter, -+ bp.v->btree_id, -+ bp.v->pos, -+ 0, -+ bp.v->level, -+ iter_flags); -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); -+ if (bkey_err(k)) { -+ bch2_trans_iter_exit(trans, iter); -+ return k; -+ } -+ -+ /* -+ * peek_slot() doesn't normally return NULL - except when we ask for a -+ * key at a btree level that doesn't exist. -+ * -+ * We may want to revisit this and change peek_slot(): -+ */ -+ if (!k.k) { -+ bkey_init(&iter->k); -+ iter->k.p = bp.v->pos; -+ k.k = &iter->k; -+ } -+ -+ if (k.k && -+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) -+ return k; -+ -+ bch2_trans_iter_exit(trans, iter); -+ -+ if (!bp.v->level) { -+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); -+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null; -+ } else { -+ struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); -+ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) -+ return bkey_s_c_null; -+ if (IS_ERR_OR_NULL(b)) -+ return ((struct bkey_s_c) { .k = ERR_CAST(b) }); -+ -+ return bkey_i_to_s_c(&b->key); -+ } -+} -+ -+struct btree *bch2_backpointer_get_node(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, -+ struct btree_iter *iter, -+ struct bkey_buf *last_flushed) -+{ -+ return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); -+} -+ -+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, -+ struct btree_iter *iter, -+ unsigned iter_flags, -+ struct bkey_buf *last_flushed) -+{ -+ return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); -+} -+ - static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, - struct bkey_buf *last_flushed) - { -@@ -315,7 +384,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st - return 0; - - struct bch_fs *c = trans->c; -- struct btree_iter alloc_iter = { NULL }; -+ struct btree_iter alloc_iter = {}; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret = 0; -@@ -419,7 +488,8 @@ static int check_extent_checksum(struct btree_trans *trans, - - bytes = p.crc.compressed_size << 9; - -- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, -+ BCH_DEV_READ_REF_check_extent_checksums); - if (!ca) - return false; - -@@ -436,12 +506,11 @@ static int check_extent_checksum(struct btree_trans *trans, - if (ret) - goto err; - -- prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); -- prt_printf(&buf, "\n "); -+ prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent); -- prt_printf(&buf, "\n "); -+ prt_newline(&buf); - bch2_btree_id_to_text(&buf, o_btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent2); -@@ -457,7 +526,8 @@ static int check_extent_checksum(struct btree_trans *trans, - if (bio) - bio_put(bio); - kvfree(data_buf); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_check_extent_checksums); - printbuf_exit(&buf); - return ret; - } -@@ -504,7 +574,7 @@ static int check_bp_exists(struct btree_trans *trans, - struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); - - struct bkey_s_c other_extent = -- bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); -+ __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false); - ret = bkey_err(other_extent); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - ret = 0; -@@ -514,11 +584,27 @@ static int check_bp_exists(struct btree_trans *trans, - if (!other_extent.k) - goto missing; - -+ rcu_read_lock(); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); -+ if (ca) { -+ struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); -+ bkey_for_each_ptr(other_extent_ptrs, ptr) -+ if (ptr->dev == bp->k.p.inode && -+ dev_ptr_stale_rcu(ca, ptr)) { -+ ret = drop_dev_and_update(trans, other_bp.v->btree_id, -+ other_extent, bp->k.p.inode); -+ if (ret) -+ goto err; -+ goto out; -+ } -+ } -+ rcu_read_unlock(); -+ - if (bch2_extents_match(orig_k, other_extent)) { - printbuf_reset(&buf); -- prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); -+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); - bch2_bkey_val_to_text(&buf, c, orig_k); -- prt_str(&buf, "\n "); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - -@@ -557,20 +643,20 @@ static int check_bp_exists(struct btree_trans *trans, - } - - printbuf_reset(&buf); -- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); -+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); -- prt_str(&buf, "\n "); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; - missing: - printbuf_reset(&buf); -- prt_str(&buf, "missing backpointer\n for: "); -+ prt_str(&buf, "missing backpointer\nfor: "); - bch2_bkey_val_to_text(&buf, c, orig_k); -- prt_printf(&buf, "\n want: "); -+ prt_printf(&buf, "\nwant: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); -- prt_printf(&buf, "\n got: "); -+ prt_printf(&buf, "\ngot: "); - bch2_bkey_val_to_text(&buf, c, bp_k); - - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) -@@ -590,28 +676,38 @@ static int check_extent_to_backpointers(struct btree_trans *trans, - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -- if (p.ptr.cached) -- continue; -- - if (p.ptr.dev == BCH_SB_MEMBER_INVALID) - continue; - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); -- bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); -- bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); -- rcu_read_unlock(); -+ if (!ca) { -+ rcu_read_unlock(); -+ continue; -+ } - -- if (check || empty) { -- struct bkey_i_backpointer bp; -- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); -+ if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) { -+ rcu_read_unlock(); -+ continue; -+ } - -- int ret = check -- ? check_bp_exists(trans, s, &bp, k) -- : bch2_bucket_backpointer_mod(trans, k, &bp, true); -- if (ret) -- return ret; -+ u64 b = PTR_BUCKET_NR(ca, &p.ptr); -+ if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) { -+ rcu_read_unlock(); -+ continue; - } -+ -+ bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); -+ rcu_read_unlock(); -+ -+ struct bkey_i_backpointer bp; -+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); -+ -+ int ret = !empty -+ ? check_bp_exists(trans, s, &bp, k) -+ : bch2_bucket_backpointer_mod(trans, k, &bp, true); -+ if (ret) -+ return ret; - } - - return 0; -@@ -630,7 +726,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, - retry: - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, - 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); -- b = bch2_btree_iter_peek_node(&iter); -+ b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; -@@ -649,14 +745,6 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, - return ret; - } - --static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) --{ -- return (struct bbpos) { -- .btree = bp.btree_id, -- .pos = bp.pos, -- }; --} -- - static u64 mem_may_pin_bytes(struct bch_fs *c) - { - struct sysinfo i; -@@ -715,69 +803,11 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - return ret; - } - --struct progress_indicator_state { -- unsigned long next_print; -- u64 nodes_seen; -- u64 nodes_total; -- struct btree *last_node; --}; -- --static inline void progress_init(struct progress_indicator_state *s, -- struct bch_fs *c, -- u64 btree_id_mask) -+static inline int bch2_fs_going_ro(struct bch_fs *c) - { -- memset(s, 0, sizeof(*s)); -- -- s->next_print = jiffies + HZ * 10; -- -- for (unsigned i = 0; i < BTREE_ID_NR; i++) { -- if (!(btree_id_mask & BIT_ULL(i))) -- continue; -- -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_btree, -- .btree.id = i, -- }; -- -- u64 v; -- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); -- s->nodes_total += div64_ul(v, btree_sectors(c)); -- } --} -- --static inline bool progress_update_p(struct progress_indicator_state *s) --{ -- bool ret = time_after_eq(jiffies, s->next_print); -- -- if (ret) -- s->next_print = jiffies + HZ * 10; -- return ret; --} -- --static void progress_update_iter(struct btree_trans *trans, -- struct progress_indicator_state *s, -- struct btree_iter *iter, -- const char *msg) --{ -- struct bch_fs *c = trans->c; -- struct btree *b = path_l(btree_iter_path(trans, iter))->b; -- -- s->nodes_seen += b != s->last_node; -- s->last_node = b; -- -- if (progress_update_p(s)) { -- struct printbuf buf = PRINTBUF; -- unsigned percent = s->nodes_total -- ? div64_u64(s->nodes_seen * 100, s->nodes_total) -- : 0; -- -- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", -- msg, percent, s->nodes_seen, s->nodes_total); -- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); -- -- bch_info(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } -+ return test_bit(BCH_FS_going_ro, &c->flags) -+ ? -EROFS -+ : 0; - } - - static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, -@@ -787,7 +817,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct progress_indicator_state progress; - int ret = 0; - -- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); -+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); - - for (enum btree_id btree_id = 0; - btree_id < btree_id_nr_alive(c); -@@ -806,7 +836,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ -- progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); -+ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); -+ bch2_fs_going_ro(c) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - })); -@@ -827,7 +858,7 @@ enum alloc_sector_counter { - ALLOC_SECTORS_NR - }; - --static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) -+static int data_type_to_alloc_counter(enum bch_data_type t) - { - switch (t) { - case BCH_DATA_btree: -@@ -836,15 +867,17 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t - case BCH_DATA_cached: - return ALLOC_cached; - case BCH_DATA_stripe: -+ case BCH_DATA_parity: - return ALLOC_stripe; - default: -- BUG(); -+ return -1; - } - } - - static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); - - static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, -+ bool *had_mismatch, - struct bkey_buf *last_flushed) - { - struct bch_fs *c = trans->c; -@@ -852,6 +885,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - bool need_commit = false; - -+ *had_mismatch = false; -+ - if (a->data_type == BCH_DATA_sb || - a->data_type == BCH_DATA_journal || - a->data_type == BCH_DATA_parity) -@@ -889,7 +924,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b - if (bp.v->bucket_gen != a->gen) - continue; - -- sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; -+ int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); -+ if (alloc_counter < 0) -+ continue; -+ -+ sectors[alloc_counter] += bp.v->bucket_len; - }; - bch2_trans_iter_exit(trans, &iter); - if (ret) -@@ -901,9 +940,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b - goto err; - } - -- /* Cached pointers don't have backpointers: */ -- - if (sectors[ALLOC_dirty] != a->dirty_sectors || -+ sectors[ALLOC_cached] != a->cached_sectors || - sectors[ALLOC_stripe] != a->stripe_sectors) { - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { - ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); -@@ -912,17 +950,25 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b - } - - if (sectors[ALLOC_dirty] > a->dirty_sectors || -+ sectors[ALLOC_cached] > a->cached_sectors || - sectors[ALLOC_stripe] > a->stripe_sectors) { - ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - -BCH_ERR_transaction_restart_nested; - goto err; - } - -- if (!sectors[ALLOC_dirty] && -- !sectors[ALLOC_stripe]) -- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); -- else -- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); -+ bool empty = (sectors[ALLOC_dirty] + -+ sectors[ALLOC_stripe] + -+ sectors[ALLOC_cached]) == 0; -+ -+ ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, -+ alloc_k.k->p.offset) ?: -+ (empty -+ ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, -+ alloc_k.k->p.offset) -+ : 0); -+ -+ *had_mismatch = true; - } - err: - bch2_dev_put(ca); -@@ -946,8 +992,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) - goto next; - - struct bpos bucket = bp_pos_to_bucket(ca, pos); -- bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, -- ca->mi.nbuckets, bucket.offset); -+ u64 next = ca->mi.nbuckets; -+ -+ unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); -+ if (bitmap) -+ next = min_t(u64, next, -+ find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); -+ -+ bucket.offset = next; - if (bucket.offset == ca->mi.nbuckets) - goto next; - -@@ -973,7 +1025,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, - { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); -- struct btree *b = bch2_btree_iter_peek_node(&iter); -+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; -@@ -1056,28 +1108,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) - { - int ret = 0; - -- /* -- * Can't allow devices to come/go/resize while we have bucket bitmaps -- * allocated -- */ -- lockdep_assert_held(&c->state_lock); -- -- for_each_member_device(c, ca) { -- BUG_ON(ca->bucket_backpointer_mismatches); -- ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), -- sizeof(unsigned long), -- GFP_KERNEL); -- ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), -- sizeof(unsigned long), -- GFP_KERNEL); -- if (!ca->bucket_backpointer_mismatches || -- !ca->bucket_backpointer_empty) { -- bch2_dev_put(ca); -- ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; -- goto err_free_bitmaps; -- } -- } -- - struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bp_start = POS_MIN }; - -@@ -1086,23 +1116,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) - - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, ({ -- check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); -+ bool had_mismatch; -+ bch2_fs_going_ro(c) ?: -+ check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); - })); - if (ret) - goto err; - -- u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; -+ u64 nr_buckets = 0, nr_mismatches = 0; - for_each_member_device(c, ca) { - nr_buckets += ca->mi.nbuckets; -- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); -- nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); -+ nr_mismatches += ca->bucket_backpointer_mismatch.nr; - } - -- if (!nr_mismatches && !nr_empty) -+ if (!nr_mismatches) - goto err; - - bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", -- nr_mismatches + nr_empty, nr_buckets); -+ nr_mismatches, nr_buckets); - - while (1) { - ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); -@@ -1133,22 +1164,71 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) - - s.bp_start = bpos_successor(s.bp_end); - } -+ -+ for_each_member_device(c, ca) { -+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); -+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); -+ } - err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); --err_free_bitmaps: -- for_each_member_device(c, ca) { -- kvfree(ca->bucket_backpointer_empty); -- ca->bucket_backpointer_empty = NULL; -- kvfree(ca->bucket_backpointer_mismatches); -- ca->bucket_backpointer_mismatches = NULL; -- } - - bch_err_fn(c, ret); - return ret; - } - -+static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, -+ struct bpos bucket, -+ bool *had_mismatch, -+ struct bkey_buf *last_flushed) -+{ -+ struct btree_iter alloc_iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, -+ BTREE_ID_alloc, bucket, -+ BTREE_ITER_cached); -+ int ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); -+ bch2_trans_iter_exit(trans, &alloc_iter); -+ return ret; -+} -+ -+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, -+ struct bch_dev *ca, u64 bucket, -+ bool copygc, -+ struct bkey_buf *last_flushed) -+{ -+ struct bch_fs *c = trans->c; -+ bool had_mismatch; -+ int ret = lockrestart_do(trans, -+ check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), -+ &had_mismatch, last_flushed)); -+ if (ret || !had_mismatch) -+ return ret; -+ -+ u64 nr = ca->bucket_backpointer_mismatch.nr; -+ u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; -+ -+ struct printbuf buf = PRINTBUF; -+ __bch2_log_msg_start(ca->name, &buf); -+ -+ prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", -+ bucket, nr, ca->mi.nbuckets); -+ -+ bch2_run_explicit_recovery_pass(c, &buf, -+ BCH_RECOVERY_PASS_check_extents_to_backpointers, -+ nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0); -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ return 0; -+} -+ -+/* backpointers -> extents */ -+ - static int check_one_backpointer(struct btree_trans *trans, - struct bbpos start, - struct bbpos end, -@@ -1206,11 +1286,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); -- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); -+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - - int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, ({ -- progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); -+ bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, k, &last_flushed); - })); - -@@ -1264,3 +1344,48 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) - bch_err_fn(c, ret); - return ret; - } -+ -+static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) -+{ -+ scoped_guard(mutex, &b->lock) { -+ if (!b->buckets) { -+ b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), -+ sizeof(unsigned long), GFP_KERNEL); -+ if (!b->buckets) -+ return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; -+ } -+ -+ b->nr += !__test_and_set_bit(bit, b->buckets); -+ } -+ -+ return 0; -+} -+ -+int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size) -+{ -+ scoped_guard(mutex, &b->lock) { -+ if (!b->buckets) -+ return 0; -+ -+ unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), -+ sizeof(unsigned long), GFP_KERNEL); -+ if (!n) -+ return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; -+ -+ memcpy(n, b->buckets, -+ BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); -+ kvfree(b->buckets); -+ b->buckets = n; -+ } -+ -+ return 0; -+} -+ -+void bch2_bucket_bitmap_free(struct bucket_bitmap *b) -+{ -+ mutex_lock(&b->lock); -+ kvfree(b->buckets); -+ b->buckets = NULL; -+ b->nr = 0; -+ mutex_unlock(&b->lock); -+} -diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h -index 060dad1521ee..6840561084ce 100644 ---- a/fs/bcachefs/backpointers.h -+++ b/fs/bcachefs/backpointers.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H --#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H -+#ifndef _BCACHEFS_BACKPOINTERS_H -+#define _BCACHEFS_BACKPOINTERS_H - - #include "btree_cache.h" - #include "btree_iter.h" -@@ -102,7 +102,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bkey_i_backpointer *bp, - bool insert) - { -- if (unlikely(bch2_backpointers_no_use_write_buffer)) -+ if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); - - if (!insert) { -@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: -- return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; -+ if (p.has_ec) -+ return BCH_DATA_stripe; -+ if (p.ptr.cached) -+ return BCH_DATA_cached; -+ else -+ return BCH_DATA_user; - case KEY_TYPE_stripe: { - const struct bch_extent_ptr *ptr = &entry->ptr; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -@@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - struct bkey_i_backpointer *bp) - { - bkey_backpointer_init(&bp->k_i); -- bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); -+ bp->k.p.inode = p.ptr.dev; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; -+ else { -+ /* -+ * Put stripe backpointers where they won't collide with the -+ * extent backpointers within the stripe: -+ */ -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << -+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; -+ } -+ - bp->v = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, -@@ -164,8 +182,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b - struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, struct bkey_buf *); - -+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, -+ bool, struct bkey_buf *); -+ - int bch2_check_btree_backpointers(struct bch_fs *); - int bch2_check_extents_to_backpointers(struct bch_fs *); - int bch2_check_backpointers_to_extents(struct bch_fs *); - -+static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) -+{ -+ unsigned long *bitmap = READ_ONCE(b->buckets); -+ return bitmap && test_bit(i, bitmap); -+} -+ -+int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64); -+void bch2_bucket_bitmap_free(struct bucket_bitmap *); -+ - #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -index 161cf2f05d2a..7824da2af9d0 100644 ---- a/fs/bcachefs/bcachefs.h -+++ b/fs/bcachefs/bcachefs.h -@@ -203,22 +203,24 @@ - #include - #include - #include -+#include - - #include "bcachefs_format.h" - #include "btree_journal_iter_types.h" - #include "disk_accounting_types.h" - #include "errcode.h" -+#include "fast_list.h" - #include "fifo.h" - #include "nocow_locking_types.h" - #include "opts.h" --#include "recovery_passes_types.h" - #include "sb-errors_types.h" - #include "seqmutex.h" -+#include "snapshot_types.h" - #include "time_stats.h" - #include "util.h" - - #ifdef CONFIG_BCACHEFS_DEBUG --#define BCH_WRITE_REF_DEBUG -+#define ENUMERATED_REF_DEBUG - #endif - - #ifndef dynamic_fault -@@ -268,7 +270,8 @@ do { \ - - #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") - --void bch2_print_str(struct bch_fs *, const char *); -+void bch2_print_str(struct bch_fs *, const char *, const char *); -+void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); - - __printf(2, 3) - void bch2_print_opts(struct bch_opts *, const char *, ...); -@@ -292,6 +295,16 @@ do { \ - bch2_print(_c, __VA_ARGS__); \ - } while (0) - -+#define bch2_print_str_ratelimited(_c, ...) \ -+do { \ -+ static DEFINE_RATELIMIT_STATE(_rs, \ -+ DEFAULT_RATELIMIT_INTERVAL, \ -+ DEFAULT_RATELIMIT_BURST); \ -+ \ -+ if (__ratelimit(&_rs)) \ -+ bch2_print_str(_c, __VA_ARGS__); \ -+} while (0) -+ - #define bch_info(c, fmt, ...) \ - bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) - #define bch_info_ratelimited(c, fmt, ...) \ -@@ -389,17 +402,20 @@ do { \ - "compare them") \ - BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ - "Don't use the write buffer for backpointers, enabling "\ -- "extra runtime checks") -- --/* Parameters that should only be compiled in debug mode: */ --#define BCH_DEBUG_PARAMS_DEBUG() \ -- BCH_DEBUG_PARAM(expensive_debug_checks, \ -- "Enables various runtime debugging checks that " \ -- "significantly affect performance") \ -+ "extra runtime checks") \ -+ BCH_DEBUG_PARAM(debug_check_btree_locking, \ -+ "Enable additional asserts for btree locking") \ - BCH_DEBUG_PARAM(debug_check_iterators, \ - "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_bset_lookups, \ -+ "Enables extra verification for bset lookups") \ - BCH_DEBUG_PARAM(debug_check_btree_accounting, \ - "Verify btree accounting for keys within a node") \ -+ BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ -+ "Enables extra verification for bkey unpack") -+ -+/* Parameters that should only be compiled in debug mode: */ -+#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(journal_seq_verify, \ - "Store the journal sequence number in the version " \ - "number of every btree key, and verify that btree " \ -@@ -426,28 +442,28 @@ do { \ - #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() - #endif - --#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; --BCH_DEBUG_PARAMS() -+#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; -+BCH_DEBUG_PARAMS_ALL() - #undef BCH_DEBUG_PARAM - --#ifndef CONFIG_BCACHEFS_DEBUG --#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; --BCH_DEBUG_PARAMS_DEBUG() --#undef BCH_DEBUG_PARAM --#endif -- - #define BCH_TIME_STATS() \ - x(btree_node_mem_alloc) \ - x(btree_node_split) \ - x(btree_node_compact) \ - x(btree_node_merge) \ - x(btree_node_sort) \ -+ x(btree_node_get) \ - x(btree_node_read) \ - x(btree_node_read_done) \ -+ x(btree_node_write) \ - x(btree_interior_update_foreground) \ - x(btree_interior_update_total) \ - x(btree_gc) \ - x(data_write) \ -+ x(data_write_to_submit) \ -+ x(data_write_to_queue) \ -+ x(data_write_to_btree_update) \ -+ x(data_write_btree_update) \ - x(data_read) \ - x(data_promote) \ - x(journal_flush_write) \ -@@ -456,6 +472,7 @@ BCH_DEBUG_PARAMS_DEBUG() - x(blocked_journal_low_on_space) \ - x(blocked_journal_low_on_pin) \ - x(blocked_journal_max_in_flight) \ -+ x(blocked_journal_max_open) \ - x(blocked_key_cache_flush) \ - x(blocked_allocate) \ - x(blocked_allocate_open_bucket) \ -@@ -470,6 +487,7 @@ enum bch_time_stats { - }; - - #include "alloc_types.h" -+#include "async_objs_types.h" - #include "btree_gc_types.h" - #include "btree_types.h" - #include "btree_node_scan_types.h" -@@ -479,10 +497,12 @@ enum bch_time_stats { - #include "clock_types.h" - #include "disk_groups_types.h" - #include "ec_types.h" -+#include "enumerated_ref_types.h" - #include "journal_types.h" - #include "keylist_types.h" - #include "quota_types.h" - #include "rebalance_types.h" -+#include "recovery_passes_types.h" - #include "replicas_types.h" - #include "sb-members_types.h" - #include "subvolume_types.h" -@@ -511,6 +531,57 @@ struct discard_in_flight { - u64 bucket:63; - }; - -+#define BCH_DEV_READ_REFS() \ -+ x(bch2_online_devs) \ -+ x(trans_mark_dev_sbs) \ -+ x(read_fua_test) \ -+ x(sb_field_resize) \ -+ x(write_super) \ -+ x(journal_read) \ -+ x(fs_journal_alloc) \ -+ x(fs_resize_on_mount) \ -+ x(btree_node_read) \ -+ x(btree_node_read_all_replicas) \ -+ x(btree_node_scrub) \ -+ x(btree_node_write) \ -+ x(btree_node_scan) \ -+ x(btree_verify_replicas) \ -+ x(btree_node_ondisk_to_text) \ -+ x(io_read) \ -+ x(check_extent_checksums) \ -+ x(ec_block) -+ -+enum bch_dev_read_ref { -+#define x(n) BCH_DEV_READ_REF_##n, -+ BCH_DEV_READ_REFS() -+#undef x -+ BCH_DEV_READ_REF_NR, -+}; -+ -+#define BCH_DEV_WRITE_REFS() \ -+ x(journal_write) \ -+ x(journal_do_discards) \ -+ x(dev_do_discards) \ -+ x(discard_one_bucket_fast) \ -+ x(do_invalidates) \ -+ x(nocow_flush) \ -+ x(io_write) \ -+ x(ec_block) \ -+ x(ec_bucket_zero) -+ -+enum bch_dev_write_ref { -+#define x(n) BCH_DEV_WRITE_REF_##n, -+ BCH_DEV_WRITE_REFS() -+#undef x -+ BCH_DEV_WRITE_REF_NR, -+}; -+ -+struct bucket_bitmap { -+ unsigned long *buckets; -+ u64 nr; -+ struct mutex lock; -+}; -+ - struct bch_dev { - struct kobject kobj; - #ifdef CONFIG_BCACHEFS_DEBUG -@@ -521,8 +592,7 @@ struct bch_dev { - struct percpu_ref ref; - #endif - struct completion ref_completion; -- struct percpu_ref io_ref; -- struct completion io_ref_completion; -+ struct enumerated_ref io_ref[2]; - - struct bch_fs *fs; - -@@ -533,6 +603,7 @@ struct bch_dev { - */ - struct bch_member_cpu mi; - atomic64_t errors[BCH_MEMBER_ERROR_NR]; -+ unsigned long write_errors_start; - - __uuid_t uuid; - char name[BDEVNAME_SIZE]; -@@ -555,10 +626,11 @@ struct bch_dev { - u8 *oldest_gen; - unsigned long *buckets_nouse; - -- unsigned long *bucket_backpointer_mismatches; -- unsigned long *bucket_backpointer_empty; -+ struct bucket_bitmap bucket_backpointer_mismatch; -+ struct bucket_bitmap bucket_backpointer_empty; - -- struct bch_dev_usage __percpu *usage; -+ struct bch_dev_usage_full __percpu -+ *usage; - - /* Allocator: */ - u64 alloc_cursor[3]; -@@ -567,10 +639,6 @@ struct bch_dev { - unsigned nr_partial_buckets; - unsigned nr_btree_reserve; - -- size_t inc_gen_needs_gc; -- size_t inc_gen_really_needs_gc; -- size_t buckets_waiting_on_journal; -- - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; -@@ -609,21 +677,23 @@ struct bch_dev { - x(accounting_replay_done) \ - x(may_go_rw) \ - x(rw) \ -+ x(rw_init_done) \ - x(was_rw) \ - x(stopping) \ - x(emergency_ro) \ - x(going_ro) \ - x(write_disable_complete) \ - x(clean_shutdown) \ -- x(recovery_running) \ -- x(fsck_running) \ -+ x(in_recovery) \ -+ x(in_fsck) \ - x(initial_gc_unfixed) \ - x(need_delete_dead_snapshots) \ - x(error) \ - x(topology_error) \ - x(errors_fixed) \ - x(errors_not_fixed) \ -- x(no_invalid_checks) -+ x(no_invalid_checks) \ -+ x(discard_mount_opt_set) \ - - enum bch_fs_flags { - #define x(n) BCH_FS_##n, -@@ -642,8 +712,10 @@ struct btree_transaction_stats { - struct bch2_time_stats lock_hold_times; - struct mutex lock; - unsigned nr_max_paths; -- unsigned journal_entries_size; - unsigned max_mem; -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ darray_trans_kmalloc_trace trans_kmalloc_trace; -+#endif - char *max_paths_text; - }; - -@@ -664,9 +736,6 @@ struct btree_trans_buf { - struct btree_trans *trans; - }; - --#define BCACHEFS_ROOT_SUBVOL_INUM \ -- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) -- - #define BCH_WRITE_REFS() \ - x(journal) \ - x(trans) \ -@@ -687,7 +756,9 @@ struct btree_trans_buf { - x(gc_gens) \ - x(snapshot_delete_pagecache) \ - x(sysfs) \ -- x(btree_write_buffer) -+ x(btree_write_buffer) \ -+ x(btree_node_scrub) \ -+ x(async_recovery_passes) - - enum bch_write_ref { - #define x(n) BCH_WRITE_REF_##n, -@@ -696,6 +767,8 @@ enum bch_write_ref { - BCH_WRITE_REF_NR, - }; - -+#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0) -+ - struct bch_fs { - struct closure cl; - -@@ -719,11 +792,7 @@ struct bch_fs { - struct rw_semaphore state_lock; - - /* Counts outstanding writes, for clean transition to read-only */ --#ifdef BCH_WRITE_REF_DEBUG -- atomic_long_t writes[BCH_WRITE_REF_NR]; --#else -- struct percpu_ref writes; --#endif -+ struct enumerated_ref writes; - /* - * Certain operations are only allowed in single threaded mode, during - * recovery, and we want to assert that this is the case: -@@ -767,6 +836,7 @@ struct bch_fs { - - u8 nr_devices; - u8 clean; -+ bool multi_device; /* true if we've ever had more than one device */ - - u8 encryption_type; - -@@ -776,10 +846,16 @@ struct bch_fs { - unsigned nsec_per_time_unit; - u64 features; - u64 compat; -+ u64 recovery_passes_required; - unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; - u64 btrees_lost_data; - } sb; -+ DARRAY(enum bcachefs_metadata_version) -+ incompat_versions_requested; - -+#ifdef CONFIG_UNICODE -+ struct unicode_map *cf_encoding; -+#endif - - struct bch_sb_handle disk_sb; - -@@ -795,7 +871,7 @@ struct bch_fs { - struct mutex snapshot_table_lock; - struct rw_semaphore snapshot_create_lock; - -- struct work_struct snapshot_delete_work; -+ struct snapshot_delete snapshot_delete; - struct work_struct snapshot_wait_for_pagecache_and_delete_work; - snapshot_id_list snapshots_unlinked; - struct mutex snapshots_unlinked_lock; -@@ -860,7 +936,7 @@ struct bch_fs { - struct btree_write_buffer btree_write_buffer; - - struct workqueue_struct *btree_update_wq; -- struct workqueue_struct *btree_io_complete_wq; -+ struct workqueue_struct *btree_write_complete_wq; - /* copygc needs its own workqueue for index updates.. */ - struct workqueue_struct *copygc_wq; - /* -@@ -871,6 +947,7 @@ struct bch_fs { - struct workqueue_struct *write_ref_wq; - - /* ALLOCATION */ -+ struct bch_devs_mask online_devs; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; - unsigned long rw_devs_change_count; - -@@ -965,13 +1042,16 @@ struct bch_fs { - nocow_locks; - struct rhashtable promote_table; - -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; -+#endif -+ - mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - size_t zstd_workspace_size; - -- struct crypto_shash *sha256; -- struct crypto_sync_skcipher *chacha20; -- struct crypto_shash *poly1305; -+ struct bch_key chacha20_key; -+ bool chacha20_key_set; - - atomic64_t key_version; - -@@ -993,15 +1073,11 @@ struct bch_fs { - wait_queue_head_t copygc_running_wq; - - /* STRIPES: */ -- GENRADIX(struct stripe) stripes; - GENRADIX(struct gc_stripe) gc_stripes; - - struct hlist_head ec_stripes_new[32]; - spinlock_t ec_stripes_new_lock; - -- ec_stripes_heap ec_stripes_heap; -- struct mutex ec_stripes_heap_lock; -- - /* ERASURE CODING */ - struct list_head ec_stripe_head_list; - struct mutex ec_stripe_head_lock; -@@ -1039,25 +1115,12 @@ struct bch_fs { - /* RECOVERY */ - u64 journal_replay_seq_start; - u64 journal_replay_seq_end; -- /* -- * Two different uses: -- * "Has this fsck pass?" - i.e. should this type of error be an -- * emergency read-only -- * And, in certain situations fsck will rewind to an earlier pass: used -- * for signaling to the toplevel code which pass we want to run now. -- */ -- enum bch_recovery_pass curr_recovery_pass; -- enum bch_recovery_pass next_recovery_pass; -- /* bitmask of recovery passes that we actually ran */ -- u64 recovery_passes_complete; -- /* never rewinds version of curr_recovery_pass */ -- enum bch_recovery_pass recovery_pass_done; -- spinlock_t recovery_pass_lock; -- struct semaphore online_fsck_mutex; -+ struct bch_fs_recovery recovery; - - /* DEBUG JUNK */ - struct dentry *fs_debug_dir; - struct dentry *btree_debug_dir; -+ struct dentry *async_obj_dir; - struct btree_debug btree_debug[BTREE_ID_NR]; - struct btree *verify_data; - struct btree_node *verify_ondisk; -@@ -1099,54 +1162,6 @@ struct bch_fs { - - extern struct wait_queue_head bch2_read_only_wait; - --static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) --{ --#ifdef BCH_WRITE_REF_DEBUG -- atomic_long_inc(&c->writes[ref]); --#else -- percpu_ref_get(&c->writes); --#endif --} -- --static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) --{ --#ifdef BCH_WRITE_REF_DEBUG -- return !test_bit(BCH_FS_going_ro, &c->flags) && -- atomic_long_inc_not_zero(&c->writes[ref]); --#else -- return percpu_ref_tryget(&c->writes); --#endif --} -- --static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) --{ --#ifdef BCH_WRITE_REF_DEBUG -- return !test_bit(BCH_FS_going_ro, &c->flags) && -- atomic_long_inc_not_zero(&c->writes[ref]); --#else -- return percpu_ref_tryget_live(&c->writes); --#endif --} -- --static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) --{ --#ifdef BCH_WRITE_REF_DEBUG -- long v = atomic_long_dec_return(&c->writes[ref]); -- -- BUG_ON(v < 0); -- if (v) -- return; -- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) -- if (atomic_long_read(&c->writes[i])) -- return; -- -- set_bit(BCH_FS_write_disable_complete, &c->flags); -- wake_up(&bch2_read_only_wait); --#else -- percpu_ref_put(&c->writes); --#endif --} -- - static inline bool bch2_ro_ref_tryget(struct bch_fs *c) - { - if (test_bit(BCH_FS_stopping, &c->flags)) -@@ -1247,4 +1262,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c) - #define BKEY_PADDED_ONSTACK(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } - -+/* -+ * This is needed because discard is both a filesystem option and a device -+ * option, and mount options are supposed to apply to that mount and not be -+ * persisted, i.e. if it's set as a mount option we can't propagate it to the -+ * device. -+ */ -+static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -+{ -+ return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) -+ ? c->opts.discard -+ : ca->mi.discard; -+} -+ - #endif /* _BCACHEFS_H */ -diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -index f70f0108401f..b4a04df5ea95 100644 ---- a/fs/bcachefs/bcachefs_format.h -+++ b/fs/bcachefs/bcachefs_format.h -@@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k) - #define __BKEY_PADDED(key, pad) \ - struct bkey_i key; __u64 key ## _pad[pad] - -+enum bch_bkey_type_flags { -+ BKEY_TYPE_strict_btree_checks = BIT(0), -+}; -+ - /* - * - DELETED keys are used internally to mark keys that should be ignored but - * override keys in composition order. Their version number is ignored. -@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k) - * - * - WHITEOUT: for hash table btrees - */ --#define BCH_BKEY_TYPES() \ -- x(deleted, 0) \ -- x(whiteout, 1) \ -- x(error, 2) \ -- x(cookie, 3) \ -- x(hash_whiteout, 4) \ -- x(btree_ptr, 5) \ -- x(extent, 6) \ -- x(reservation, 7) \ -- x(inode, 8) \ -- x(inode_generation, 9) \ -- x(dirent, 10) \ -- x(xattr, 11) \ -- x(alloc, 12) \ -- x(quota, 13) \ -- x(stripe, 14) \ -- x(reflink_p, 15) \ -- x(reflink_v, 16) \ -- x(inline_data, 17) \ -- x(btree_ptr_v2, 18) \ -- x(indirect_inline_data, 19) \ -- x(alloc_v2, 20) \ -- x(subvolume, 21) \ -- x(snapshot, 22) \ -- x(inode_v2, 23) \ -- x(alloc_v3, 24) \ -- x(set, 25) \ -- x(lru, 26) \ -- x(alloc_v4, 27) \ -- x(backpointer, 28) \ -- x(inode_v3, 29) \ -- x(bucket_gens, 30) \ -- x(snapshot_tree, 31) \ -- x(logged_op_truncate, 32) \ -- x(logged_op_finsert, 33) \ -- x(accounting, 34) \ -- x(inode_alloc_cursor, 35) -+#define BCH_BKEY_TYPES() \ -+ x(deleted, 0, 0) \ -+ x(whiteout, 1, 0) \ -+ x(error, 2, 0) \ -+ x(cookie, 3, 0) \ -+ x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \ -+ x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \ -+ x(extent, 6, BKEY_TYPE_strict_btree_checks) \ -+ x(reservation, 7, BKEY_TYPE_strict_btree_checks) \ -+ x(inode, 8, BKEY_TYPE_strict_btree_checks) \ -+ x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \ -+ x(dirent, 10, BKEY_TYPE_strict_btree_checks) \ -+ x(xattr, 11, BKEY_TYPE_strict_btree_checks) \ -+ x(alloc, 12, BKEY_TYPE_strict_btree_checks) \ -+ x(quota, 13, BKEY_TYPE_strict_btree_checks) \ -+ x(stripe, 14, BKEY_TYPE_strict_btree_checks) \ -+ x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \ -+ x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \ -+ x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \ -+ x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \ -+ x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \ -+ x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \ -+ x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \ -+ x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \ -+ x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \ -+ x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \ -+ x(set, 25, 0) \ -+ x(lru, 26, BKEY_TYPE_strict_btree_checks) \ -+ x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \ -+ x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \ -+ x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \ -+ x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \ -+ x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \ -+ x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ -+ x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ -+ x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ -+ x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) - - enum bch_bkey_type { --#define x(name, nr) KEY_TYPE_##name = nr, -+#define x(name, nr, ...) KEY_TYPE_##name = nr, - BCH_BKEY_TYPES() - #undef x - KEY_TYPE_MAX, -@@ -493,7 +497,8 @@ struct bch_sb_field { - x(members_v2, 11) \ - x(errors, 12) \ - x(ext, 13) \ -- x(downgrade, 14) -+ x(downgrade, 14) \ -+ x(recovery_passes, 15) - - #include "alloc_background_format.h" - #include "dirent_format.h" -@@ -506,6 +511,7 @@ struct bch_sb_field { - #include "logged_ops_format.h" - #include "lru_format.h" - #include "quota_format.h" -+#include "recovery_passes_format.h" - #include "reflink_format.h" - #include "replicas_format.h" - #include "snapshot_format.h" -@@ -686,7 +692,15 @@ struct bch_sb_field_ext { - x(inode_depth, BCH_VERSION(1, 17)) \ - x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ - x(autofix_errors, BCH_VERSION(1, 19)) \ -- x(directory_size, BCH_VERSION(1, 20)) -+ x(directory_size, BCH_VERSION(1, 20)) \ -+ x(cached_backpointers, BCH_VERSION(1, 21)) \ -+ x(stripe_backpointers, BCH_VERSION(1, 22)) \ -+ x(stripe_lru, BCH_VERSION(1, 23)) \ -+ x(casefolding, BCH_VERSION(1, 24)) \ -+ x(extent_flags, BCH_VERSION(1, 25)) \ -+ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ -+ x(fast_device_removal, BCH_VERSION(1, 27)) \ -+ x(inode_has_case_insensitive, BCH_VERSION(1, 28)) - - enum bcachefs_metadata_version { - bcachefs_metadata_version_min = 9, -@@ -837,6 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); - LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); - LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); - LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -+LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); - LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); - LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); - LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); -@@ -855,6 +870,11 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); - LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, - struct bch_sb, flags[5], 48, 64); - LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); -+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); -+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); -+LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); -+LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); -+LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); - - static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) - { -@@ -908,7 +928,10 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u - x(journal_no_flush, 16) \ - x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) \ -- x(incompat_version_field, 19) -+ x(incompat_version_field, 19) \ -+ x(casefolding, 20) \ -+ x(no_alloc_info, 21) \ -+ x(small_image, 22) - - #define BCH_SB_FEATURES_ALWAYS \ - (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ -@@ -922,7 +945,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u - BIT_ULL(BCH_FEATURE_new_siphash)| \ - BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ - BIT_ULL(BCH_FEATURE_new_varint)| \ -- BIT_ULL(BCH_FEATURE_journal_no_flush)) -+ BIT_ULL(BCH_FEATURE_journal_no_flush)| \ -+ BIT_ULL(BCH_FEATURE_incompat_version_field)) - - enum bch_sb_feature { - #define x(f, n) BCH_FEATURE_##f, -@@ -974,6 +998,19 @@ enum bch_error_actions { - BCH_ON_ERROR_NR - }; - -+#define BCH_DEGRADED_ACTIONS() \ -+ x(ask, 0) \ -+ x(yes, 1) \ -+ x(very, 2) \ -+ x(no, 3) -+ -+enum bch_degraded_actions { -+#define x(t, n) BCH_DEGRADED_##t = n, -+ BCH_DEGRADED_ACTIONS() -+#undef x -+ BCH_DEGRADED_ACTIONS_NR -+}; -+ - #define BCH_STR_HASH_TYPES() \ - x(crc32c, 0) \ - x(crc64, 1) \ -@@ -1133,7 +1170,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) - x(log, 9) \ - x(overwrite, 10) \ - x(write_buffer_keys, 11) \ -- x(datetime, 12) -+ x(datetime, 12) \ -+ x(log_bkey, 13) - - enum bch_jset_entry_type { - #define x(f, nr) BCH_JSET_ENTRY_##f = nr, -diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h -index 3c23bdf788ce..52594e925eb7 100644 ---- a/fs/bcachefs/bcachefs_ioctl.h -+++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -87,6 +87,7 @@ struct bch_ioctl_incremental { - #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) - #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) - #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) -+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) - - /* ioctl below act on a particular file, not the filesystem as a whole: */ - -@@ -213,6 +214,10 @@ struct bch_ioctl_data { - struct bpos end_pos; - - union { -+ struct { -+ __u32 dev; -+ __u32 data_types; -+ } scrub; - struct { - __u32 dev; - __u32 pad; -@@ -229,6 +234,11 @@ enum bch_data_event { - BCH_DATA_EVENT_NR = 1, - }; - -+enum data_progress_data_type_special { -+ DATA_PROGRESS_DATA_TYPE_phys = 254, -+ DATA_PROGRESS_DATA_TYPE_done = 255, -+}; -+ - struct bch_ioctl_data_progress { - __u8 data_type; - __u8 btree_id; -@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress { - - __u64 sectors_done; - __u64 sectors_total; -+ __u64 sectors_error_corrected; -+ __u64 sectors_error_uncorrected; - } __packed __aligned(8); - -+enum bch_ioctl_data_event_ret { -+ BCH_IOCTL_DATA_EVENT_RET_done = 1, -+ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, -+}; -+ - struct bch_ioctl_data_event { - __u8 type; -- __u8 pad[7]; -+ __u8 ret; -+ __u8 pad[6]; - union { - struct bch_ioctl_data_progress p; - __u64 pad2[15]; -@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting { - struct bkey_i_accounting accounting[]; - }; - -+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) -+ -+struct bch_ioctl_query_counters { -+ __u16 nr; -+ __u16 flags; -+ __u32 pad; -+ __u64 d[]; -+}; -+ - #endif /* _BCACHEFS_IOCTL_H */ -diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c -index 995ba32e9b6e..ee823c640642 100644 ---- a/fs/bcachefs/bkey.c -+++ b/fs/bcachefs/bkey.c -@@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, - } - } - --#ifdef CONFIG_BCACHEFS_DEBUG -- --static void bch2_bkey_pack_verify(const struct bkey_packed *packed, -- const struct bkey *unpacked, -- const struct bkey_format *format) -+static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) - { - struct bkey tmp; - -@@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - } - } - --#else - static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, -- const struct bkey *unpacked, -- const struct bkey_format *format) {} --#endif -+ const struct bkey *unpacked, -+ const struct bkey_format *format) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) -+ __bch2_bkey_pack_verify(packed, unpacked, format); -+} - - struct pack_state { - const struct bkey_format *format; -@@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) - return ret; - } - --#ifdef CONFIG_BCACHEFS_DEBUG - static bool bkey_packed_successor(struct bkey_packed *out, - const struct btree *b, - struct bkey_packed k) -@@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f) - - return false; - } --#endif - - /* - * Returns a packed key that compares <= in -@@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, - const struct bkey_format *f = &b->format; - struct pack_state state = pack_state_init(f, out); - u64 *w = out->_data; --#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos orig = in; --#endif - bool exact = true; - unsigned i; - -@@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, - out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_deleted; - --#ifdef CONFIG_BCACHEFS_DEBUG -- if (exact) { -- BUG_ON(bkey_cmp_left_packed(b, out, &orig)); -- } else { -- struct bkey_packed successor; -+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { -+ if (exact) { -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); -+ } else { -+ struct bkey_packed successor; - -- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); -- BUG_ON(bkey_packed_successor(&successor, b, *out) && -- bkey_cmp_left_packed(b, &successor, &orig) < 0 && -- !bkey_format_has_too_big_fields(f)); -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); -+ BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0 && -+ !bkey_format_has_too_big_fields(f)); -+ } - } --#endif - - return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; - } -@@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) - } - } - --#ifdef CONFIG_BCACHEFS_DEBUG -- { -+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct printbuf buf = PRINTBUF; - - BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); - printbuf_exit(&buf); - } --#endif -+ - return ret; - } - -diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h -index 054e2d5e8448..3ccd521c190a 100644 ---- a/fs/bcachefs/bkey.h -+++ b/fs/bcachefs/bkey.h -@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) - static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) - { - return bpos_eq(l.k->p, r.k->p) && -+ l.k->size == r.k->size && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); - } -@@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(dst, src); - -- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -- bch2_expensive_debug_checks) { -+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c -index 15c93576b5c2..fcd8c82cba4f 100644 ---- a/fs/bcachefs/bkey_methods.c -+++ b/fs/bcachefs/bkey_methods.c -@@ -21,7 +21,7 @@ - #include "xattr.h" - - const char * const bch2_bkey_types[] = { --#define x(name, nr) #name, -+#define x(name, nr, ...) #name, - BCH_BKEY_TYPES() - #undef x - NULL -@@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ - }) - - const struct bkey_ops bch2_bkey_ops[] = { --#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -+#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name, - BCH_BKEY_TYPES() - #undef x - }; -@@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = { - #undef x - }; - -+static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { -+#define x(name, nr, flags) [KEY_TYPE_##name] = flags, -+ BCH_BKEY_TYPES() -+#undef x -+}; -+ - const char *bch2_btree_node_type_str(enum btree_node_type type) - { - return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); -@@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - if (type >= BKEY_TYPE_NR) - return 0; - -- bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && -- (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && -+ enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX -+ ? bch2_bkey_type_flags[k.k->type] -+ : 0; -+ -+ bool strict_key_type_allowed = -+ (from.flags & BCH_VALIDATE_commit) || -+ type == BKEY_TYPE_btree || -+ (from.btree < BTREE_ID_NR && -+ (bkey_flags & BKEY_TYPE_strict_btree_checks)); -+ -+ bkey_fsck_err_on(strict_key_type_allowed && -+ k.k->type < KEY_TYPE_MAX && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), - c, bkey_invalid_type_for_btree, - "invalid key type for btree %s (%s)", -@@ -340,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) - return ops->key_merge && - bch2_bkey_maybe_mergable(l.k, r.k) && - (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && -- !bch2_key_merging_disabled && -+ !static_branch_unlikely(&bch2_key_merging_disabled) && - ops->key_merge(c, l, r); - } - -diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c -index 9a4a83d6fd2d..32841f762eb2 100644 ---- a/fs/bcachefs/bset.c -+++ b/fs/bcachefs/bset.c -@@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) - return nr; - } - --#ifdef CONFIG_BCACHEFS_DEBUG -- - void __bch2_verify_btree_nr_keys(struct btree *b) - { - struct btree_nr_keys nr = bch2_btree_node_count_keys(b); -@@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) - BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); - } - --static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, -+static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, - struct btree *b) - { - struct btree_node_iter iter = *_iter; -@@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, - } - } - --void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -- struct btree *b) -+void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) - { - struct btree_node_iter_set *set, *s2; - struct bkey_packed *k, *p; -@@ -237,8 +235,8 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - } - } - --void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, -- struct bkey_packed *insert, unsigned clobber_u64s) -+static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, -+ struct bkey_packed *insert, unsigned clobber_u64s) - { - struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); -@@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - #endif - } - --#else -- --static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, -- struct btree *b) {} -+static inline void bch2_verify_insert_pos(struct btree *b, -+ struct bkey_packed *where, -+ struct bkey_packed *insert, -+ unsigned clobber_u64s) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) -+ __bch2_verify_insert_pos(b, where, insert, clobber_u64s); -+} - --#endif - - /* Auxiliary search trees */ - -@@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b, - return ro_aux_tree_base(b, t)->f + idx; - } - --static void bset_aux_tree_verify(struct btree *b) -+static void __bset_aux_tree_verify(struct btree *b) - { --#ifdef CONFIG_BCACHEFS_DEBUG - for_each_bset(b, t) { - if (t->aux_data_offset == U16_MAX) - continue; -@@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b) - BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); - BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); - } --#endif -+} -+ -+static inline void bset_aux_tree_verify(struct btree *b) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) -+ __bset_aux_tree_verify(b); - } - - void bch2_btree_keys_init(struct btree *b) -@@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, - }; - } - --static void bch2_bset_verify_rw_aux_tree(struct btree *b, -- struct bset_tree *t) -+static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) - { - struct bkey_packed *k = btree_bkey_first(b, t); - unsigned j = 0; - -- if (!bch2_expensive_debug_checks) -- return; -- - BUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) -@@ -530,6 +531,13 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, - } - } - -+static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) -+ __bch2_bset_verify_rw_aux_tree(b, t); -+} -+ - /* returns idx of first entry >= offset: */ - static unsigned rw_aux_tree_bsearch(struct btree *b, - struct bset_tree *t, -@@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, - k = p; - } - -- if (bch2_expensive_debug_checks) { -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - BUG_ON(ret >= orig_k); - - for (i = ret -@@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, - bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_p_next(m); - -- if (bch2_expensive_debug_checks) { -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - BUG_ON(prev && -@@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, - void bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) - { -- if (bch2_expensive_debug_checks) { -- bch2_btree_node_iter_verify(iter, b); -- bch2_btree_node_iter_next_check(iter, b); -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { -+ __bch2_btree_node_iter_verify(iter, b); -+ __bch2_btree_node_iter_next_check(iter, b); - } - - __bch2_btree_node_iter_advance(iter, b); -@@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree_node_iter_set *set; - unsigned end = 0; - -- if (bch2_expensive_debug_checks) -- bch2_btree_node_iter_verify(iter, b); -+ bch2_btree_node_iter_verify(iter, b); - - for_each_bset(b, t) { - k = bch2_bkey_prev_all(b, t, -@@ -1489,8 +1496,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - iter->data[0].k = __btree_node_key_to_offset(b, prev); - iter->data[0].end = end; - -- if (bch2_expensive_debug_checks) -- bch2_btree_node_iter_verify(iter, b); -+ bch2_btree_node_iter_verify(iter, b); - return prev; - } - -diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h -index 6953d55b72cc..a15ecf9d006e 100644 ---- a/fs/bcachefs/bset.h -+++ b/fs/bcachefs/bset.h -@@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); - void bch2_dump_btree_node(struct bch_fs *, struct btree *); - void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); - --#ifdef CONFIG_BCACHEFS_DEBUG -- - void __bch2_verify_btree_nr_keys(struct btree *); --void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); --void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, -- struct bkey_packed *, unsigned); -- --#else -+void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); - --static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} - static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -- struct btree *b) {} --static inline void bch2_verify_insert_pos(struct btree *b, -- struct bkey_packed *where, -- struct bkey_packed *insert, -- unsigned clobber_u64s) {} --#endif -+ struct btree *b) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) -+ __bch2_btree_node_iter_verify(iter, b); -+} - - static inline void bch2_verify_btree_nr_keys(struct btree *b) - { -- if (bch2_debug_check_btree_accounting) -+ if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) - __bch2_verify_btree_nr_keys(b); - } - -diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -index 1ec1f90e0eb3..a5d983309311 100644 ---- a/fs/bcachefs/btree_cache.c -+++ b/fs/bcachefs/btree_cache.c -@@ -15,14 +15,9 @@ - - #include - #include -+#include - #include - --#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ --do { \ -- if (shrinker_counter) \ -- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ --} while (0) -- - const char * const bch2_btree_node_flags[] = { - "typebit", - "typebit", -@@ -350,115 +345,118 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, - return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); - } - --/* -- * this version is for btree nodes that have already been freed (we're not -- * reaping a real btree node) -- */ --static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) -+static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, -+ bool flush, bool locked) - { - struct btree_cache *bc = &c->btree_cache; -- int ret = 0; - - lockdep_assert_held(&bc->lock); --wait_on_io: -- if (b->flags & ((1U << BTREE_NODE_dirty)| -- (1U << BTREE_NODE_read_in_flight)| -+ -+ if (btree_node_noevict(b)) { -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; -+ } -+ if (btree_node_write_blocked(b)) { -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; -+ } -+ if (btree_node_will_make_reachable(b)) { -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; -+ } -+ -+ if (btree_node_dirty(b)) { -+ if (!flush) { -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; -+ } -+ -+ if (locked) { -+ /* -+ * Using the underscore version because we don't want to compact -+ * bsets after the write, since this node is about to be evicted -+ * - unless btree verify mode is enabled, since it runs out of -+ * the post write cleanup: -+ */ -+ if (static_branch_unlikely(&bch2_verify_btree_ondisk)) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent, -+ BTREE_WRITE_cache_reclaim); -+ else -+ __bch2_btree_node_write(c, b, -+ BTREE_WRITE_cache_reclaim); -+ } -+ } -+ -+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { -- if (btree_node_dirty(b)) -- BTREE_CACHE_NOT_FREED_INCREMENT(dirty); -- else if (btree_node_read_in_flight(b)) -- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); -+ if (btree_node_read_in_flight(b)) -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; - else if (btree_node_write_in_flight(b)) -- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; - } - -+ if (locked) -+ return -EINTR; -+ - /* XXX: waiting on IO with btree cache lock held */ - bch2_btree_node_wait_on_read(b); - bch2_btree_node_wait_on_write(b); - } - -+ return 0; -+} -+ -+/* -+ * this version is for btree nodes that have already been freed (we're not -+ * reaping a real btree node) -+ */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ int ret = 0; -+ -+ lockdep_assert_held(&bc->lock); -+retry_unlocked: -+ ret = __btree_node_reclaim_checks(c, b, flush, false); -+ if (ret) -+ return ret; -+ - if (!six_trylock_intent(&b->c.lock)) { -- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; - } - - if (!six_trylock_write(&b->c.lock)) { -- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); -- goto out_unlock_intent; -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; -+ six_unlock_intent(&b->c.lock); -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; - } - - /* recheck under lock */ -- if (b->flags & ((1U << BTREE_NODE_read_in_flight)| -- (1U << BTREE_NODE_write_in_flight))) { -- if (!flush) { -- if (btree_node_read_in_flight(b)) -- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); -- else if (btree_node_write_in_flight(b)) -- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); -- goto out_unlock; -- } -+ ret = __btree_node_reclaim_checks(c, b, flush, true); -+ if (ret) { - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -- goto wait_on_io; -- } -- -- if (btree_node_noevict(b)) { -- BTREE_CACHE_NOT_FREED_INCREMENT(noevict); -- goto out_unlock; -- } -- if (btree_node_write_blocked(b)) { -- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); -- goto out_unlock; -- } -- if (btree_node_will_make_reachable(b)) { -- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); -- goto out_unlock; -+ if (ret == -EINTR) -+ goto retry_unlocked; -+ return ret; - } - -- if (btree_node_dirty(b)) { -- if (!flush) { -- BTREE_CACHE_NOT_FREED_INCREMENT(dirty); -- goto out_unlock; -- } -- /* -- * Using the underscore version because we don't want to compact -- * bsets after the write, since this node is about to be evicted -- * - unless btree verify mode is enabled, since it runs out of -- * the post write cleanup: -- */ -- if (bch2_verify_btree_ondisk) -- bch2_btree_node_write(c, b, SIX_LOCK_intent, -- BTREE_WRITE_cache_reclaim); -- else -- __bch2_btree_node_write(c, b, -- BTREE_WRITE_cache_reclaim); -- -- six_unlock_write(&b->c.lock); -- six_unlock_intent(&b->c.lock); -- goto wait_on_io; -- } --out: - if (b->hash_val && !ret) - trace_and_count(c, btree_cache_reap, c, b); -- return ret; --out_unlock: -- six_unlock_write(&b->c.lock); --out_unlock_intent: -- six_unlock_intent(&b->c.lock); -- ret = -BCH_ERR_ENOMEM_btree_node_reclaim; -- goto out; -+ return 0; - } - --static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b) - { -- return __btree_node_reclaim(c, b, false, shrinker_counter); -+ return __btree_node_reclaim(c, b, false); - } - - static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) - { -- return __btree_node_reclaim(c, b, true, false); -+ return __btree_node_reclaim(c, b, true); - } - - static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, -@@ -476,7 +474,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - -- if (bch2_btree_shrinker_disabled) -+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return SHRINK_STOP; - - mutex_lock(&bc->lock); -@@ -490,7 +488,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - * IO can always make forward progress: - */ - can_free = btree_cache_can_free(list); -- nr = min_t(unsigned long, nr, can_free); -+ if (nr > can_free) { -+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; -+ nr = can_free; -+ } - - i = 0; - list_for_each_entry_safe(b, t, &bc->freeable, list) { -@@ -506,7 +507,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - if (touched >= nr) - goto out; - -- if (!btree_node_reclaim(c, b, true)) { -+ if (!btree_node_reclaim(c, b)) { - btree_node_data_free(bc, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -@@ -522,7 +523,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - clear_btree_node_accessed(b); - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; - --touched;; -- } else if (!btree_node_reclaim(c, b, true)) { -+ } else if (!btree_node_reclaim(c, b)) { - __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(bc, b); - -@@ -569,12 +570,25 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, - { - struct btree_cache_list *list = shrink->private_data; - -- if (bch2_btree_shrinker_disabled) -+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return 0; - - return btree_cache_can_free(list); - } - -+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) -+{ -+ struct btree_cache_list *list = shrink->private_data; -+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); -+ -+ char *cbuf; -+ size_t buflen = seq_buf_get_buf(s, &cbuf); -+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); -+ -+ bch2_btree_cache_to_text(&out, bc); -+ seq_buf_commit(s, out.pos); -+} -+ - void bch2_fs_btree_cache_exit(struct bch_fs *c) - { - struct btree_cache *bc = &c->btree_cache; -@@ -610,6 +624,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) - btree_node_write_in_flight(b)); - - btree_node_data_free(bc, b); -+ cond_resched(); - } - - BUG_ON(!bch2_journal_error(&c->journal) && -@@ -665,6 +680,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) - bc->live[0].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; -+ shrink->to_text = bch2_btree_cache_shrinker_to_text; - shrink->seeks = 2; - shrink->private_data = &bc->live[0]; - shrinker_register(shrink); -@@ -675,6 +691,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) - bc->live[1].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; -+ shrink->to_text = bch2_btree_cache_shrinker_to_text; - shrink->seeks = 8; - shrink->private_data = &bc->live[1]; - shrinker_register(shrink); -@@ -754,7 +771,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) - - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) -- if (!btree_node_reclaim(c, b, false)) -+ if (!btree_node_reclaim(c, b)) - return b; - - while (1) { -@@ -789,7 +806,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea - * disk node. Check the freed list before allocating a new one: - */ - list_for_each_entry(b, freed, list) -- if (!btree_node_reclaim(c, b, false)) { -+ if (!btree_node_reclaim(c, b)) { - list_del_init(&b->list); - goto got_node; - } -@@ -816,7 +833,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea - * the list. Check if there's any freed nodes there: - */ - list_for_each_entry(b2, &bc->freeable, list) -- if (!btree_node_reclaim(c, b2, false)) { -+ if (!btree_node_reclaim(c, b2)) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - -@@ -851,7 +868,6 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea - b->sib_u64s[1] = 0; - b->whiteout_u64s = 0; - bch2_btree_keys_init(b); -- set_btree_node_accessed(b); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], - start_time); -@@ -977,7 +993,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, - - /* Unlock before doing IO: */ - six_unlock_intent(&b->c.lock); -- bch2_trans_unlock_noassert(trans); -+ bch2_trans_unlock(trans); - - bch2_btree_node_read(trans, b, sync); - -@@ -1003,7 +1019,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) - { - struct printbuf buf = PRINTBUF; - -- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) -+ if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - return; - - prt_printf(&buf, -@@ -1285,6 +1301,10 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, - six_unlock_read(&b->c.lock); - goto retry; - } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); - } - - /* XXX: waiting on IO with btree locks held: */ -@@ -1300,10 +1320,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, - prefetch(p + L1_CACHE_BYTES * 2); - } - -- /* avoid atomic set bit if it's not needed: */ -- if (!btree_node_accessed(b)) -- set_btree_node_accessed(b); -- - if (unlikely(btree_node_read_error(b))) { - six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); -@@ -1416,7 +1432,7 @@ void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, - prt_printf(out, "%u", r->level); - else - prt_printf(out, "(unknown)"); -- prt_printf(out, "\n "); -+ prt_newline(out); - - bch2_bkey_val_to_text(out, c, k); - } -@@ -1492,9 +1508,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc - - prt_btree_cache_line(out, c, "live:", bc->live[0].nr); - prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); -- prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); -+ prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); -+ prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); - prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); -- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); -+ prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held"); - prt_newline(out); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { -@@ -1505,6 +1522,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc - } - - prt_newline(out); -+ prt_printf(out, "counters since mount:\n"); - prt_printf(out, "freed:\t%zu\n", bc->nr_freed); - prt_printf(out, "not freed:\n"); - -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -index dd1d9b74076e..91b6395421df 100644 ---- a/fs/bcachefs/btree_gc.c -+++ b/fs/bcachefs/btree_gc.c -@@ -22,11 +22,13 @@ - #include "debug.h" - #include "disk_accounting.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extents.h" - #include "journal.h" - #include "keylist.h" - #include "move.h" -+#include "progress.h" - #include "recovery_passes.h" - #include "reflink.h" - #include "recovery.h" -@@ -46,6 +48,27 @@ - #define DROP_PREV_NODE 11 - #define DID_FILL_FROM_SCAN 12 - -+/* -+ * Returns true if it's a btree we can easily reconstruct, or otherwise won't -+ * cause data loss if it's missing: -+ */ -+static bool btree_id_important(enum btree_id btree) -+{ -+ if (btree_id_is_alloc(btree)) -+ return false; -+ -+ switch (btree) { -+ case BTREE_ID_quotas: -+ case BTREE_ID_snapshot_trees: -+ case BTREE_ID_logged_ops: -+ case BTREE_ID_rebalance_work: -+ case BTREE_ID_subvolume_children: -+ return false; -+ default: -+ return true; -+ } -+} -+ - static const char * const bch2_gc_phase_strs[] = { - #define x(n) #n, - GC_PHASES() -@@ -212,15 +235,15 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * - - prt_printf(&buf, " at "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -- prt_printf(&buf, ":\n parent: "); -+ prt_printf(&buf, ":\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - if (prev) { -- prt_printf(&buf, "\n prev: "); -+ prt_printf(&buf, "\nprev: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); - } - -- prt_str(&buf, "\n next: "); -+ prt_str(&buf, "\nnext: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - - if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ -@@ -279,12 +302,12 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, - if (bpos_eq(child->key.k.p, b->key.k.p)) - return 0; - -- prt_printf(&buf, " at "); -+ prt_printf(&buf, "\nat: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -- prt_printf(&buf, ":\n parent: "); -+ prt_printf(&buf, "\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - -- prt_str(&buf, "\n child: "); -+ prt_str(&buf, "\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); - - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, -@@ -348,21 +371,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - -- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), -- trans, btree_node_read_error, -- "Topology repair: unreadable btree node at\n" -- " %s", -- buf.buf)) { -+ if (bch2_err_matches(ret, EIO)) { - bch2_btree_node_evict(trans, cur_k.k); - cur = NULL; - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - if (ret) - break; -- -- ret = bch2_btree_lost_data(c, b->c.btree_id); -- if (ret) -- break; - continue; - } - -@@ -524,9 +539,6 @@ int bch2_check_topology(struct bch_fs *c) - bch2_btree_id_to_text(&buf, i); - - if (r->error) { -- ret = bch2_btree_lost_data(c, i); -- if (ret) -- break; - reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - -@@ -534,8 +546,10 @@ int bch2_check_topology(struct bch_fs *c) - r->error = 0; - - if (!bch2_btree_has_scanned_nodes(c, i)) { -- mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, -- "no nodes found for btree %s, continue?", buf.buf); -+ __fsck_err(trans, -+ FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), -+ btree_root_unreadable_and_scan_found_nothing, -+ "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); -@@ -605,13 +619,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - deleted.p = k.k->p; - - if (initial) { -- BUG_ON(bch2_journal_seq_verify && -+ BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && - k.k->bversion.lo > atomic64_read(&c->journal.seq)); - - if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->bversion.lo > atomic64_read(&c->key_version), - trans, bkey_version_in_future, -- "key version number higher than recorded %llu\n %s", -+ "key version number higher than recorded %llu\n%s", - atomic64_read(&c->key_version), - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - atomic64_set(&c->key_version, k.k->bversion.lo); -@@ -619,7 +633,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), - trans, btree_bitmap_not_marked, -- "btree ptr not marked in member info btree allocated bitmap\n %s", -+ "btree ptr not marked in member info btree allocated bitmap\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { -@@ -656,7 +670,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - return ret; - } - --static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) -+static int bch2_gc_btree(struct btree_trans *trans, -+ struct progress_indicator_state *progress, -+ enum btree_id btree, bool initial) - { - struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; -@@ -673,6 +689,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ -+ bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); - gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); - bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); - })); -@@ -688,7 +705,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, - 0, bch2_btree_id_root(c, btree)->b->c.level, 0); -- struct btree *b = bch2_btree_iter_peek_node(&iter); -+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err_root; -@@ -717,22 +734,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) - static int bch2_gc_btrees(struct bch_fs *c) - { - struct btree_trans *trans = bch2_trans_get(c); -- enum btree_id ids[BTREE_ID_NR]; - struct printbuf buf = PRINTBUF; -- unsigned i; - int ret = 0; - -- for (i = 0; i < BTREE_ID_NR; i++) -+ struct progress_indicator_state progress; -+ bch2_progress_init(&progress, c, ~0ULL); -+ -+ enum btree_id ids[BTREE_ID_NR]; -+ for (unsigned i = 0; i < BTREE_ID_NR; i++) - ids[i] = i; - bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - -- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { -+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - unsigned btree = i < BTREE_ID_NR ? ids[i] : i; - - if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) - continue; - -- ret = bch2_gc_btree(trans, btree, true); -+ ret = bch2_gc_btree(trans, &progress, btree, true); - } - - printbuf_exit(&buf); -@@ -1015,8 +1034,7 @@ int bch2_check_allocations(struct bch_fs *c) - { - int ret; - -- lockdep_assert_held(&c->state_lock); -- -+ down_read(&c->state_lock); - down_write(&c->gc_lock); - - bch2_btree_interior_updates_flush(c); -@@ -1054,12 +1072,17 @@ int bch2_check_allocations(struct bch_fs *c) - percpu_up_write(&c->mark_lock); - - up_write(&c->gc_lock); -+ up_read(&c->state_lock); - - /* - * At startup, allocations can happen directly instead of via the - * allocator thread - issue wakeup in case they blocked on gc_lock: - */ - closure_wake_up(&c->freelist_wait); -+ -+ if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) -+ bch2_sb_members_clean_deleted(c); -+ - bch_err_fn(c, ret); - return ret; - } -@@ -1194,7 +1217,7 @@ int bch2_gc_gens(struct bch_fs *c) - BCH_TRANS_COMMIT_no_enospc, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - if (!ca) { -- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - bch2_alloc_write_oldest_gen(trans, ca, &iter, k); -@@ -1228,26 +1251,21 @@ static void bch2_gc_gens_work(struct work_struct *work) - { - struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); - bch2_gc_gens(c); -- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); - } - - void bch2_gc_gens_async(struct bch_fs *c) - { -- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && -+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && - !queue_work(c->write_ref_wq, &c->gc_gens_work)) -- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); --} -- --void bch2_fs_btree_gc_exit(struct bch_fs *c) --{ -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); - } - --int bch2_fs_btree_gc_init(struct bch_fs *c) -+void bch2_fs_btree_gc_init_early(struct bch_fs *c) - { - seqcount_init(&c->gc_pos_lock); - INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); - - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); -- return 0; - } -diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h -index 9693a90a48a2..ec77662369a2 100644 ---- a/fs/bcachefs/btree_gc.h -+++ b/fs/bcachefs/btree_gc.h -@@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); - int bch2_gc_gens(struct bch_fs *); - void bch2_gc_gens_async(struct bch_fs *); - --void bch2_fs_btree_gc_exit(struct bch_fs *); --int bch2_fs_btree_gc_init(struct bch_fs *); -+void bch2_fs_btree_gc_init_early(struct bch_fs *); - - #endif /* _BCACHEFS_BTREE_GC_H */ -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -index 756736f9243d..34018296053a 100644 ---- a/fs/bcachefs/btree_io.c -+++ b/fs/bcachefs/btree_io.c -@@ -1,6 +1,8 @@ - // SPDX-License-Identifier: GPL-2.0 - - #include "bcachefs.h" -+#include "async_objs.h" -+#include "bkey_buf.h" - #include "bkey_methods.h" - #include "bkey_sort.h" - #include "btree_cache.h" -@@ -12,6 +14,7 @@ - #include "buckets.h" - #include "checksum.h" - #include "debug.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extents.h" - #include "io_write.h" -@@ -40,6 +43,7 @@ void bch2_btree_node_io_unlock(struct btree *b) - - clear_btree_node_write_in_flight_inner(b); - clear_btree_node_write_in_flight(b); -+ smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - } - -@@ -512,21 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) - - static void btree_err_msg(struct printbuf *out, struct bch_fs *c, - struct bch_dev *ca, -+ bool print_pos, - struct btree *b, struct bset *i, struct bkey_packed *k, -- unsigned offset, int write) -+ unsigned offset, int rw) - { -- prt_printf(out, bch2_log_msg(c, "%s"), -- write == READ -- ? "error validating btree node " -- : "corrupt btree node before write "); -- if (ca) -- prt_printf(out, "on %s ", ca->name); -- prt_printf(out, "at btree "); -- bch2_btree_pos_to_text(out, c, b); -+ if (print_pos) { -+ prt_str(out, rw == READ -+ ? "error validating btree node " -+ : "corrupt btree node before write "); -+ prt_printf(out, "at btree "); -+ bch2_btree_pos_to_text(out, c, b); -+ prt_newline(out); -+ } - -- printbuf_indent_add(out, 2); -+ if (ca) -+ prt_printf(out, "%s ", ca->name); - -- prt_printf(out, "\nnode offset %u/%u", -+ prt_printf(out, "node offset %u/%u", - b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); - if (i) - prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); -@@ -537,34 +543,32 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, - prt_str(out, ": "); - } - --__printf(10, 11) -+__printf(11, 12) - static int __btree_err(int ret, - struct bch_fs *c, - struct bch_dev *ca, - struct btree *b, - struct bset *i, - struct bkey_packed *k, -- int write, -- bool have_retry, -+ int rw, - enum bch_sb_error_id err_type, -+ struct bch_io_failures *failed, -+ struct printbuf *err_msg, - const char *fmt, ...) - { -- struct printbuf out = PRINTBUF; -- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; -- va_list args; -+ if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) -+ return -BCH_ERR_fsck_fix; - -- btree_err_msg(&out, c, ca, b, i, k, b->written, write); -+ bool have_retry = false; -+ int ret2; - -- va_start(args, fmt); -- prt_vprintf(&out, fmt, args); -- va_end(args); -+ if (ca) { -+ bch2_mark_btree_validate_failure(failed, ca->dev_idx); - -- if (write == WRITE) { -- bch2_print_string_as_lines(KERN_ERR, out.buf); -- ret = c->opts.errors == BCH_ON_ERROR_continue -- ? 0 -- : -BCH_ERR_fsck_errors_not_fixed; -- goto out; -+ struct extent_ptr_decoded pick; -+ have_retry = !bch2_bkey_pick_read_device(c, -+ bkey_i_to_s_c(&b->key), -+ failed, &pick, -1); - } - - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) -@@ -572,37 +576,77 @@ static int __btree_err(int ret, - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = -BCH_ERR_btree_node_read_err_bad_node; - -- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) -- bch2_sb_error_count(c, err_type); -+ bch2_sb_error_count(c, err_type); -+ -+ bool print_deferred = err_msg && -+ rw == READ && -+ !(test_bit(BCH_FS_in_fsck, &c->flags) && -+ c->opts.fix_errors == FSCK_FIX_ask); -+ -+ struct printbuf out = PRINTBUF; -+ bch2_log_msg_start(c, &out); -+ -+ if (!print_deferred) -+ err_msg = &out; -+ -+ btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); -+ -+ va_list args; -+ va_start(args, fmt); -+ prt_vprintf(err_msg, fmt, args); -+ va_end(args); -+ -+ if (print_deferred) { -+ prt_newline(err_msg); -+ -+ switch (ret) { -+ case -BCH_ERR_btree_node_read_err_fixable: -+ ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); -+ if (ret2 != -BCH_ERR_fsck_fix && -+ ret2 != -BCH_ERR_fsck_ignore) { -+ ret = ret2; -+ goto fsck_err; -+ } -+ -+ if (!have_retry) -+ ret = -BCH_ERR_fsck_fix; -+ goto out; -+ case -BCH_ERR_btree_node_read_err_bad_node: -+ prt_str(&out, ", "); -+ ret = __bch2_topology_error(c, &out); -+ break; -+ } -+ -+ goto out; -+ } -+ -+ if (rw == WRITE) { -+ prt_str(&out, ", "); -+ ret = __bch2_inconsistent_error(c, &out) -+ ? -BCH_ERR_fsck_errors_not_fixed -+ : 0; -+ goto print; -+ } - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: -- ret = !silent -- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) -- : -BCH_ERR_fsck_fix; -- if (ret != -BCH_ERR_fsck_fix && -- ret != -BCH_ERR_fsck_ignore) -+ ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); -+ if (ret2 != -BCH_ERR_fsck_fix && -+ ret2 != -BCH_ERR_fsck_ignore) { -+ ret = ret2; - goto fsck_err; -- ret = -BCH_ERR_fsck_fix; -- break; -- case -BCH_ERR_btree_node_read_err_want_retry: -- case -BCH_ERR_btree_node_read_err_must_retry: -- if (!silent) -- bch2_print_string_as_lines(KERN_ERR, out.buf); -- break; -+ } -+ -+ if (!have_retry) -+ ret = -BCH_ERR_fsck_fix; -+ goto out; - case -BCH_ERR_btree_node_read_err_bad_node: -- if (!silent) -- bch2_print_string_as_lines(KERN_ERR, out.buf); -- ret = bch2_topology_error(c); -- break; -- case -BCH_ERR_btree_node_read_err_incompatible: -- if (!silent) -- bch2_print_string_as_lines(KERN_ERR, out.buf); -- ret = -BCH_ERR_fsck_errors_not_fixed; -+ prt_str(&out, ", "); -+ ret = __bch2_topology_error(c, &out); - break; -- default: -- BUG(); - } -+print: -+ bch2_print_str(c, KERN_ERR, out.buf); - out: - fsck_err: - printbuf_exit(&out); -@@ -611,8 +655,9 @@ static int __btree_err(int ret, - - #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ - ({ \ -- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ -+ int _ret = __btree_err(type, c, ca, b, i, k, write, \ - BCH_FSCK_ERR_##_err_type, \ -+ failed, err_msg, \ - msg, ##__VA_ARGS__); \ - \ - if (_ret != -BCH_ERR_fsck_fix) { \ -@@ -620,7 +665,7 @@ static int __btree_err(int ret, - goto fsck_err; \ - } \ - \ -- *saw_error = true; \ -+ true; \ - }) - - #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) -@@ -678,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) - - static int validate_bset(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, -- unsigned offset, unsigned sectors, -- int write, bool have_retry, bool *saw_error) -+ unsigned offset, unsigned sectors, int write, -+ struct bch_io_failures *failed, -+ struct printbuf *err_msg) - { - unsigned version = le16_to_cpu(i->version); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); -@@ -816,7 +862,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, - -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, NULL, - btree_node_bad_format, -- "invalid bkey format: %s\n %s", buf1.buf, -+ "invalid bkey format: %s\n%s", buf1.buf, - (printbuf_reset(&buf2), - bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); - printbuf_reset(&buf1); -@@ -892,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b, - - static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, -- bool have_retry, bool *saw_error) -+ struct bch_io_failures *failed, -+ struct printbuf *err_msg) - { - unsigned version = le16_to_cpu(i->version); - struct bkey_packed *k, *prev = NULL; -@@ -1005,7 +1052,9 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, - } - - int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -- struct btree *b, bool have_retry, bool *saw_error) -+ struct btree *b, -+ struct bch_io_failures *failed, -+ struct printbuf *err_msg) - { - struct btree_node_entry *bne; - struct sort_iter *iter; -@@ -1015,11 +1064,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - bool used_mempool, blacklisted; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); -- unsigned u64s; - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - u64 max_journal_seq = 0; - struct printbuf buf = PRINTBUF; -- int ret = 0, retry_read = 0, write = READ; -+ int ret = 0, write = READ; - u64 start_time = local_clock(); - - b->version_ondisk = U16_MAX; -@@ -1153,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - b->version_ondisk = min(b->version_ondisk, - le16_to_cpu(i->version)); - -- ret = validate_bset(c, ca, b, i, b->written, sectors, -- READ, have_retry, saw_error); -+ ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); - if (ret) - goto fsck_err; - - if (!b->written) - btree_node_set_format(b, b->data->format); - -- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); -+ ret = validate_bset_keys(c, b, i, READ, failed, err_msg); - if (ret) - goto fsck_err; - -@@ -1222,23 +1269,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); - sorted->keys.u64s = 0; - -- set_btree_bset(b, b->set, &b->data->keys); -- - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); - memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, - btree_buf_bytes(b) - - sizeof(struct btree_node) - - b->nr.live_u64s * sizeof(u64)); - -- u64s = le16_to_cpu(sorted->keys.u64s); -+ b->data->keys.u64s = sorted->keys.u64s; - *sorted = *b->data; -- sorted->keys.u64s = cpu_to_le16(u64s); - swap(sorted, b->data); - set_btree_bset(b, b->set, &b->data->keys); - b->nsets = 1; - b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - -- BUG_ON(b->nr.live_u64s != u64s); -+ BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); - - btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); - -@@ -1252,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - - ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); - if (ret == -BCH_ERR_fsck_delete_bkey || -- (bch2_inject_invalid_keys && -+ (static_branch_unlikely(&bch2_inject_invalid_keys) && - !bversion_cmp(u.k->bversion, MAX_VERSION))) { - btree_keys_account_key_drop(&b->nr, 0, k); - -@@ -1292,20 +1336,11 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - - if (!ptr_written) - set_btree_node_need_rewrite(b); --out: -+fsck_err: - mempool_free(iter, &c->fill_iter); - printbuf_exit(&buf); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); -- return retry_read; --fsck_err: -- if (ret == -BCH_ERR_btree_node_read_err_want_retry || -- ret == -BCH_ERR_btree_node_read_err_must_retry) { -- retry_read = 1; -- } else { -- set_btree_node_read_error(b); -- bch2_btree_lost_data(c, b->c.btree_id); -- } -- goto out; -+ return ret; - } - - static void btree_node_read_work(struct work_struct *work) -@@ -1317,17 +1352,28 @@ static void btree_node_read_work(struct work_struct *work) - struct btree *b = rb->b; - struct bio *bio = &rb->bio; - struct bch_io_failures failed = { .nr = 0 }; -+ int ret = 0; -+ - struct printbuf buf = PRINTBUF; -- bool saw_error = false; -- bool retry = false; -- bool can_retry; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "btree node read error at btree "); -+ bch2_btree_pos_to_text(&buf, c, b); -+ prt_newline(&buf); - - goto start; - while (1) { -- retry = true; -- bch_info(c, "retrying read"); -- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); -+ ret = bch2_bkey_pick_read_device(c, -+ bkey_i_to_s_c(&b->key), -+ &failed, &rb->pick, -1); -+ if (ret) { -+ set_btree_node_read_error(b); -+ break; -+ } -+ -+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); - rb->have_ioref = ca != NULL; -+ rb->start_time = local_clock(); - bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = rb->pick.ptr.offset; - bio->bi_iter.bi_size = btree_buf_bytes(b); -@@ -1338,60 +1384,66 @@ static void btree_node_read_work(struct work_struct *work) - } else { - bio->bi_status = BLK_STS_REMOVED; - } -+ -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, -+ rb->start_time, !bio->bi_status); - start: -- printbuf_reset(&buf); -- bch2_btree_pos_to_text(&buf, c, b); -- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, -- "btree read error %s for %s", -- bch2_blk_status_to_str(bio->bi_status), buf.buf); - if (rb->have_ioref) -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read); - rb->have_ioref = false; - -- bch2_mark_io_failure(&failed, &rb->pick); -- -- can_retry = bch2_bkey_pick_read_device(c, -- bkey_i_to_s_c(&b->key), -- &failed, &rb->pick) > 0; -- -- if (!bio->bi_status && -- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { -- if (retry) -- bch_info(c, "retry success"); -- break; -+ if (bio->bi_status) { -+ bch2_mark_io_failure(&failed, &rb->pick, false); -+ continue; - } - -- saw_error = true; -+ ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); -+ if (ret == -BCH_ERR_btree_node_read_err_want_retry || -+ ret == -BCH_ERR_btree_node_read_err_must_retry) -+ continue; - -- if (!can_retry) { -+ if (ret) - set_btree_node_read_error(b); -- bch2_btree_lost_data(c, b->c.btree_id); -- break; -- } -+ -+ break; - } - -- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], -- rb->start_time); -- bio_put(&rb->bio); -+ bch2_io_failures_to_text(&buf, c, &failed); - -- if ((saw_error || -+ if (btree_node_read_error(b)) -+ bch2_btree_lost_data(c, &buf, b->c.btree_id); -+ -+ /* -+ * only print retry success if we read from a replica with no errors -+ */ -+ if (btree_node_read_error(b)) -+ prt_printf(&buf, "ret %s", bch2_err_str(ret)); -+ else if (failed.nr) { -+ if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev)) -+ prt_printf(&buf, "retry success"); -+ else -+ prt_printf(&buf, "repair success"); -+ } -+ -+ if ((failed.nr || - btree_node_need_rewrite(b)) && - !btree_node_read_error(b) && -- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { -- if (saw_error) { -- printbuf_reset(&buf); -- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); -- prt_str(&buf, " "); -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -- bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", -- __func__, buf.buf); -- } -- -+ c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { -+ prt_printf(&buf, " (rewriting node)"); - bch2_btree_node_rewrite_async(c, b); - } -+ prt_newline(&buf); - -+ if (failed.nr) -+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); -+ -+ async_object_list_del(c, btree_read_bio, rb->list_idx); -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], -+ rb->start_time); -+ bio_put(&rb->bio); - printbuf_exit(&buf); - clear_btree_node_read_in_flight(b); -+ smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - } - -@@ -1400,16 +1452,20 @@ static void btree_node_read_endio(struct bio *bio) - struct btree_read_bio *rb = - container_of(bio, struct btree_read_bio, bio); - struct bch_fs *c = rb->c; -+ struct bch_dev *ca = rb->have_ioref -+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - -- if (rb->have_ioref) { -- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); -- -- bch2_latency_acct(ca, rb->start_time, READ); -- } -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, -+ rb->start_time, !bio->bi_status); - - queue_work(c->btree_read_complete_wq, &rb->work); - } - -+void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio) -+{ -+ bch2_bio_to_text(out, &rbio->bio); -+} -+ - struct btree_node_read_all { - struct closure cl; - struct bch_fs *c; -@@ -1469,12 +1525,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) - struct btree *b = ra->b; - struct printbuf buf = PRINTBUF; - bool dump_bset_maps = false; -- bool have_retry = false; - int ret = 0, best = -1, write = READ; - unsigned i, written = 0, written2 = 0; - __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; - bool _saw_error = false, *saw_error = &_saw_error; -+ struct printbuf *err_msg = NULL; -+ struct bch_io_failures *failed = NULL; - - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; -@@ -1567,14 +1624,19 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) - - if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); -- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); -+ ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); - } else { - ret = -1; - } - - if (ret) { - set_btree_node_read_error(b); -- bch2_btree_lost_data(c, b->c.btree_id); -+ -+ struct printbuf buf = PRINTBUF; -+ bch2_btree_lost_data(c, &buf, b->c.btree_id); -+ if (buf.pos) -+ bch_err(c, "%s", buf.buf); -+ printbuf_exit(&buf); - } else if (*saw_error) - bch2_btree_node_rewrite_async(c, b); - -@@ -1588,6 +1650,7 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) - printbuf_exit(&buf); - - clear_btree_node_read_in_flight(b); -+ smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - } - -@@ -1602,6 +1665,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_btree_node_read_all_replicas); - } - - ra->err[rb->idx] = bio->bi_status; -@@ -1641,7 +1706,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool - - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { -- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, -+ BCH_DEV_READ_REF_btree_node_read_all_replicas); - struct btree_read_bio *rb = - container_of(ra->bio[i], struct btree_read_bio, bio); - rb->c = c; -@@ -1692,33 +1758,42 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, - - trace_and_count(c, btree_node_read, trans, b); - -- if (bch2_verify_all_btree_replicas && -+ if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && - !btree_node_read_all_replicas(c, b, sync)) - return; - - ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -- NULL, &pick); -+ NULL, &pick, -1); - - if (ret <= 0) { -+ bool ratelimit = true; - struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); - - prt_str(&buf, "btree node read error: no device to read from\n at "); - bch2_btree_pos_to_text(&buf, c, b); -- bch_err_ratelimited(c, "%s", buf.buf); -- -- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && -- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) -- bch2_fatal_error(c); -+ prt_newline(&buf); -+ bch2_btree_lost_data(c, &buf, b->c.btree_id); -+ -+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && -+ bch2_fs_emergency_read_only2(c, &buf)) -+ ratelimit = false; -+ -+ static DEFINE_RATELIMIT_STATE(rs, -+ DEFAULT_RATELIMIT_INTERVAL, -+ DEFAULT_RATELIMIT_BURST); -+ if (!ratelimit || __ratelimit(&rs)) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); - - set_btree_node_read_error(b); -- bch2_btree_lost_data(c, b->c.btree_id); - clear_btree_node_read_in_flight(b); -+ smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -- printbuf_exit(&buf); - return; - } - -- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); -+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); - - bio = bio_alloc_bioset(NULL, - buf_pages(b->data, btree_buf_bytes(b)), -@@ -1737,6 +1812,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, - bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_buf_bytes(b)); - -+ async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); -+ - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(bio)); -@@ -1811,6 +1888,192 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); - } - -+struct btree_node_scrub { -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ void *buf; -+ bool used_mempool; -+ unsigned written; -+ -+ enum btree_id btree; -+ unsigned level; -+ struct bkey_buf key; -+ __le64 seq; -+ -+ struct work_struct work; -+ struct bio bio; -+}; -+ -+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, -+ struct printbuf *err) -+{ -+ unsigned written = 0; -+ -+ if (le64_to_cpu(data->magic) != bset_magic(c)) { -+ prt_printf(err, "bad magic: want %llx, got %llx", -+ bset_magic(c), le64_to_cpu(data->magic)); -+ return false; -+ } -+ -+ while (written < (ptr_written ?: btree_sectors(c))) { -+ struct btree_node_entry *bne; -+ struct bset *i; -+ bool first = !written; -+ -+ if (first) { -+ bne = NULL; -+ i = &data->keys; -+ } else { -+ bne = (void *) data + (written << 9); -+ i = &bne->keys; -+ -+ if (!ptr_written && i->seq != data->keys.seq) -+ break; -+ } -+ -+ struct nonce nonce = btree_nonce(i, written << 9); -+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); -+ -+ if (first) { -+ if (good_csum_type) { -+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); -+ if (bch2_crc_cmp(data->csum, csum)) { -+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); -+ return false; -+ } -+ } -+ -+ written += vstruct_sectors(data, c->block_bits); -+ } else { -+ if (good_csum_type) { -+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ if (bch2_crc_cmp(bne->csum, csum)) { -+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); -+ return false; -+ } -+ } -+ -+ written += vstruct_sectors(bne, c->block_bits); -+ } -+ } -+ -+ return true; -+} -+ -+static void btree_node_scrub_work(struct work_struct *work) -+{ -+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); -+ struct bch_fs *c = scrub->c; -+ struct printbuf err = PRINTBUF; -+ -+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, -+ bkey_i_to_s_c(scrub->key.k)); -+ prt_newline(&err); -+ -+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { -+ struct btree_trans *trans = bch2_trans_get(c); -+ -+ struct btree_iter iter; -+ bch2_trans_node_iter_init(trans, &iter, scrub->btree, -+ scrub->key.k->k.p, 0, scrub->level - 1, 0); -+ -+ struct btree *b; -+ int ret = lockrestart_do(trans, -+ PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); -+ if (ret) -+ goto err; -+ -+ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { -+ bch_err(c, "error validating btree node during scrub on %s at btree %s", -+ scrub->ca->name, err.buf); -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_begin(trans); -+ bch2_trans_put(trans); -+ } -+ -+ printbuf_exit(&err); -+ bch2_bkey_buf_exit(&scrub->key, c);; -+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); -+ enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); -+ kfree(scrub); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); -+} -+ -+static void btree_node_scrub_endio(struct bio *bio) -+{ -+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); -+ -+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work); -+} -+ -+int bch2_btree_node_scrub(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c k, unsigned dev) -+{ -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) -+ return 0; -+ -+ struct bch_fs *c = trans->c; -+ -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) -+ return -BCH_ERR_erofs_no_writes; -+ -+ struct extent_ptr_decoded pick; -+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); -+ if (ret <= 0) -+ goto err; -+ -+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, -+ BCH_DEV_READ_REF_btree_node_scrub); -+ if (!ca) { -+ ret = -BCH_ERR_device_offline; -+ goto err; -+ } -+ -+ bool used_mempool = false; -+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); -+ -+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size); -+ -+ struct btree_node_scrub *scrub = -+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); -+ if (!scrub) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ -+ scrub->c = c; -+ scrub->ca = ca; -+ scrub->buf = buf; -+ scrub->used_mempool = used_mempool; -+ scrub->written = btree_ptr_sectors_written(k); -+ -+ scrub->btree = btree; -+ scrub->level = level; -+ bch2_bkey_buf_init(&scrub->key); -+ bch2_bkey_buf_reassemble(&scrub->key, c, k); -+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; -+ -+ INIT_WORK(&scrub->work, btree_node_scrub_work); -+ -+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); -+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); -+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset; -+ scrub->bio.bi_end_io = btree_node_scrub_endio; -+ submit_bio(&scrub->bio); -+ return 0; -+err_free: -+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); -+err: -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); -+ return ret; -+} -+ - static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - struct btree_write *w) - { -@@ -1831,7 +2094,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - bch2_journal_pin_drop(&c->journal, &w->journal); - } - --static void __btree_node_write_done(struct bch_fs *c, struct btree *b) -+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) - { - struct btree_write *w = btree_prev_write(b); - unsigned long old, new; -@@ -1839,6 +2102,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) - - bch2_btree_complete_write(c, b, w); - -+ if (start_time) -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); -+ - old = READ_ONCE(b->flags); - do { - new = old; -@@ -1865,11 +2131,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) - - if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); -- else -+ else { -+ smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+ } - } - --static void btree_node_write_done(struct bch_fs *c, struct btree *b) -+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) - { - struct btree_trans *trans = bch2_trans_get(c); - -@@ -1877,7 +2145,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) - - /* we don't need transaction context anymore after we got the lock. */ - bch2_trans_put(trans); -- __btree_node_write_done(c, b); -+ __btree_node_write_done(c, b, start_time); - six_unlock_read(&b->c.lock); - } - -@@ -1887,6 +2155,7 @@ static void btree_node_write_work(struct work_struct *work) - container_of(work, struct btree_write_bio, work); - struct bch_fs *c = wbio->wbio.c; - struct btree *b = wbio->wbio.bio.bi_private; -+ u64 start_time = wbio->start_time; - int ret = 0; - - btree_bounce_free(c, -@@ -1918,13 +2187,20 @@ static void btree_node_write_work(struct work_struct *work) - goto err; - } - out: -+ async_object_list_del(c, btree_write_bio, wbio->list_idx); - bio_put(&wbio->wbio.bio); -- btree_node_write_done(c, b); -+ btree_node_write_done(c, b, start_time); - return; - err: - set_btree_node_noevict(b); -- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, -- "writing btree node: %s", bch2_err_str(ret)); -+ -+ if (!bch2_err_matches(ret, EROFS)) { -+ struct printbuf buf = PRINTBUF; -+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); -+ bch2_btree_pos_to_text(&buf, c, b); -+ bch2_fs_fatal_error(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } - goto out; - } - -@@ -1937,23 +2213,34 @@ static void btree_node_write_endio(struct bio *bio) - struct bch_fs *c = wbio->c; - struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; -- unsigned long flags; - -- if (wbio->have_ioref) -- bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, -+ wbio->submit_time, !bio->bi_status); -+ -+ if (ca && bio->bi_status) { -+ struct printbuf buf = PRINTBUF; -+ buf.atomic++; -+ prt_printf(&buf, "btree write error: %s\n ", -+ bch2_blk_status_to_str(bio->bi_status)); -+ bch2_btree_pos_to_text(&buf, c, b); -+ bch_err_dev_ratelimited(ca, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } - -- if (!ca || -- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, -- "btree write error: %s", -- bch2_blk_status_to_str(bio->bi_status)) || -- bch2_meta_write_fault("btree")) { -+ if (bio->bi_status) { -+ unsigned long flags; - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, wbio->dev); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - } - -+ /* -+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed -+ * btree writes yet (due to device removal/ro): -+ */ - if (wbio->have_ioref) -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_btree_node_write); - - if (parent) { - bio_put(bio); -@@ -1962,16 +2249,15 @@ static void btree_node_write_endio(struct bio *bio) - } - - clear_btree_node_write_in_flight_inner(b); -+ smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); - INIT_WORK(&wb->work, btree_node_write_work); -- queue_work(c->btree_io_complete_wq, &wb->work); -+ queue_work(c->btree_write_complete_wq, &wb->work); - } - - static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors) - { -- bool saw_error; -- - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, -@@ -1984,8 +2270,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - return ret; - } - -- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: -- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); -+ ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: -+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); - if (ret) { - bch2_inconsistent_error(c); - dump_stack(); -@@ -2023,6 +2309,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - bool validate_before_checksum = false; - enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; - void *data; -+ u64 start_time = local_clock(); - int ret; - - if (flags & BTREE_WRITE_ALREADY_STARTED) -@@ -2231,6 +2518,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - wbio->data = data; - wbio->data_bytes = bytes; - wbio->sector_offset = b->written; -+ wbio->start_time = start_time; - wbio->wbio.c = c; - wbio->wbio.used_mempool = used_mempool; - wbio->wbio.first_btree_write = !b->written; -@@ -2250,6 +2538,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - atomic64_inc(&c->btree_write_stats[type].nr); - atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); - -+ async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); -+ - INIT_WORK(&wbio->work, btree_write_submit); - queue_work(c->btree_write_submit_wq, &wbio->work); - return; -@@ -2258,7 +2548,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - b->written += sectors_to_write; - nowrite: - btree_bounce_free(c, bytes, used_mempool, data); -- __btree_node_write_done(c, b); -+ __btree_node_write_done(c, b, 0); - } - - /* -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -index 6f9e4a6dacf7..30a5180532c8 100644 ---- a/fs/bcachefs/btree_io.h -+++ b/fs/bcachefs/btree_io.h -@@ -41,6 +41,9 @@ struct btree_read_bio { - u64 start_time; - unsigned have_ioref:1; - unsigned idx:7; -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ unsigned list_idx; -+#endif - struct extent_ptr_decoded pick; - struct work_struct work; - struct bio bio; -@@ -52,6 +55,10 @@ struct btree_write_bio { - void *data; - unsigned data_bytes; - unsigned sector_offset; -+ u64 start_time; -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ unsigned list_idx; -+#endif - struct bch_write_bio wbio; - }; - -@@ -127,11 +134,18 @@ void bch2_btree_build_aux_trees(struct btree *); - void bch2_btree_init_next(struct btree_trans *, struct btree *); - - int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, -- struct btree *, bool, bool *); -+ struct btree *, -+ struct bch_io_failures *, -+ struct printbuf *); - void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); - int bch2_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); - -+void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); -+ -+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, unsigned); -+ - bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); - - enum btree_write_flags { -diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -index e32fce4fd258..0f0b80c8c29a 100644 ---- a/fs/bcachefs/btree_iter.c -+++ b/fs/bcachefs/btree_iter.c -@@ -16,6 +16,7 @@ - #include "journal_io.h" - #include "replicas.h" - #include "snapshot.h" -+#include "super.h" - #include "trace.h" - - #include -@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, - !btree_path_pos_after_node(path, b); - } - --/* Btree iterator: */ -+/* Debug: */ - --#ifdef CONFIG_BCACHEFS_DEBUG -- --static void bch2_btree_path_verify_cached(struct btree_trans *trans, -+static void __bch2_btree_path_verify_cached(struct btree_trans *trans, - struct btree_path *path) - { - struct bkey_cached *ck; -@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, - btree_node_unlock(trans, path, 0); - } - --static void bch2_btree_path_verify_level(struct btree_trans *trans, -+static void __bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned level) - { - struct btree_path_level *l; -@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, - struct printbuf buf3 = PRINTBUF; - const char *msg; - -- if (!bch2_debug_check_iterators) -- return; -- - l = &path->l[level]; - tmp = l->iter; - locked = btree_node_locked(path, level); - - if (path->cached) { - if (!level) -- bch2_btree_path_verify_cached(trans, path); -+ __bch2_btree_path_verify_cached(trans, path); - return; - } - -@@ -217,7 +213,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, - msg, level, buf1.buf, buf2.buf, buf3.buf); - } - --static void bch2_btree_path_verify(struct btree_trans *trans, -+static void __bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) - { - struct bch_fs *c = trans->c; -@@ -229,25 +225,23 @@ static void bch2_btree_path_verify(struct btree_trans *trans, - break; - } - -- bch2_btree_path_verify_level(trans, path, i); -+ __bch2_btree_path_verify_level(trans, path, i); - } - - bch2_btree_path_verify_locks(path); - } - --void bch2_trans_verify_paths(struct btree_trans *trans) -+void __bch2_trans_verify_paths(struct btree_trans *trans) - { - struct btree_path *path; - unsigned iter; - - trans_for_each_path(trans, path, iter) -- bch2_btree_path_verify(trans, path); -+ __bch2_btree_path_verify(trans, path); - } - --static void bch2_btree_iter_verify(struct btree_iter *iter) -+static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) - { -- struct btree_trans *trans = iter->trans; -- - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - - BUG_ON((iter->flags & BTREE_ITER_is_extents) && -@@ -258,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) - !btree_type_has_snapshot_field(iter->btree_id)); - - if (iter->update_path) -- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); -- bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); -+ __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); -+ __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); - } - --static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) -+static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) - { - BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && - !iter->pos.snapshot); -@@ -276,16 +270,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) - bkey_gt(iter->pos, iter->k.p))); - } - --static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) -+static int __bch2_btree_iter_verify_ret(struct btree_trans *trans, -+ struct btree_iter *iter, struct bkey_s_c k) - { -- struct btree_trans *trans = iter->trans; - struct btree_iter copy; - struct bkey_s_c prev; - int ret = 0; - -- if (!bch2_debug_check_iterators) -- return 0; -- - if (!(iter->flags & BTREE_ITER_filter_snapshots)) - return 0; - -@@ -299,7 +290,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k - bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, - BTREE_ITER_nopreserve| - BTREE_ITER_all_snapshots); -- prev = bch2_btree_iter_prev(©); -+ prev = bch2_btree_iter_prev(trans, ©); - if (!prev.k) - goto out; - -@@ -326,7 +317,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k - return ret; - } - --void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, -+void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) - { - bch2_trans_verify_not_unlocked_or_in_restart(trans); -@@ -359,17 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); - } - --#else -- - static inline void bch2_btree_path_verify_level(struct btree_trans *trans, -- struct btree_path *path, unsigned l) {} -+ struct btree_path *path, unsigned l) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) -+ __bch2_btree_path_verify_level(trans, path, l); -+} -+ - static inline void bch2_btree_path_verify(struct btree_trans *trans, -- struct btree_path *path) {} --static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} --static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} --static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } -+ struct btree_path *path) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) -+ __bch2_btree_path_verify(trans, path); -+} - --#endif -+static inline void bch2_btree_iter_verify(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) -+ __bch2_btree_iter_verify(trans, iter); -+} -+ -+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) -+ __bch2_btree_iter_verify_entry_exit(iter); -+} -+ -+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ return static_branch_unlikely(&bch2_debug_check_iterators) -+ ? __bch2_btree_iter_verify_ret(trans, iter, k) -+ : 0; -+} - - /* Btree path: fixups after btree updates */ - -@@ -523,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, - __bch2_btree_node_iter_fix(path, b, node_iter, t, - where, clobber_u64s, new_u64s); - -- if (bch2_debug_check_iterators) -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) - bch2_btree_node_iter_verify(node_iter, b); - } - -@@ -562,20 +576,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, - bch2_btree_node_iter_peek_all(&l->iter, l->b)); - } - --static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, -- struct btree_path *path, -- struct btree_path_level *l, -- struct bkey *u) --{ -- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, -- bch2_btree_node_iter_peek(&l->iter, l->b)); -- -- path->pos = k.k ? k.k->p : l->b->key.k.p; -- trans->paths_sorted = false; -- bch2_btree_path_verify_level(trans, path, l - path->l); -- return k; --} -- - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, -@@ -1176,7 +1176,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, - } - - if (path->cached) { -- ret = bch2_btree_path_traverse_cached(trans, path, flags); -+ ret = bch2_btree_path_traverse_cached(trans, path_idx, flags); - goto out; - } - -@@ -1499,24 +1499,16 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) - prt_newline(buf); - } - -- for (struct jset_entry *e = trans->journal_entries; -+ for (struct jset_entry *e = btree_trans_journal_entries_start(trans); - e != btree_trans_journal_entries_top(trans); -- e = vstruct_next(e)) -+ e = vstruct_next(e)) { - bch2_journal_entry_to_text(buf, trans->c, e); -+ prt_newline(buf); -+ } - - printbuf_indent_sub(buf, 2); - } - --noinline __cold --void bch2_dump_trans_updates(struct btree_trans *trans) --{ -- struct printbuf buf = PRINTBUF; -- -- bch2_trans_updates_to_text(&buf, trans); -- bch2_print_str(trans->c, buf.buf); -- printbuf_exit(&buf); --} -- - static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) - { - struct btree_path *path = trans->paths + path_idx; -@@ -1613,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) - __bch2_trans_paths_to_text(&buf, trans, nosort); - bch2_trans_updates_to_text(&buf, trans); - -- bch2_print_str(trans->c, buf.buf); -+ bch2_print_str(trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - -@@ -1877,10 +1869,8 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * - return (struct bkey_s_c) { u, NULL }; - } - --void bch2_set_btree_iter_dontneed(struct btree_iter *iter) -+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) - { -- struct btree_trans *trans = iter->trans; -- - if (!iter->path || trans->restarted) - return; - -@@ -1892,17 +1882,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter) - /* Btree iterators: */ - - int __must_check --__bch2_btree_iter_traverse(struct btree_iter *iter) -+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) - { -- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); -+ return bch2_btree_path_traverse(trans, iter->path, iter->flags); - } - - int __must_check --bch2_btree_iter_traverse(struct btree_iter *iter) -+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) - { -- struct btree_trans *trans = iter->trans; -- int ret; -- - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - iter->path = bch2_btree_path_set_pos(trans, iter->path, -@@ -1910,7 +1897,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - -- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); -+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - return ret; - -@@ -1922,14 +1909,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) - - /* Iterate across nodes (leaf and interior nodes) */ - --struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) -+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, -+ struct btree_iter *iter) - { -- struct btree_trans *trans = iter->trans; - struct btree *b = NULL; - int ret; - - EBUG_ON(trans->paths[iter->path].cached); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) -@@ -1951,7 +1938,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); - out: - bch2_btree_iter_verify_entry_exit(iter); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - - return b; - err: -@@ -1960,26 +1947,26 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) - } - - /* Only kept for -tools */ --struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) -+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, -+ struct btree_iter *iter) - { - struct btree *b; - -- while (b = bch2_btree_iter_peek_node(iter), -+ while (b = bch2_btree_iter_peek_node(trans, iter), - bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) -- bch2_trans_begin(iter->trans); -+ bch2_trans_begin(trans); - - return b; - } - --struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) -+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) - { -- struct btree_trans *trans = iter->trans; - struct btree *b = NULL; - int ret; - - EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_unlocked_or_in_restart(trans); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) -@@ -1998,6 +1985,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) - return NULL; - } - -+ /* -+ * We don't correctly handle nodes with extra intent locks here: -+ * downgrade so we don't violate locking invariants -+ */ -+ bch2_btree_path_downgrade(trans, path); -+ - if (!bch2_btree_node_relock(trans, path, path->level + 1)) { - __bch2_btree_path_unlock(trans, path); - path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); -@@ -2046,7 +2039,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) - EBUG_ON(btree_iter_path(trans, iter)->uptodate); - out: - bch2_btree_iter_verify_entry_exit(iter); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - - return b; - err: -@@ -2056,7 +2049,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) - - /* Iterate across keys (in leaf nodes only) */ - --inline bool bch2_btree_iter_advance(struct btree_iter *iter) -+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) - { - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_all_snapshots -@@ -2065,11 +2058,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) - - if (ret && !(iter->flags & BTREE_ITER_is_extents)) - pos = bkey_successor(iter, pos); -- bch2_btree_iter_set_pos(iter, pos); -+ bch2_btree_iter_set_pos(trans, iter, pos); - return ret; - } - --inline bool bch2_btree_iter_rewind(struct btree_iter *iter) -+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) - { - struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_all_snapshots -@@ -2078,7 +2071,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) - - if (ret && !(iter->flags & BTREE_ITER_is_extents)) - pos = bkey_predecessor(iter, pos); -- bch2_btree_iter_set_pos(iter, pos); -+ bch2_btree_iter_set_pos(trans, iter, pos); - return ret; - } - -@@ -2205,9 +2198,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, - * bkey_s_c_null: - */ - static noinline --struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) -+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos pos) - { -- struct btree_trans *trans = iter->trans; - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k; -@@ -2253,14 +2246,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos - return k; - } - --static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) -+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos search_key) - { -- struct btree_trans *trans = iter->trans; - struct bkey_s_c k, k2; - int ret; - - EBUG_ON(btree_iter_path(trans, iter)->cached); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, -@@ -2270,7 +2263,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ -- bch2_btree_iter_set_pos(iter, iter->pos); -+ bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - break; - } -@@ -2280,7 +2273,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ -- bch2_btree_iter_set_pos(iter, SPOS_MAX); -+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } -@@ -2291,10 +2284,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - k.k && -- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { -+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { - k = k2; - if (bkey_err(k)) { -- bch2_btree_iter_set_pos(iter, iter->pos); -+ bch2_btree_iter_set_pos(trans, iter, iter->pos); - break; - } - } -@@ -2327,27 +2320,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp - search_key = bpos_successor(l->b->key.k.p); - } else { - /* End of btree: */ -- bch2_btree_iter_set_pos(iter, SPOS_MAX); -+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } - } - -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - return k; - } - - /** - * bch2_btree_iter_peek_max() - returns first key greater than or equal to - * iterator's current position -+ * @trans: btree transaction object - * @iter: iterator to peek from - * @end: search limit: returns keys less than or equal to @end - * - * Returns: key if found, or an error extractable with bkey_err(). - */ --struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) -+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end) - { -- struct btree_trans *trans = iter->trans; - struct bpos search_key = btree_iter_search_key(iter); - struct bkey_s_c k; - struct bpos iter_pos = iter->pos; -@@ -2370,7 +2364,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en - } - - while (1) { -- k = __bch2_btree_iter_peek(iter, search_key); -+ k = __bch2_btree_iter_peek(trans, iter, search_key); - if (unlikely(!k.k)) - goto end; - if (unlikely(bkey_err(k))) -@@ -2484,9 +2478,9 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en - if (!(iter->flags & BTREE_ITER_all_snapshots)) - iter->pos.snapshot = iter->snapshot; - -- ret = bch2_btree_iter_verify_ret(iter, k); -+ ret = bch2_btree_iter_verify_ret(trans, iter, k); - if (unlikely(ret)) { -- bch2_btree_iter_set_pos(iter, iter->pos); -+ bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - } - -@@ -2494,7 +2488,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en - - return k; - end: -- bch2_btree_iter_set_pos(iter, end); -+ bch2_btree_iter_set_pos(trans, iter, end); - k = bkey_s_c_null; - goto out_no_locked; - } -@@ -2502,24 +2496,25 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en - /** - * bch2_btree_iter_next() - returns first key greater than iterator's current - * position -+ * @trans: btree transaction object - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ --struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) -+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) - { -- if (!bch2_btree_iter_advance(iter)) -+ if (!bch2_btree_iter_advance(trans, iter)) - return bkey_s_c_null; - -- return bch2_btree_iter_peek(iter); -+ return bch2_btree_iter_peek(trans, iter); - } - --static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) -+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos search_key) - { -- struct btree_trans *trans = iter->trans; - struct bkey_s_c k, k2; - -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, -@@ -2529,7 +2524,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru - int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ -- bch2_btree_iter_set_pos(iter, iter->pos); -+ bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - break; - } -@@ -2539,7 +2534,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ -- bch2_btree_iter_set_pos(iter, SPOS_MAX); -+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } -@@ -2555,10 +2550,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - k.k && -- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { -+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { - k = k2; - if (bkey_err(k2)) { -- bch2_btree_iter_set_pos(iter, iter->pos); -+ bch2_btree_iter_set_pos(trans, iter, iter->pos); - break; - } - } -@@ -2579,28 +2574,33 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru - search_key = bpos_predecessor(path->l[0].b->data->min_key); - } else { - /* Start of btree: */ -- bch2_btree_iter_set_pos(iter, POS_MIN); -+ bch2_btree_iter_set_pos(trans, iter, POS_MIN); - k = bkey_s_c_null; - break; - } - } - -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - return k; - } - - /** - * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to - * iterator's current position -+ * @trans: btree transaction object - * @iter: iterator to peek from - * @end: search limit: returns keys greater than or equal to @end - * - * Returns: key if found, or an error extractable with bkey_err(). - */ --struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) -+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end) - { - if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && -- !bkey_eq(iter->pos, POS_MAX)) { -+ !bkey_eq(iter->pos, POS_MAX) && -+ !((iter->flags & BTREE_ITER_is_extents) && -+ iter->pos.offset == U64_MAX)) { -+ - /* - * bkey_start_pos(), for extents, is not monotonically - * increasing until after filtering for snapshots: -@@ -2609,7 +2609,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp - * real visible extents - easiest to just use peek_slot() (which - * internally uses peek() for extents) - */ -- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - if (bkey_err(k)) - return k; - -@@ -2619,14 +2619,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp - return k; - } - -- struct btree_trans *trans = iter->trans; - struct bpos search_key = iter->pos; - struct bkey_s_c k; - btree_path_idx_t saved_path = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify_entry_exit(iter); -- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); -+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode); - - int ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) { -@@ -2635,7 +2634,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp - } - - while (1) { -- k = __bch2_btree_iter_peek_prev(iter, search_key); -+ k = __bch2_btree_iter_peek_prev(trans, iter, search_key); - if (unlikely(!k.k)) - goto end; - if (unlikely(bkey_err(k))) -@@ -2726,10 +2725,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); - - bch2_btree_iter_verify_entry_exit(iter); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - return k; - end: -- bch2_btree_iter_set_pos(iter, end); -+ bch2_btree_iter_set_pos(trans, iter, end); - k = bkey_s_c_null; - goto out_no_locked; - } -@@ -2737,34 +2736,34 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp - /** - * bch2_btree_iter_prev() - returns first key less than iterator's current - * position -+ * @trans: btree transaction object - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ --struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) -+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) - { -- if (!bch2_btree_iter_rewind(iter)) -+ if (!bch2_btree_iter_rewind(trans, iter)) - return bkey_s_c_null; - -- return bch2_btree_iter_peek_prev(iter); -+ return bch2_btree_iter_peek_prev(trans, iter); - } - --struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) - { -- struct btree_trans *trans = iter->trans; - struct bpos search_key; - struct bkey_s_c k; - int ret; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); -- bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify(trans, iter); - bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); -- goto out_no_locked; -+ goto out; - } - - /* extents can't span inode numbers: */ -@@ -2773,7 +2772,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; - -- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); -+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); - } - - search_key = btree_iter_search_key(iter); -@@ -2784,13 +2783,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); -- goto out_no_locked; -+ goto out; - } - - struct btree_path *path = btree_iter_path(trans, iter); - if (unlikely(!btree_path_node(path, path->level))) - return bkey_s_c_null; - -+ btree_path_set_should_be_locked(trans, path); -+ - if ((iter->flags & BTREE_ITER_cached) || - !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { - k = bkey_s_c_null; -@@ -2807,16 +2808,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - goto out; - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && -- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { -+ (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { - if (!bkey_err(k)) - iter->k = *k.k; - /* We're not returning a key from iter->path: */ -- goto out_no_locked; -+ goto out; - } - -- k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); -+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); - if (unlikely(!k.k)) -- goto out_no_locked; -+ goto out; - - if (unlikely(k.k->type == KEY_TYPE_whiteout && - (iter->flags & BTREE_ITER_filter_snapshots) && -@@ -2834,8 +2835,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - if (iter->flags & BTREE_ITER_intent) { - struct btree_iter iter2; - -- bch2_trans_copy_iter(&iter2, iter); -- k = bch2_btree_iter_peek_max(&iter2, end); -+ bch2_trans_copy_iter(trans, &iter2, iter); -+ k = bch2_btree_iter_peek_max(trans, &iter2, end); - - if (k.k && !bkey_err(k)) { - swap(iter->key_cache_path, iter2.key_cache_path); -@@ -2846,15 +2847,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - } else { - struct bpos pos = iter->pos; - -- k = bch2_btree_iter_peek_max(iter, end); -+ k = bch2_btree_iter_peek_max(trans, iter, end); - if (unlikely(bkey_err(k))) -- bch2_btree_iter_set_pos(iter, pos); -+ bch2_btree_iter_set_pos(trans, iter, pos); - else - iter->pos = pos; - } - - if (unlikely(bkey_err(k))) -- goto out_no_locked; -+ goto out; - - next = k.k ? bkey_start_pos(k.k) : POS_MAX; - -@@ -2876,42 +2877,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) - } - } - out: -- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); --out_no_locked: - bch2_btree_iter_verify_entry_exit(iter); -- bch2_btree_iter_verify(iter); -- ret = bch2_btree_iter_verify_ret(iter, k); -+ bch2_btree_iter_verify(trans, iter); -+ ret = bch2_btree_iter_verify_ret(trans, iter, k); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - return k; - } - --struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) - { -- if (!bch2_btree_iter_advance(iter)) -+ if (!bch2_btree_iter_advance(trans, iter)) - return bkey_s_c_null; - -- return bch2_btree_iter_peek_slot(iter); -+ return bch2_btree_iter_peek_slot(trans, iter); - } - --struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) -+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) - { -- if (!bch2_btree_iter_rewind(iter)) -+ if (!bch2_btree_iter_rewind(trans, iter)) - return bkey_s_c_null; - -- return bch2_btree_iter_peek_slot(iter); -+ return bch2_btree_iter_peek_slot(trans, iter); - } - - /* Obsolete, but still used by rust wrapper in -tools */ --struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) -+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) - { - struct bkey_s_c k; - -- while (btree_trans_too_many_iters(iter->trans) || -- (k = bch2_btree_iter_peek_type(iter, iter->flags), -+ while (btree_trans_too_many_iters(trans) || -+ (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) -- bch2_trans_begin(iter->trans); -+ bch2_trans_begin(trans); - - return k; - } -@@ -2944,7 +2943,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) - struct btree_path *path, *prev = NULL; - struct trans_for_each_path_inorder_iter iter; - -- if (!bch2_debug_check_iterators) -+ if (!static_branch_unlikely(&bch2_debug_check_iterators)) - return; - - trans_for_each_path_inorder(trans, path, iter) { -@@ -3057,7 +3056,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) - iter->path = 0; - iter->update_path = 0; - iter->key_cache_path = 0; -- iter->trans = NULL; - } - - void bch2_trans_iter_init_outlined(struct btree_trans *trans, -@@ -3097,10 +3095,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, - BUG_ON(iter->min_depth != depth); - } - --void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) -+void bch2_trans_copy_iter(struct btree_trans *trans, -+ struct btree_iter *dst, struct btree_iter *src) - { -- struct btree_trans *trans = src->trans; -- - *dst = *src; - #ifdef TRACK_PATH_ALLOCATED - dst->ip_allocated = _RET_IP_; -@@ -3112,7 +3109,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) - dst->key_cache_path = 0; - } - --void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, -+ darray_trans_kmalloc_trace *trace) -+{ -+ printbuf_tabstops_reset(out); -+ printbuf_tabstop_push(out, 60); -+ -+ darray_for_each(*trace, i) -+ prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); -+} -+#endif -+ -+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) - { - struct bch_fs *c = trans->c; - unsigned new_top = trans->mem_top + size; -@@ -3122,14 +3131,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) - void *new_mem; - void *p; - -- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); -+ if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ struct printbuf buf = PRINTBUF; -+ bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+#endif -+ } - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (ret) - return ERR_PTR(ret); - - struct btree_transaction_stats *s = btree_trans_stats(trans); -- s->max_mem = max(s->max_mem, new_bytes); -+ if (new_bytes > s->max_mem) { -+ mutex_lock(&s->lock); -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); -+ s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, -+ trans->trans_kmalloc_trace.nr); -+ -+ memcpy(s->trans_kmalloc_trace.data, -+ trans->trans_kmalloc_trace.data, -+ sizeof(s->trans_kmalloc_trace.data[0]) * -+ s->trans_kmalloc_trace.nr); -+#endif -+ s->max_mem = new_bytes; -+ mutex_unlock(&s->lock); -+ } - - if (trans->used_mempool) { - if (trans->mem_bytes >= new_bytes) -@@ -3189,6 +3219,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) - BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); - } - out_change_top: -+ bch2_trans_kmalloc_trace(trans, size, ip); -+ - p = trans->mem + trans->mem_top; - trans->mem_top += size; - memset(p, 0, size); -@@ -3248,7 +3280,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) - - trans->restart_count++; - trans->mem_top = 0; -- trans->journal_entries = NULL; - - trans_for_each_path(trans, path, i) { - path->should_be_locked = false; -@@ -3302,6 +3333,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) - } - #endif - -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ trans->trans_kmalloc_trace.nr = 0; -+#endif -+ - trans_set_locked(trans, false); - - if (trans->restarted) { -@@ -3402,7 +3437,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) - } - - trans->nr_paths_max = s->nr_max_paths; -- trans->journal_entries_size = s->journal_entries_size; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -@@ -3414,29 +3448,45 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) - return trans; - } - --static void check_btree_paths_leaked(struct btree_trans *trans) --{ - #ifdef CONFIG_BCACHEFS_DEBUG -- struct bch_fs *c = trans->c; -+ -+static bool btree_paths_leaked(struct btree_trans *trans) -+{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->ref) -- goto leaked; -- return; --leaked: -- bch_err(c, "btree paths leaked from %s!", trans->fn); -- trans_for_each_path(trans, path, i) -- if (path->ref) -- printk(KERN_ERR " btree %s %pS\n", -- bch2_btree_id_str(path->btree_id), -- (void *) path->ip_allocated); -- /* Be noisy about this: */ -- bch2_fatal_error(c); --#endif -+ return true; -+ return false; - } - -+static void check_btree_paths_leaked(struct btree_trans *trans) -+{ -+ if (btree_paths_leaked(trans)) { -+ struct bch_fs *c = trans->c; -+ struct btree_path *path; -+ unsigned i; -+ -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); -+ trans_for_each_path(trans, path, i) -+ if (path->ref) -+ prt_printf(&buf, "btree %s %pS\n", -+ bch2_btree_id_str(path->btree_id), -+ (void *) path->ip_allocated); -+ -+ bch2_fs_emergency_read_only2(c, &buf); -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ } -+} -+#else -+static inline void check_btree_paths_leaked(struct btree_trans *trans) {} -+#endif -+ - void bch2_trans_put(struct btree_trans *trans) - __releases(&c->btree_trans_barrier) - { -@@ -3471,6 +3521,9 @@ void bch2_trans_put(struct btree_trans *trans) - #ifdef CONFIG_BCACHEFS_DEBUG - darray_exit(&trans->last_restarted_trace); - #endif -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ darray_exit(&trans->trans_kmalloc_trace); -+#endif - - unsigned long *paths_allocated = trans->paths_allocated; - trans->paths_allocated = NULL; -@@ -3625,6 +3678,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) - for (s = c->btree_transaction_stats; - s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) { -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ darray_exit(&s->trans_kmalloc_trace); -+#endif - kfree(s->max_paths_text); - bch2_time_stats_exit(&s->lock_hold_times); - } -diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h -index b96157f3dc9c..cafd35a5e7a3 100644 ---- a/fs/bcachefs/btree_iter.h -+++ b/fs/bcachefs/btree_iter.h -@@ -9,7 +9,6 @@ - void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); - void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); - void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); --void bch2_dump_trans_updates(struct btree_trans *); - void bch2_dump_trans_paths_updates(struct btree_trans *); - - static inline int __bkey_err(const struct bkey *k) -@@ -286,14 +285,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex - : __bch2_trans_mutex_lock(trans, lock); - } - --#ifdef CONFIG_BCACHEFS_DEBUG --void bch2_trans_verify_paths(struct btree_trans *); --void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); --#else --static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} --static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, -- struct bpos pos) {} --#endif -+/* Debug: */ -+ -+void __bch2_trans_verify_paths(struct btree_trans *); -+void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); -+ -+static inline void bch2_trans_verify_paths(struct btree_trans *trans) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) -+ __bch2_trans_verify_paths(trans); -+} -+ -+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, -+ struct bpos pos) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_iterators)) -+ __bch2_assert_pos_locked(trans, btree, pos); -+} - - void bch2_btree_path_fix_key_modified(struct btree_trans *trans, - struct btree *, struct bkey_packed *); -@@ -335,13 +343,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra - } - - __always_inline --static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) -+static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) - { - BUG_ON(err <= 0); - BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); - - trans->restarted = err; - trans->last_restarted_ip = ip; -+ return -err; -+} -+ -+__always_inline -+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) -+{ -+ btree_trans_restart_foreign_task(trans, err, ip); - #ifdef CONFIG_BCACHEFS_DEBUG - darray_exit(&trans->last_restarted_trace); - bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); -@@ -387,36 +402,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct - void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); - void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); - --int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); --int __must_check bch2_btree_iter_traverse(struct btree_iter *); -+int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -+int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); - --struct btree *bch2_btree_iter_peek_node(struct btree_iter *); --struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); --struct btree *bch2_btree_iter_next_node(struct btree_iter *); -+struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); -+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); -+struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); - --struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); --struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); -+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); - --static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, -+ struct btree_iter *iter) - { -- return bch2_btree_iter_peek_max(iter, SPOS_MAX); -+ return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); - } - --struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); -+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); - --static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) - { -- return bch2_btree_iter_peek_prev_min(iter, POS_MIN); -+ return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); - } - --struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); - --struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); --struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); --struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); - --bool bch2_btree_iter_advance(struct btree_iter *); --bool bch2_btree_iter_rewind(struct btree_iter *); -+bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); -+bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); - - static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) - { -@@ -427,10 +443,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo - iter->k.size = 0; - } - --static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -+static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, -+ struct btree_iter *iter, struct bpos new_pos) - { -- struct btree_trans *trans = iter->trans; -- - if (unlikely(iter->update_path)) - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); -@@ -448,13 +463,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it - iter->pos = bkey_start_pos(&iter->k); - } - --static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) -+static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, u32 snapshot) - { - struct bpos pos = iter->pos; - - iter->snapshot = snapshot; - pos.snapshot = snapshot; -- bch2_btree_iter_set_pos(iter, pos); -+ bch2_btree_iter_set_pos(trans, iter, pos); - } - - void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -@@ -496,7 +512,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, - unsigned flags, - unsigned long ip) - { -- iter->trans = trans; - iter->update_path = 0; - iter->key_cache_path = 0; - iter->btree_id = btree_id; -@@ -533,47 +548,77 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, - void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); --void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -+void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); - --void bch2_set_btree_iter_dontneed(struct btree_iter *); -+void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); - --void *__bch2_trans_kmalloc(struct btree_trans *, size_t); -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+void bch2_trans_kmalloc_trace_to_text(struct printbuf *, -+ darray_trans_kmalloc_trace *); -+#endif - --/** -- * bch2_trans_kmalloc - allocate memory for use by the current transaction -- * -- * Must be called after bch2_trans_begin, which on second and further calls -- * frees all memory allocated in this transaction -- */ --static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); -+ -+static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, -+ unsigned long ip) -+{ -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ darray_push(&trans->trans_kmalloc_trace, -+ ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); -+#endif -+} -+ -+static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, -+ unsigned long ip) - { - size = roundup(size, 8); - -+ bch2_trans_kmalloc_trace(trans, size, ip); -+ - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; -- memset(p, 0, size); - return p; - } else { -- return __bch2_trans_kmalloc(trans, size); -+ return __bch2_trans_kmalloc(trans, size, ip); - } - } - --static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) -+static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, -+ unsigned long ip) - { -- size = round_up(size, 8); -+ size = roundup(size, 8); -+ -+ bch2_trans_kmalloc_trace(trans, size, ip); - - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; -+ memset(p, 0, size); - return p; - } else { -- return __bch2_trans_kmalloc(trans, size); -+ return __bch2_trans_kmalloc(trans, size, ip); - } - } - -+/** -+ * bch2_trans_kmalloc - allocate memory for use by the current transaction -+ * -+ * Must be called after bch2_trans_begin, which on second and further calls -+ * frees all memory allocated in this transaction -+ */ -+static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+{ -+ return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); -+} -+ -+static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) -+{ -+ return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); -+} -+ - static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, -@@ -582,7 +627,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, - struct bkey_s_c k; - - bch2_trans_iter_init(trans, iter, btree_id, pos, flags); -- k = bch2_btree_iter_peek_slot(iter); -+ k = bch2_btree_iter_peek_slot(trans, iter); - - if (!bkey_err(k) && type && k.k->type != type) - k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); -@@ -652,14 +697,14 @@ u32 bch2_trans_begin(struct btree_trans *); - int _ret3 = 0; \ - do { \ - _ret3 = lockrestart_do((_trans), ({ \ -- struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ -+ struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ - if (!_b) \ - break; \ - \ - PTR_ERR_OR_ZERO(_b) ?: (_do); \ - })) ?: \ - lockrestart_do((_trans), \ -- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ -+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ - } while (!_ret3); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ -@@ -671,31 +716,34 @@ u32 bch2_trans_begin(struct btree_trans *); - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _do) - --static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, -+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, -+ struct btree_iter *iter, - unsigned flags) - { -- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : -- bch2_btree_iter_peek_prev(iter); -+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : -+ bch2_btree_iter_peek_prev(trans, iter); - } - --static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, -+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, -+ struct btree_iter *iter, - unsigned flags) - { -- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : -- bch2_btree_iter_peek(iter); -+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : -+ bch2_btree_iter_peek(trans, iter); - } - --static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, -- struct bpos end, -- unsigned flags) -+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end, -+ unsigned flags) - { - if (!(flags & BTREE_ITER_slots)) -- return bch2_btree_iter_peek_max(iter, end); -+ return bch2_btree_iter_peek_max(trans, iter, end); - - if (bkey_gt(iter->pos, end)) - return bkey_s_c_null; - -- return bch2_btree_iter_peek_slot(iter); -+ return bch2_btree_iter_peek_slot(trans, iter); - } - - int __bch2_btree_trans_too_many_iters(struct btree_trans *); -@@ -762,14 +810,14 @@ transaction_restart: \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ -- (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ -+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ - _end, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ -- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ -+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -@@ -807,14 +855,14 @@ transaction_restart: \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ -- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ -+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ - (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ -- } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ -+ } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -@@ -844,37 +892,38 @@ transaction_restart: \ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - --struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, -+ struct btree_iter *); - - #define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ -- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ -+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ -- bch2_btree_iter_advance(&(_iter))) -+ bch2_btree_iter_advance(_trans, &(_iter))) - --#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ -+#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ - for (; \ -- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ -+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ -- bch2_btree_iter_advance(&(_iter))) -+ bch2_btree_iter_advance(_trans, &(_iter))) - - #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ - SPOS_MAX, _flags, _k, _ret) - --#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ -- _start, _flags, _k, _ret) \ -- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -- (_start), (_flags)); \ -- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ -- !((_ret) = bkey_err(_k)) && (_k).k; \ -- bch2_btree_iter_rewind(&(_iter))) -+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _ret) \ -+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_rewind(_trans, &(_iter))) - --#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ -- for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) -+#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ -+ for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) - - /* - * This should not be used in a fastpath, without first trying _do in -diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c -index 6d25e3f85ce8..ade3b5addd75 100644 ---- a/fs/bcachefs/btree_journal_iter.c -+++ b/fs/bcachefs/btree_journal_iter.c -@@ -288,7 +288,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - .size = max_t(size_t, keys->size, 8) * 2, - }; - -- new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); -+ new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL); - if (!new_keys.data) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); -@@ -687,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c) - - static void __journal_keys_sort(struct journal_keys *keys) - { -- sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); -+ sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), -+ journal_sort_key_cmp, NULL); - - cond_resched(); - -diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -index edce59433375..741329f1400a 100644 ---- a/fs/bcachefs/btree_key_cache.c -+++ b/fs/bcachefs/btree_key_cache.c -@@ -13,6 +13,7 @@ - #include "trace.h" - - #include -+#include - - static inline bool btree_uses_pcpu_readers(enum btree_id id) - { -@@ -101,8 +102,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu - kmem_cache_free(bch2_key_cache, ck); - } - --static void bkey_cached_free(struct btree_key_cache *bc, -- struct bkey_cached *ck) -+static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, -+ struct bkey_cached *ck) - { - kfree(ck->k); - ck->k = NULL; -@@ -116,6 +117,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, - this_cpu_inc(*bc->nr_pending); - } - -+static void bkey_cached_free(struct btree_trans *trans, -+ struct btree_key_cache *bc, -+ struct bkey_cached *ck) -+{ -+ /* -+ * we'll hit strange issues in the SRCU code if we aren't holding an -+ * SRCU read lock... -+ */ -+ EBUG_ON(!trans->srcu_held); -+ -+ bkey_cached_free_noassert(bc, ck); -+} -+ - static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) - { - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; -@@ -281,16 +295,31 @@ static int btree_key_cache_create(struct btree_trans *trans, - ck_path->uptodate = BTREE_ITER_UPTODATE; - return 0; - err: -- bkey_cached_free(bc, ck); -+ bkey_cached_free(trans, bc, ck); - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - return ret; - } - -+static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, -+ struct btree_path *ck_path, -+ struct bkey_s_c k) -+{ -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bpos_to_text(&buf, ck_path->pos); -+ prt_char(&buf, ' '); -+ bch2_bkey_val_to_text(&buf, trans->c, k); -+ trace_key_cache_fill(trans, buf.buf); -+ printbuf_exit(&buf); -+} -+ - static noinline int btree_key_cache_fill(struct btree_trans *trans, -- struct btree_path *ck_path, -+ btree_path_idx_t ck_path_idx, - unsigned flags) - { -+ struct btree_path *ck_path = trans->paths + ck_path_idx; -+ - if (flags & BTREE_ITER_cached_nofill) { - ck_path->l[0].b = NULL; - return 0; -@@ -306,12 +335,13 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, - BTREE_ITER_key_cache_fill| - BTREE_ITER_cached_nofill); - iter.flags &= ~BTREE_ITER_with_journal; -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* Recheck after btree lookup, before allocating: */ -+ ck_path = trans->paths + ck_path_idx; - ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; - if (unlikely(ret)) - goto out; -@@ -320,28 +350,22 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, - if (ret) - goto err; - -- if (trace_key_cache_fill_enabled()) { -- struct printbuf buf = PRINTBUF; -- -- bch2_bpos_to_text(&buf, ck_path->pos); -- prt_char(&buf, ' '); -- bch2_bkey_val_to_text(&buf, trans->c, k); -- trace_key_cache_fill(trans, buf.buf); -- printbuf_exit(&buf); -- } -+ if (trace_key_cache_fill_enabled()) -+ do_trace_key_cache_fill(trans, ck_path, k); - out: - /* We're not likely to need this iterator again: */ -- bch2_set_btree_iter_dontneed(&iter); -+ bch2_set_btree_iter_dontneed(trans, &iter); - err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } - - static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, -- struct btree_path *path) -+ btree_path_idx_t path_idx) - { - struct bch_fs *c = trans->c; - struct bkey_cached *ck; -+ struct btree_path *path = trans->paths + path_idx; - retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) -@@ -367,27 +391,32 @@ static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, - return 0; - } - --int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, -+int bch2_btree_path_traverse_cached(struct btree_trans *trans, -+ btree_path_idx_t path_idx, - unsigned flags) - { -- EBUG_ON(path->level); -- -- path->l[1].b = NULL; -+ EBUG_ON(trans->paths[path_idx].level); - - int ret; - do { -- ret = btree_path_traverse_cached_fast(trans, path); -+ ret = btree_path_traverse_cached_fast(trans, path_idx); - if (unlikely(ret == -ENOENT)) -- ret = btree_key_cache_fill(trans, path, flags); -+ ret = btree_key_cache_fill(trans, path_idx, flags); - } while (ret == -EEXIST); - -+ struct btree_path *path = trans->paths + path_idx; -+ - if (unlikely(ret)) { - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } -+ } else { -+ BUG_ON(path->uptodate); -+ BUG_ON(!path->nodes_locked); - } -+ - return ret; - } - -@@ -412,7 +441,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, - BTREE_ITER_intent); - b_iter.flags &= ~BTREE_ITER_with_key_cache; - -- ret = bch2_btree_iter_traverse(&c_iter); -+ ret = bch2_btree_iter_traverse(trans, &c_iter); - if (ret) - goto out; - -@@ -444,7 +473,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, - !test_bit(JOURNAL_space_low, &c->journal.flags)) - commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - -- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); -+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); - ret = bkey_err(btree_k); - if (ret) - goto err; -@@ -496,7 +525,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, - - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - if (bkey_cached_evict(&c->btree_key_cache, ck)) { -- bkey_cached_free(&c->btree_key_cache, ck); -+ bkey_cached_free(trans, &c->btree_key_cache, ck); - } else { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); -@@ -610,7 +639,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, - } - - bkey_cached_evict(bc, ck); -- bkey_cached_free(bc, ck); -+ bkey_cached_free(trans, bc, ck); - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - -@@ -678,7 +707,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, - } else if (!bkey_cached_lock_for_evict(ck)) { - bc->skipped_lock_fail++; - } else if (bkey_cached_evict(bc, ck)) { -- bkey_cached_free(bc, ck); -+ bkey_cached_free_noassert(bc, ck); - bc->freed++; - freed++; - } else { -@@ -784,6 +813,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) - { - } - -+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) -+{ -+ struct bch_fs *c = shrink->private_data; -+ struct btree_key_cache *bc = &c->btree_key_cache; -+ char *cbuf; -+ size_t buflen = seq_buf_get_buf(s, &cbuf); -+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); -+ -+ bch2_btree_key_cache_to_text(&out, bc); -+ seq_buf_commit(s, out.pos); -+} -+ - int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) - { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); -@@ -808,6 +849,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) - bc->shrink = shrink; - shrink->count_objects = bch2_btree_key_cache_count; - shrink->scan_objects = bch2_btree_key_cache_scan; -+ shrink->to_text = bch2_btree_key_cache_shrinker_to_text; - shrink->batch = 1 << 14; - shrink->seeks = 0; - shrink->private_data = c; -diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h -index 51d6289b8dee..82d8c72512a9 100644 ---- a/fs/bcachefs/btree_key_cache.h -+++ b/fs/bcachefs/btree_key_cache.h -@@ -40,8 +40,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *, - struct bkey_cached * - bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); - --int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, -- unsigned); -+int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned); - - bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, - struct btree_insert_entry *); -diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c -index caef65adeae4..59a366fdd24c 100644 ---- a/fs/bcachefs/btree_locking.c -+++ b/fs/bcachefs/btree_locking.c -@@ -1,6 +1,7 @@ - // SPDX-License-Identifier: GPL-2.0 - - #include "bcachefs.h" -+#include "btree_cache.h" - #include "btree_locking.h" - #include "btree_types.h" - -@@ -91,10 +92,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) - struct trans_waiting_for_lock *i; - - for (i = g->g; i != g->g + g->nr; i++) { -- struct task_struct *task = i->trans->locking_wait.task; -+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (i != g->g) - prt_str(out, "<- "); -- prt_printf(out, "%u ", task ?task->pid : 0); -+ prt_printf(out, "%u ", task ? task->pid : 0); - } - prt_newline(out); - } -@@ -172,7 +173,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) - { - if (i == g->g) { - trace_would_deadlock(g, i->trans); -- return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); -+ return btree_trans_restart_foreign_task(i->trans, -+ BCH_ERR_transaction_restart_would_deadlock, -+ _THIS_IP_); - } else { - i->trans->lock_must_abort = true; - wake_up_process(i->trans->locking_wait.task); -@@ -234,7 +237,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, - prt_newline(&buf); - } - -- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); -+ bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); - } -@@ -616,22 +619,23 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, - unsigned new_locks_want, - struct get_locks_fail *f) - { -- EBUG_ON(path->locks_want >= new_locks_want); -- -- path->locks_want = new_locks_want; -+ path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); - - bool ret = btree_path_get_locks(trans, path, true, f); - bch2_trans_verify_locks(trans); - return ret; - } - --bool __bch2_btree_path_upgrade(struct btree_trans *trans, -- struct btree_path *path, -- unsigned new_locks_want, -- struct get_locks_fail *f) -+int __bch2_btree_path_upgrade(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned new_locks_want) - { -- bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); -- if (ret) -+ struct get_locks_fail f = {}; -+ unsigned old_locks = path->nodes_locked; -+ unsigned old_locks_want = path->locks_want; -+ int ret = 0; -+ -+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, &f)) - goto out; - - /* -@@ -666,6 +670,28 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, - btree_path_get_locks(trans, linked, true, NULL); - } - } -+ -+ count_event(trans->c, trans_restart_upgrade); -+ if (trace_trans_restart_upgrade_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); -+ prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); -+ bch2_bpos_to_text(&buf, path->pos); -+ prt_printf(&buf, "locks want %u -> %u level %u\n", -+ old_locks_want, new_locks_want, f.l); -+ prt_printf(&buf, "nodes_locked %x -> %x\n", -+ old_locks, path->nodes_locked); -+ prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : -+ !f.b ? "(null)" : "(node)"); -+ prt_printf(&buf, "path seq %u node seq %u\n", -+ IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq, -+ path->l[f.l].lock_seq); -+ -+ trace_trans_restart_upgrade(trans->c, buf.buf); -+ printbuf_exit(&buf); -+ } -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); - out: - bch2_trans_verify_locks(trans); - return ret; -@@ -736,7 +762,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); -- prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); -+ prt_printf(&buf, " %s l=%u seq=%u node seq=", -+ bch2_btree_id_str(path->btree_id), -+ f->l, path->l[f->l].lock_seq); - if (IS_ERR_OR_NULL(f->b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); - } else { -@@ -797,13 +825,6 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) - return __bch2_trans_relock(trans, false); - } - --void bch2_trans_unlock_noassert(struct btree_trans *trans) --{ -- __bch2_trans_unlock(trans); -- -- trans_set_unlocked(trans); --} -- - void bch2_trans_unlock(struct btree_trans *trans) - { - __bch2_trans_unlock(trans); -@@ -840,9 +861,7 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, - - /* Debug */ - --#ifdef CONFIG_BCACHEFS_DEBUG -- --void bch2_btree_path_verify_locks(struct btree_path *path) -+void __bch2_btree_path_verify_locks(struct btree_path *path) - { - /* - * A path may be uptodate and yet have nothing locked if and only if -@@ -883,7 +902,7 @@ static bool bch2_trans_locked(struct btree_trans *trans) - return false; - } - --void bch2_trans_verify_locks(struct btree_trans *trans) -+void __bch2_trans_verify_locks(struct btree_trans *trans) - { - if (!trans->locked) { - BUG_ON(bch2_trans_locked(trans)); -@@ -894,7 +913,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans) - unsigned i; - - trans_for_each_path(trans, path, i) -- bch2_btree_path_verify_locks(path); -+ __bch2_btree_path_verify_locks(path); - } -- --#endif -diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h -index b33ab7af8440..1bb28e21d021 100644 ---- a/fs/bcachefs/btree_locking.h -+++ b/fs/bcachefs/btree_locking.h -@@ -15,7 +15,6 @@ - - void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); - --void bch2_trans_unlock_noassert(struct btree_trans *); - void bch2_trans_unlock_write(struct btree_trans *); - - static inline bool is_btree_node(struct btree_path *path, unsigned l) -@@ -381,27 +380,18 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); - --bool __bch2_btree_path_upgrade(struct btree_trans *, -- struct btree_path *, unsigned, -- struct get_locks_fail *); -+int __bch2_btree_path_upgrade(struct btree_trans *, -+ struct btree_path *, unsigned); - - static inline int bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) - { -- struct get_locks_fail f = {}; -- unsigned old_locks_want = path->locks_want; -- - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - -- if (path->locks_want < new_locks_want -- ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) -- : path->nodes_locked) -- return 0; -- -- trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, -- old_locks_want, new_locks_want, &f); -- return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); -+ return likely(path->locks_want >= new_locks_want && path->nodes_locked) -+ ? 0 -+ : __bch2_btree_path_upgrade(trans, path, new_locks_want); - } - - /* misc: */ -@@ -439,12 +429,19 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, - - int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); - --#ifdef CONFIG_BCACHEFS_DEBUG --void bch2_btree_path_verify_locks(struct btree_path *); --void bch2_trans_verify_locks(struct btree_trans *); --#else --static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} --static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} --#endif -+void __bch2_btree_path_verify_locks(struct btree_path *); -+void __bch2_trans_verify_locks(struct btree_trans *); -+ -+static inline void bch2_btree_path_verify_locks(struct btree_path *path) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_btree_locking)) -+ __bch2_btree_path_verify_locks(path); -+} -+ -+static inline void bch2_trans_verify_locks(struct btree_trans *trans) -+{ -+ if (static_branch_unlikely(&bch2_debug_check_btree_locking)) -+ __bch2_trans_verify_locks(trans); -+} - - #endif /* _BCACHEFS_BTREE_LOCKING_H */ -diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c -index a7f06deee13c..5a97a6b8a757 100644 ---- a/fs/bcachefs/btree_node_scan.c -+++ b/fs/bcachefs/btree_node_scan.c -@@ -13,6 +13,7 @@ - - #include - #include -+#include - #include - - struct find_btree_nodes_worker { -@@ -166,17 +167,23 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, bn, PAGE_SIZE); - -+ u64 submit_time = local_clock(); - submit_bio_wait(bio); -- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, -- "IO error in try_read_btree_node() at %llu: %s", -- offset, bch2_blk_status_to_str(bio->bi_status))) -+ -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); -+ -+ if (bio->bi_status) { -+ bch_err_dev_ratelimited(ca, -+ "IO error in try_read_btree_node() at %llu: %s", -+ offset, bch2_blk_status_to_str(bio->bi_status)); - return; -+ } - - if (le64_to_cpu(bn->magic) != bset_magic(c)) - return; - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { -- if (!c->chacha20) -+ if (!c->chacha20_key_set) - return; - - struct nonce nonce = btree_nonce(&bn->keys, 0); -@@ -264,7 +271,7 @@ static int read_btree_nodes_worker(void *p) - err: - bio_put(bio); - free_page((unsigned long) buf); -- percpu_ref_get(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - closure_put(w->cl); - kfree(w); - return 0; -@@ -278,37 +285,37 @@ static int read_btree_nodes(struct find_btree_nodes *f) - - closure_init_stack(&cl); - -- for_each_online_member(c, ca) { -+ for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { - if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) - continue; - - struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); -- struct task_struct *t; -- - if (!w) { -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - ret = -ENOMEM; - goto err; - } - -- percpu_ref_get(&ca->io_ref); -- closure_get(&cl); - w->cl = &cl; - w->f = f; - w->ca = ca; - -- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); -+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); - ret = PTR_ERR_OR_ZERO(t); - if (ret) { -- percpu_ref_put(&ca->io_ref); -- closure_put(&cl); -- f->ret = ret; -- bch_err(c, "error starting kthread: %i", ret); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); -+ kfree(w); -+ bch_err_msg(c, ret, "starting kthread"); - break; - } -+ -+ closure_get(&cl); -+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); -+ wake_up_process(t); - } - err: -- closure_sync(&cl); -+ while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) -+ ; - return f->ret ?: ret; - } - -@@ -388,10 +395,10 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); -- bch2_print_string_as_lines(KERN_INFO, buf.buf); -+ bch2_print_str(c, KERN_INFO, buf.buf); - } - -- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); -+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); - - dst = 0; - darray_for_each(f->nodes, i) { -@@ -411,13 +418,13 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) - } - f->nodes.nr = dst; - -- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); -+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); -- bch2_print_string_as_lines(KERN_INFO, buf.buf); -+ bch2_print_str(c, KERN_INFO, buf.buf); - } - - swap(nodes_heap, f->nodes); -@@ -463,7 +470,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); -- bch2_print_string_as_lines(KERN_INFO, buf.buf); -+ bch2_print_str(c, KERN_INFO, buf.buf); - } else { - bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); - } -@@ -534,7 +541,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - - struct find_btree_nodes *f = &c->found_btree_nodes; - -- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); -+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - -@@ -572,10 +579,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - - found_btree_node_to_key(&tmp.k, &n); - -- struct printbuf buf = PRINTBUF; -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); -- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); -- printbuf_exit(&buf); -+ if (c->opts.verbose) { -+ struct printbuf buf = PRINTBUF; -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); -+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); -+ printbuf_exit(&buf); -+ } - - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), - (struct bkey_validate_context) { -diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h -index 2811b6857c97..422d49a5c57c 100644 ---- a/fs/bcachefs/btree_node_scan_types.h -+++ b/fs/bcachefs/btree_node_scan_types.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H - #define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H - --#include "darray.h" -+#include - - struct found_btree_node { - bool range_updated:1; -diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c -index c4f524b2ca9a..1c03c965d836 100644 ---- a/fs/bcachefs/btree_trans_commit.c -+++ b/fs/bcachefs/btree_trans_commit.c -@@ -11,6 +11,7 @@ - #include "btree_write_buffer.h" - #include "buckets.h" - #include "disk_accounting.h" -+#include "enumerated_ref.h" - #include "errcode.h" - #include "error.h" - #include "journal.h" -@@ -20,6 +21,7 @@ - #include "snapshot.h" - - #include -+#include - - static const char * const trans_commit_flags_strs[] = { - #define x(n, ...) #n, -@@ -164,6 +166,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); - EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); -+ kmsan_check_memory(insert, bkey_bytes(&insert->k)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) -@@ -336,6 +339,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, - BUG_ON(i->cached != path->cached); - BUG_ON(i->level != path->level); - BUG_ON(i->btree_id != path->btree_id); -+ BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); - EBUG_ON(!i->level && - btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_internal_snapshot_node) && -@@ -364,7 +368,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) - struct jset_entry_log *l = - container_of(entry, struct jset_entry_log, entry); - -- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -+ memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), -+ trans->fn, strlen(trans->fn), 0); - } - - static inline int btree_key_can_insert(struct btree_trans *trans, -@@ -517,69 +522,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ - } - } - --static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, -- unsigned *btree_id_updates_start) -+static int bch2_trans_commit_run_triggers(struct btree_trans *trans) - { -- bool trans_trigger_run; -+ unsigned sort_id_start = 0; - -- /* -- * Running triggers will append more updates to the list of updates as -- * we're walking it: -- */ -- do { -- trans_trigger_run = false; -+ while (sort_id_start < trans->nr_updates) { -+ unsigned i, sort_id = trans->updates[sort_id_start].sort_order; -+ bool trans_trigger_run; - -- for (unsigned i = *btree_id_updates_start; -- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; -- i++) { -- if (trans->updates[i].btree_id < btree_id) { -- *btree_id_updates_start = i; -- continue; -+ /* -+ * For a given btree, this algorithm runs insert triggers before -+ * overwrite triggers: this is so that when extents are being -+ * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop -+ * references before they are re-added. -+ * -+ * Running triggers will append more updates to the list of -+ * updates as we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = sort_id_start; -+ i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; -+ i++) { -+ if (trans->updates[i].sort_order < sort_id) { -+ sort_id_start = i; -+ continue; -+ } -+ -+ int ret = run_one_trans_trigger(trans, trans->updates + i); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ trans_trigger_run = true; - } -+ } while (trans_trigger_run); - -- int ret = run_one_trans_trigger(trans, trans->updates + i); -- if (ret < 0) -- return ret; -- if (ret) -- trans_trigger_run = true; -- } -- } while (trans_trigger_run); -- -- trans_for_each_update(trans, i) -- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && -- i->btree_id == btree_id && -- btree_node_type_has_trans_triggers(i->bkey_type) && -- (!i->insert_trigger_run || !i->overwrite_trigger_run)); -- -- return 0; --} -- --static int bch2_trans_commit_run_triggers(struct btree_trans *trans) --{ -- unsigned btree_id = 0, btree_id_updates_start = 0; -- int ret = 0; -- -- /* -- * -- * For a given btree, this algorithm runs insert triggers before -- * overwrite triggers: this is so that when extents are being moved -- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before -- * they are re-added. -- */ -- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -- if (btree_id == BTREE_ID_alloc) -- continue; -- -- ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); -- if (ret) -- return ret; -+ sort_id_start = i; - } - -- btree_id_updates_start = 0; -- ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); -- if (ret) -- return ret; -- - #ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && -@@ -666,10 +647,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BCH_TRANS_COMMIT_no_journal_res)) { -- if (bch2_journal_seq_verify) -+ if (static_branch_unlikely(&bch2_journal_seq_verify)) - trans_for_each_update(trans, i) - i->k->k.bversion.lo = trans->journal_res.seq; -- else if (bch2_inject_invalid_keys) -+ else if (static_branch_unlikely(&bch2_inject_invalid_keys)) - trans_for_each_update(trans, i) - i->k->k.bversion = MAX_VERSION; - } -@@ -682,18 +663,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - h = h->next; - } - -- struct jset_entry *entry = trans->journal_entries; -+ struct bkey_i *accounting; - - percpu_down_read(&c->mark_lock); -- for (entry = trans->journal_entries; -- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); -- entry = vstruct_next(entry)) -- if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && -- entry->start->k.type == KEY_TYPE_accounting) { -- ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); -- if (ret) -- goto revert_fs_usage; -- } -+ for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); -+ accounting != btree_trans_subbuf_top(trans, &trans->accounting); -+ accounting = bkey_next(accounting)) { -+ ret = bch2_accounting_trans_commit_hook(trans, -+ bkey_i_to_accounting(accounting), flags); -+ if (ret) -+ goto revert_fs_usage; -+ } - percpu_up_read(&c->mark_lock); - - /* XXX: we only want to run this if deltas are nonzero */ -@@ -717,8 +697,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - -- for (struct jset_entry *i = trans->journal_entries; -- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); -+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans); -+ i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, -@@ -773,11 +753,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - } - - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -- trans->journal_entries, -- trans->journal_entries_u64s); -+ btree_trans_journal_entries_start(trans), -+ trans->journal_entries.u64s); -+ -+ trans->journal_res.offset += trans->journal_entries.u64s; -+ trans->journal_res.u64s -= trans->journal_entries.u64s; - -- trans->journal_res.offset += trans->journal_entries_u64s; -- trans->journal_res.u64s -= trans->journal_entries_u64s; -+ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_write_buffer_keys, -+ BTREE_ID_accounting, 0, -+ trans->accounting.u64s)->_data, -+ btree_trans_subbuf_base(trans, &trans->accounting), -+ trans->accounting.u64s); - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; -@@ -799,13 +786,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); - percpu_down_read(&c->mark_lock); - revert_fs_usage: -- for (struct jset_entry *entry2 = trans->journal_entries; -- entry2 != entry; -- entry2 = vstruct_next(entry2)) -- if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && -- entry2->start->k.type == KEY_TYPE_accounting) -- bch2_accounting_trans_commit_revert(trans, -- bkey_i_to_accounting(entry2->start), flags); -+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); -+ i != accounting; -+ i = bkey_next(i)) -+ bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); - percpu_up_read(&c->mark_lock); - return ret; - } -@@ -903,18 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - struct bch_fs *c = trans->c; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - -- switch (ret) { -- case -BCH_ERR_btree_insert_btree_node_full: -- ret = bch2_btree_split_leaf(trans, i->path, flags); -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- trace_and_count(c, trans_restart_btree_node_split, trans, -- trace_ip, trans->paths + i->path); -- break; -- case -BCH_ERR_btree_insert_need_mark_replicas: -- ret = drop_locks_do(trans, -- bch2_accounting_update_sb(trans)); -- break; -- case -BCH_ERR_journal_res_get_blocked: -+ if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag -@@ -922,13 +895,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; -- break; -+ goto out; - } - - ret = drop_locks_do(trans, - bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); -+ goto out; -+ } -+ -+ switch (ret) { -+ case -BCH_ERR_btree_insert_btree_node_full: -+ ret = bch2_btree_split_leaf(trans, i->path, flags); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ trace_and_count(c, trans_restart_btree_node_split, trans, -+ trace_ip, trans->paths + i->path); -+ break; -+ case -BCH_ERR_btree_insert_need_mark_replicas: -+ ret = drop_locks_do(trans, -+ bch2_accounting_update_sb(trans)); - break; - case -BCH_ERR_btree_insert_need_journal_reclaim: - bch2_trans_unlock(trans); -@@ -950,7 +936,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - BUG_ON(ret >= 0); - break; - } -- -+out: - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && -@@ -978,8 +964,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) - return ret; - } - -- for (struct jset_entry *i = trans->journal_entries; -- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); -+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans); -+ i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) - if (i->type == BCH_JSET_ENTRY_btree_keys || - i->type == BCH_JSET_ENTRY_write_buffer_keys) { -@@ -988,6 +974,14 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) - return ret; - } - -+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); -+ i != btree_trans_subbuf_top(trans, &trans->accounting); -+ i = bkey_next(i)) { -+ int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); -+ if (ret) -+ return ret; -+ } -+ - return 0; - } - -@@ -1004,7 +998,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - goto out_reset; - - if (!trans->nr_updates && -- !trans->journal_entries_u64s) -+ !trans->journal_entries.u64s && -+ !trans->accounting.u64s) - goto out_reset; - - ret = bch2_trans_commit_run_triggers(trans); -@@ -1012,7 +1007,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - goto out_reset; - - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && -- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { -+ unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) - ret = do_bch2_trans_commit_to_journal_replay(trans); - else -@@ -1022,7 +1017,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - -- trans->journal_u64s = trans->journal_entries_u64s; -+ trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); -@@ -1078,7 +1073,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) - trace_and_count(c, transaction_commit, trans, _RET_IP_); - out: - if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) -- bch2_write_ref_put(c, BCH_WRITE_REF_trans); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); - out_reset: - if (!ret) - bch2_trans_downgrade(trans); -diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h -index a09cbe9cd94f..1cec08467e17 100644 ---- a/fs/bcachefs/btree_types.h -+++ b/fs/bcachefs/btree_types.h -@@ -2,13 +2,13 @@ - #ifndef _BCACHEFS_BTREE_TYPES_H - #define _BCACHEFS_BTREE_TYPES_H - -+#include - #include - #include - - #include "bbpos_types.h" - #include "btree_key_cache_types.h" - #include "buckets_types.h" --#include "darray.h" - #include "errcode.h" - #include "journal_types.h" - #include "replicas_types.h" -@@ -139,6 +139,7 @@ struct btree { - }; - - #define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ -+ x(cache_reserve) \ - x(lock_intent) \ - x(lock_write) \ - x(dirty) \ -@@ -257,9 +258,6 @@ struct btree_node_iter { - * - * BTREE_TRIGGER_insert - @new is entering the btree - * BTREE_TRIGGER_overwrite - @old is leaving the btree -- * -- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc -- * trigger - */ - #define BTREE_TRIGGER_FLAGS() \ - x(norun) \ -@@ -269,8 +267,7 @@ struct btree_node_iter { - x(gc) \ - x(insert) \ - x(overwrite) \ -- x(is_root) \ -- x(bucket_invalidate) -+ x(is_root) - - enum { - #define x(n) BTREE_ITER_FLAG_BIT_##n, -@@ -367,7 +364,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ - struct btree_iter { -- struct btree_trans *trans; - btree_path_idx_t path; - btree_path_idx_t update_path; - btree_path_idx_t key_cache_path; -@@ -423,6 +419,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) - - struct btree_insert_entry { - unsigned flags; -+ u8 sort_order; - u8 bkey_type; - enum btree_id btree_id:8; - u8 level:4; -@@ -477,6 +474,18 @@ struct btree_trans_paths { - struct btree_path paths[]; - }; - -+struct trans_kmalloc_trace { -+ unsigned long ip; -+ size_t bytes; -+}; -+typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; -+ -+struct btree_trans_subbuf { -+ u16 base; -+ u16 u64s; -+ u16 size;; -+}; -+ - struct btree_trans { - struct bch_fs *c; - -@@ -488,6 +497,9 @@ struct btree_trans { - void *mem; - unsigned mem_top; - unsigned mem_bytes; -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ darray_trans_kmalloc_trace trans_kmalloc_trace; -+#endif - - btree_path_idx_t nr_sorted; - btree_path_idx_t nr_paths; -@@ -528,9 +540,8 @@ struct btree_trans { - int srcu_idx; - - /* update path: */ -- u16 journal_entries_u64s; -- u16 journal_entries_size; -- struct jset_entry *journal_entries; -+ struct btree_trans_subbuf journal_entries; -+ struct btree_trans_subbuf accounting; - - struct btree_trans_commit_hook *hooks; - struct journal_entry_pin *journal_pin; -@@ -647,13 +658,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) - static inline void * - __btree_node_offset_to_ptr(const struct btree *b, u16 offset) - { -- return (void *) ((u64 *) b->data + 1 + offset); -+ return (void *) ((u64 *) b->data + offset); - } - - static inline u16 - __btree_node_ptr_to_offset(const struct btree *b, const void *p) - { -- u16 ret = (u64 *) p - 1 - (u64 *) b->data; -+ u16 ret = (u64 *) p - (u64 *) b->data; - - EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); - return ret; -@@ -853,6 +864,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree) - return BIT_ULL(btree) & mask; - } - -+static inline u8 btree_trigger_order(enum btree_id btree) -+{ -+ switch (btree) { -+ case BTREE_ID_alloc: -+ return U8_MAX; -+ case BTREE_ID_stripes: -+ return U8_MAX - 1; -+ default: -+ return btree; -+ } -+} -+ - struct btree_root { - struct btree *b; - -diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c -index 13d794f201a5..afd05c3dfd03 100644 ---- a/fs/bcachefs/btree_update.c -+++ b/fs/bcachefs/btree_update.c -@@ -14,10 +14,13 @@ - #include "snapshot.h" - #include "trace.h" - -+#include -+#include -+ - static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) - { -- return cmp_int(l->btree_id, r->btree_id) ?: -+ return cmp_int(l->sort_order, r->sort_order) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -@@ -126,7 +129,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - struct bpos new_pos) - { - struct bch_fs *c = trans->c; -- struct btree_iter old_iter, new_iter = { NULL }; -+ struct btree_iter old_iter, new_iter = {}; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; -@@ -140,7 +143,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); -- while ((old_k = bch2_btree_iter_prev(&old_iter)).k && -+ while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = -@@ -296,7 +299,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, - BTREE_ITER_intent| - BTREE_ITER_with_updates| - BTREE_ITER_not_extents); -- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); -+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) -@@ -322,8 +325,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans, - if (done) - goto out; - next: -- bch2_btree_iter_advance(&iter); -- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); -+ bch2_btree_iter_advance(trans, &iter); -+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) -@@ -397,6 +400,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - - n = (struct btree_insert_entry) { - .flags = flags, -+ .sort_order = btree_trigger_order(path->btree_id), - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, -@@ -508,9 +512,12 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, - return 0; - } - --int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -- struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -+int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags, -+ unsigned long ip) - { -+ kmsan_check_memory(k, bkey_bytes(&k->k)); -+ - btree_path_idx_t path_idx = iter->update_path ?: iter->path; - int ret; - -@@ -543,7 +550,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter - path_idx = iter->key_cache_path; - } - -- return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); -+ return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); - } - - int bch2_btree_insert_clone_trans(struct btree_trans *trans, -@@ -559,43 +566,42 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans, - return bch2_btree_insert_trans(trans, btree, n, 0); - } - --struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) -+void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, -+ struct btree_trans_subbuf *buf, -+ unsigned u64s) - { -- unsigned new_top = trans->journal_entries_u64s + u64s; -- unsigned old_size = trans->journal_entries_size; -- -- if (new_top > trans->journal_entries_size) { -- trans->journal_entries_size = roundup_pow_of_two(new_top); -+ unsigned new_top = buf->u64s + u64s; -+ unsigned old_size = buf->size; - -- btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; -- } -+ if (new_top > buf->size) -+ buf->size = roundup_pow_of_two(new_top); - -- struct jset_entry *n = -- bch2_trans_kmalloc_nomemzero(trans, -- trans->journal_entries_size * sizeof(u64)); -+ void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); - if (IS_ERR(n)) -- return ERR_CAST(n); -+ return n; - -- if (trans->journal_entries) -- memcpy(n, trans->journal_entries, old_size * sizeof(u64)); -- trans->journal_entries = n; -+ if (buf->u64s) -+ memcpy(n, -+ btree_trans_subbuf_base(trans, buf), -+ old_size * sizeof(u64)); -+ buf->base = (u64 *) n - (u64 *) trans->mem; - -- struct jset_entry *e = btree_trans_journal_entries_top(trans); -- trans->journal_entries_u64s = new_top; -- return e; -+ void *p = btree_trans_subbuf_top(trans, buf); -+ buf->u64s = new_top; -+ return p; - } - - int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) - { - bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); -- struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); -+ struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); - int ret = bkey_err(k); - if (ret) - goto err; - -- bch2_btree_iter_advance(iter); -- k = bch2_btree_iter_peek_slot(iter); -+ bch2_btree_iter_advance(trans, iter); -+ k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -631,7 +637,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, - BTREE_ITER_cached| - BTREE_ITER_not_extents| - BTREE_ITER_intent); -- ret = bch2_btree_iter_traverse(&iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -@@ -643,7 +649,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_intent|flags); -- int ret = bch2_btree_iter_traverse(&iter) ?: -+ int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -@@ -692,7 +698,7 @@ int bch2_btree_delete(struct btree_trans *trans, - bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_cached| - BTREE_ITER_intent); -- ret = bch2_btree_iter_traverse(&iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, update_flags); - bch2_trans_iter_exit(trans, &iter); - -@@ -710,7 +716,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); -- while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { -+ while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; -@@ -805,7 +811,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - -- int ret = bch2_btree_iter_traverse(&iter) ?: -+ int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_bit_mod_iter(trans, &iter, set); - bch2_trans_iter_exit(trans, &iter); - return ret; -@@ -826,7 +832,6 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, - int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) - { - unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); -- prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); - - int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) -@@ -839,7 +844,20 @@ int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) - - struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); - journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); -- memcpy(l->d, buf->buf, buf->pos); -+ memcpy_and_pad(l->d, u64s * sizeof(u64), buf->buf, buf->pos, 0); -+ return 0; -+} -+ -+int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, -+ unsigned level, struct bkey_i *k) -+{ -+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); -+ int ret = PTR_ERR_OR_ZERO(e); -+ if (ret) -+ return ret; -+ -+ journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s); -+ bkey_copy(e->start, k); - return 0; - } - -@@ -852,7 +870,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - prt_vprintf(&buf, fmt, args); - - unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); -- prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); - - int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) -@@ -865,7 +882,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - - struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); - journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); -- memcpy(l->d, buf.buf, buf.pos); -+ memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); - c->journal.early_journal_entries.nr += jset_u64s(u64s); - } else { - ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, -diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h -index 47d8690f01bf..a54dc7277177 100644 ---- a/fs/bcachefs/btree_update.h -+++ b/fs/bcachefs/btree_update.h -@@ -102,26 +102,60 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter * - int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos); - --int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, -- struct bkey_i *, enum btree_iter_update_trigger_flags); -+int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_iter_update_trigger_flags, -+ unsigned long); - --struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); -+static inline int __must_check -+bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -+{ -+ return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); -+} -+ -+static inline void *btree_trans_subbuf_base(struct btree_trans *trans, -+ struct btree_trans_subbuf *buf) -+{ -+ return (u64 *) trans->mem + buf->base; -+} -+ -+static inline void *btree_trans_subbuf_top(struct btree_trans *trans, -+ struct btree_trans_subbuf *buf) -+{ -+ return (u64 *) trans->mem + buf->base + buf->u64s; -+} -+ -+void *__bch2_trans_subbuf_alloc(struct btree_trans *, -+ struct btree_trans_subbuf *, -+ unsigned); -+ -+static inline void * -+bch2_trans_subbuf_alloc(struct btree_trans *trans, -+ struct btree_trans_subbuf *buf, -+ unsigned u64s) -+{ -+ if (buf->u64s + u64s > buf->size) -+ return __bch2_trans_subbuf_alloc(trans, buf, u64s); -+ -+ void *p = btree_trans_subbuf_top(trans, buf); -+ buf->u64s += u64s; -+ return p; -+} -+ -+static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) -+{ -+ return btree_trans_subbuf_base(trans, &trans->journal_entries); -+} - - static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) - { -- return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); -+ return btree_trans_subbuf_top(trans, &trans->journal_entries); - } - - static inline struct jset_entry * - bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) - { -- if (!trans->journal_entries || -- trans->journal_entries_u64s + u64s > trans->journal_entries_size) -- return __bch2_trans_jset_entry_alloc(trans, u64s); -- -- struct jset_entry *e = btree_trans_journal_entries_top(trans); -- trans->journal_entries_u64s += u64s; -- return e; -+ return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); - } - - int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); -@@ -133,6 +167,10 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr - enum btree_id btree, - struct bkey_i *k) - { -+ kmsan_check_memory(k, bkey_bytes(&k->k)); -+ -+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); -+ - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); - dump_stack(); -@@ -168,6 +206,8 @@ void bch2_trans_commit_hook(struct btree_trans *, - int __bch2_trans_commit(struct btree_trans *, unsigned); - - int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); -+int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); -+ - __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); - __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); - -@@ -213,12 +253,15 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) - bch2_path_put(trans, i->path, true); - - trans->nr_updates = 0; -- trans->journal_entries_u64s = 0; -+ trans->journal_entries.u64s = 0; -+ trans->journal_entries.size = 0; -+ trans->accounting.u64s = 0; -+ trans->accounting.size = 0; - trans->hooks = NULL; - trans->extra_disk_res = 0; - } - --static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, -+static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, - unsigned type, unsigned min_bytes) - { - unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); -@@ -241,7 +284,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t - return mut; - } - --static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) -+static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) - { - return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); - } -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -index e4e7c804625e..74e65714fecd 100644 ---- a/fs/bcachefs/btree_update_interior.c -+++ b/fs/bcachefs/btree_update_interior.c -@@ -14,6 +14,7 @@ - #include "btree_locking.h" - #include "buckets.h" - #include "clock.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extents.h" - #include "io_write.h" -@@ -35,6 +36,8 @@ static const char * const bch2_btree_update_modes[] = { - NULL - }; - -+static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); -+ - static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, struct keylist *); - static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -@@ -54,6 +57,8 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - struct bkey_buf prev; - int ret = 0; - -+ printbuf_indent_add_nextline(&buf, 2); -+ - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); -@@ -64,19 +69,20 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - - if (b == btree_node_root(c, b)) { - if (!bpos_eq(b->data->min_key, POS_MIN)) { -- printbuf_reset(&buf); -+ ret = __bch2_topology_error(c, &buf); -+ - bch2_bpos_to_text(&buf, b->data->min_key); - log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); -- goto topology_repair; -+ goto out; - } - - if (!bpos_eq(b->data->max_key, SPOS_MAX)) { -- printbuf_reset(&buf); -+ ret = __bch2_topology_error(c, &buf); - bch2_bpos_to_text(&buf, b->data->max_key); - log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); -- goto topology_repair; -+ goto out; - } - } - -@@ -94,20 +100,19 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - : bpos_successor(prev.k->k.p); - - if (!bpos_eq(expected_min, bp.v->min_key)) { -- bch2_topology_error(c); -+ ret = __bch2_topology_error(c, &buf); - -- printbuf_reset(&buf); -- prt_str(&buf, "end of prev node doesn't match start of next node\n in "); -+ prt_str(&buf, "end of prev node doesn't match start of next node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -- prt_str(&buf, "\n prev "); -+ prt_str(&buf, "\nprev "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); -- prt_str(&buf, "\n next "); -+ prt_str(&buf, "\nnext "); - bch2_bkey_val_to_text(&buf, c, k); - - log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); -- goto topology_repair; -+ goto out; - } - - bch2_bkey_buf_reassemble(&prev, c, k); -@@ -115,29 +120,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - } - - if (bkey_deleted(&prev.k->k)) { -- bch2_topology_error(c); -+ ret = __bch2_topology_error(c, &buf); - -- printbuf_reset(&buf); -- prt_str(&buf, "empty interior node\n in "); -+ prt_str(&buf, "empty interior node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); -- goto topology_repair; - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { -- bch2_topology_error(c); -+ ret = __bch2_topology_error(c, &buf); - -- printbuf_reset(&buf); -- prt_str(&buf, "last child node doesn't end at end of parent node\n in "); -+ prt_str(&buf, "last child node doesn't end at end of parent node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -- prt_str(&buf, "\n last key "); -+ prt_str(&buf, "\nlast key "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - - log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); -- goto topology_repair; - } - out: - fsck_err: -@@ -145,9 +146,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) - bch2_bkey_buf_exit(&prev, c); - printbuf_exit(&buf); - return ret; --topology_repair: -- ret = bch2_topology_error(c); -- goto out; - } - - /* Calculate ideal packed bkey format for new btree nodes: */ -@@ -287,6 +285,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, - struct disk_reservation *res, - struct closure *cl, - bool interior_node, -+ unsigned target, - unsigned flags) - { - struct bch_fs *c = trans->c; -@@ -320,6 +319,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, - mutex_unlock(&c->btree_reserve_cache_lock); - retry: - ret = bch2_alloc_sectors_start_trans(trans, -+ target ?: - c->opts.metadata_target ?: - c->opts.foreground_target, - 0, -@@ -328,7 +328,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, - res->nr_replicas, - min(res->nr_replicas, - c->opts.metadata_replicas_required), -- watermark, 0, cl, &wp); -+ watermark, -+ target ? BCH_WRITE_only_specified_devs : 0, -+ cl, &wp); - if (unlikely(ret)) - goto err; - -@@ -508,6 +510,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * - static int bch2_btree_reserve_get(struct btree_trans *trans, - struct btree_update *as, - unsigned nr_nodes[2], -+ unsigned target, - unsigned flags, - struct closure *cl) - { -@@ -530,7 +533,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, - - while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, -- interior, flags); -+ interior, target, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err; -@@ -649,6 +652,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, - return 0; - } - -+/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ -+static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) -+{ -+ struct btree_node *b_data = READ_ONCE(b->data); -+ -+ return (b_data ? b_data->keys.seq : 0) == seq; -+} -+ - static void btree_update_nodes_written(struct btree_update *as) - { - struct bch_fs *c = as->c; -@@ -677,17 +688,9 @@ static void btree_update_nodes_written(struct btree_update *as) - * on disk: - */ - for (i = 0; i < as->nr_old_nodes; i++) { -- __le64 seq; -- - b = as->old_nodes[i]; - -- bch2_trans_begin(trans); -- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); -- seq = b->data ? b->data->keys.seq : 0; -- six_unlock_read(&b->c.lock); -- bch2_trans_unlock_long(trans); -- -- if (seq == as->old_nodes_seq[i]) -+ if (btree_node_seq_matches(b, as->old_nodes_seq[i])) - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, - TASK_UNINTERRUPTIBLE); - } -@@ -1119,7 +1122,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * - - static struct btree_update * - bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, -- unsigned level_start, bool split, unsigned flags) -+ unsigned level_start, bool split, -+ unsigned target, unsigned flags) - { - struct bch_fs *c = trans->c; - struct btree_update *as; -@@ -1224,12 +1228,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - - ret = bch2_disk_reservation_get(c, &as->disk_res, - (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), -- c->opts.metadata_replicas, -+ READ_ONCE(c->opts.metadata_replicas), - disk_res_flags); - if (ret) - goto err; - -- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); -+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); - if (bch2_err_matches(ret, ENOSPC) || - bch2_err_matches(ret, ENOMEM)) { - struct closure cl; -@@ -1248,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - closure_init_stack(&cl); - - do { -- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); -+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - - bch2_trans_unlock(trans); - bch2_wait_on_allocator(c, &cl); -@@ -1271,7 +1275,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - bch2_btree_update_free(as, trans); - if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS) && -- ret != -BCH_ERR_journal_reclaim_would_deadlock) -+ ret != -BCH_ERR_journal_reclaim_would_deadlock && -+ ret != -BCH_ERR_journal_shutdown) - bch_err_fn_ratelimited(c, ret); - return ERR_PTR(ret); - } -@@ -1391,7 +1396,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, - printbuf_exit(&buf); - } - --static void -+static int - bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, -@@ -1413,7 +1418,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, - insert = bkey_next(insert)) - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - -- if (bch2_btree_node_check_topology(trans, b)) { -+ int ret = bch2_btree_node_check_topology(trans, b); -+ if (ret) { - struct printbuf buf = PRINTBUF; - - for (struct bkey_i *k = keys->keys; -@@ -1423,11 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, - prt_newline(&buf); - } - -- panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); -+ bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s", -+ (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf); -+ dump_stack(); -+ return ret; - } - - memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); - keys->top_p -= insert->_data - keys->keys_p; -+ return 0; - } - - static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) -@@ -1561,11 +1571,11 @@ static void __btree_split_node(struct btree_update *as, - * nodes that were coalesced, and thus in the middle of a child node post - * coalescing: - */ --static void btree_split_insert_keys(struct btree_update *as, -- struct btree_trans *trans, -- btree_path_idx_t path_idx, -- struct btree *b, -- struct keylist *keys) -+static int btree_split_insert_keys(struct btree_update *as, -+ struct btree_trans *trans, -+ btree_path_idx_t path_idx, -+ struct btree *b, -+ struct keylist *keys) - { - struct btree_path *path = trans->paths + path_idx; - -@@ -1575,8 +1585,12 @@ static void btree_split_insert_keys(struct btree_update *as, - - bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - -- bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); -+ int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); -+ if (ret) -+ return ret; - } -+ -+ return 0; - } - - static int btree_split(struct btree_update *as, struct btree_trans *trans, -@@ -1609,8 +1623,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, - __btree_split_node(as, trans, b, n, keys); - - if (keys) { -- btree_split_insert_keys(as, trans, path, n1, keys); -- btree_split_insert_keys(as, trans, path, n2, keys); -+ ret = btree_split_insert_keys(as, trans, path, n1, keys) ?: -+ btree_split_insert_keys(as, trans, path, n2, keys); -+ if (ret) -+ goto err; - BUG_ON(!bch2_keylist_empty(keys)); - } - -@@ -1656,7 +1672,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, - n3->sib_u64s[0] = U16_MAX; - n3->sib_u64s[1] = U16_MAX; - -- btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); -+ ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); -+ if (ret) -+ goto err; - } - } else { - trace_and_count(c, btree_node_compact, trans, b); -@@ -1664,7 +1682,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, - n1 = bch2_btree_node_alloc_replacement(as, trans, b); - - if (keys) { -- btree_split_insert_keys(as, trans, path, n1, keys); -+ ret = btree_split_insert_keys(as, trans, path, n1, keys); -+ if (ret) -+ goto err; - BUG_ON(!bch2_keylist_empty(keys)); - } - -@@ -1782,11 +1802,24 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t - int ret; - - lockdep_assert_held(&c->gc_lock); -- BUG_ON(!btree_node_intent_locked(path, b->c.level)); - BUG_ON(!b->c.level); - BUG_ON(!as || as->b); - bch2_verify_keylist_sorted(keys); - -+ if (!btree_node_intent_locked(path, b->c.level)) { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, "%s(): node not locked at level %u\n", -+ __func__, b->c.level); -+ bch2_btree_update_to_text(&buf, as); -+ bch2_btree_path_to_text(&buf, trans, path_idx); -+ bch2_fs_emergency_read_only2(c, &buf); -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ return -EIO; -+ } -+ - ret = bch2_btree_node_lock_write(trans, path, &b->c); - if (ret) - return ret; -@@ -1798,15 +1831,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t - goto split; - } - -- ret = bch2_btree_node_check_topology(trans, b); -+ -+ ret = bch2_btree_node_check_topology(trans, b) ?: -+ bch2_btree_insert_keys_interior(as, trans, path, b, -+ path->l[b->c.level].iter, keys); - if (ret) { - bch2_btree_node_unlock_write(trans, path, b); - return ret; - } - -- bch2_btree_insert_keys_interior(as, trans, path, b, -- path->l[b->c.level].iter, keys); -- - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - -@@ -1852,7 +1885,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, - - as = bch2_btree_update_start(trans, trans->paths + path, - trans->paths[path].level, -- true, flags); -+ true, 0, flags); - if (IS_ERR(as)) - return PTR_ERR(as); - -@@ -1922,7 +1955,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, - return bch2_btree_split_leaf(trans, path, flags); - - struct btree_update *as = -- bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); -+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, -+ true, 0, flags); - if (IS_ERR(as)) - return PTR_ERR(as); - -@@ -2007,18 +2041,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, - } - - if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { -- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -- -- bch2_bpos_to_text(&buf1, prev->data->max_key); -- bch2_bpos_to_text(&buf2, next->data->min_key); -- bch_err(c, -- "%s(): btree topology error:\n" -- " prev ends at %s\n" -- " next starts at %s", -- __func__, buf1.buf, buf2.buf); -- printbuf_exit(&buf1); -- printbuf_exit(&buf2); -- ret = bch2_topology_error(c); -+ struct printbuf buf = PRINTBUF; -+ -+ printbuf_indent_add_nextline(&buf, 2); -+ prt_printf(&buf, "%s(): ", __func__); -+ ret = __bch2_topology_error(c, &buf); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "prev ends at "); -+ bch2_bpos_to_text(&buf, prev->data->max_key); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "next starts at "); -+ bch2_bpos_to_text(&buf, next->data->min_key); -+ -+ bch_err(c, "%s", buf.buf); -+ printbuf_exit(&buf); - goto err; - } - -@@ -2047,7 +2085,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, - - parent = btree_node_parent(trans->paths + path, b); - as = bch2_btree_update_start(trans, trans->paths + path, level, false, -- BCH_TRANS_COMMIT_no_enospc|flags); -+ 0, BCH_TRANS_COMMIT_no_enospc|flags); - ret = PTR_ERR_OR_ZERO(as); - if (ret) - goto err; -@@ -2126,9 +2164,35 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, - goto out; - } - -+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, -+ struct btree *b) -+{ -+ bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, -+ BTREE_MAX_DEPTH, b->c.level, -+ BTREE_ITER_intent); -+ int ret = bch2_btree_iter_traverse(trans, iter); -+ if (ret) -+ goto err; -+ -+ /* has node been freed? */ -+ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { -+ /* node has been freed: */ -+ BUG_ON(!btree_node_dying(b)); -+ ret = -BCH_ERR_btree_node_dying; -+ goto err; -+ } -+ -+ BUG_ON(!btree_node_hashed(b)); -+ return 0; -+err: -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+} -+ - int bch2_btree_node_rewrite(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, -+ unsigned target, - unsigned flags) - { - struct bch_fs *c = trans->c; -@@ -2141,7 +2205,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, - - struct btree_path *path = btree_iter_path(trans, iter); - parent = btree_node_parent(path, b); -- as = bch2_btree_update_start(trans, path, b->c.level, false, flags); -+ as = bch2_btree_update_start(trans, path, b->c.level, -+ false, target, flags); - ret = PTR_ERR_OR_ZERO(as); - if (ret) - goto out; -@@ -2191,67 +2256,83 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, - goto out; - } - --struct async_btree_rewrite { -- struct bch_fs *c; -- struct work_struct work; -- struct list_head list; -- enum btree_id btree_id; -- unsigned level; -- struct bkey_buf key; --}; -- --static int async_btree_node_rewrite_trans(struct btree_trans *trans, -- struct async_btree_rewrite *a) -+static int bch2_btree_node_rewrite_key(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_i *k, unsigned flags) - { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, -- a->btree_id, a->key.k->k.p, -- BTREE_MAX_DEPTH, a->level, 0); -- struct btree *b = bch2_btree_iter_peek_node(&iter); -+ btree, k->k.p, -+ BTREE_MAX_DEPTH, level, 0); -+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - -- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); -+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); - ret = found -- ? bch2_btree_node_rewrite(trans, &iter, b, 0) -+ ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) - : -ENOENT; -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} - --#if 0 -- /* Tracepoint... */ -- if (!ret || ret == -ENOENT) { -- struct bch_fs *c = trans->c; -- struct printbuf buf = PRINTBUF; -+int bch2_btree_node_rewrite_pos(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bpos pos, -+ unsigned target, -+ unsigned flags) -+{ -+ BUG_ON(!level); - -- if (!ret) { -- prt_printf(&buf, "rewrite node:\n "); -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); -- } else { -- prt_printf(&buf, "node to rewrite not found:\n want: "); -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); -- prt_printf(&buf, "\n got: "); -- if (b) -- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -- else -- prt_str(&buf, "(null)"); -- } -- bch_info(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } --#endif --out: -+ /* Traverse one depth lower to get a pointer to the node itself: */ -+ struct btree_iter iter; -+ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); -+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter); -+ int ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); -+err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } - -+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, -+ struct btree *b, unsigned flags) -+{ -+ struct btree_iter iter; -+ int ret = get_iter_to_node(trans, &iter, b); -+ if (ret) -+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret; -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+struct async_btree_rewrite { -+ struct bch_fs *c; -+ struct work_struct work; -+ struct list_head list; -+ enum btree_id btree_id; -+ unsigned level; -+ struct bkey_buf key; -+}; -+ - static void async_btree_node_rewrite_work(struct work_struct *work) - { - struct async_btree_rewrite *a = - container_of(work, struct async_btree_rewrite, work); - struct bch_fs *c = a->c; - -- int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); -- if (ret != -ENOENT) -+ int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, -+ a->btree_id, a->level, a->key.k, 0)); -+ if (ret != -ENOENT && -+ !bch2_err_matches(ret, EROFS) && -+ ret != -BCH_ERR_journal_shutdown) - bch_err_fn_ratelimited(c, ret); - - spin_lock(&c->btree_node_rewrites_lock); -@@ -2261,7 +2342,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) - closure_wake_up(&c->btree_node_rewrites_wait); - - bch2_bkey_buf_exit(&a->key, c); -- bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); - kfree(a); - } - -@@ -2282,8 +2363,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) - bool now = false, pending = false; - - spin_lock(&c->btree_node_rewrites_lock); -- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && -- bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { -+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && -+ enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { - list_add(&a->list, &c->btree_node_rewrites); - now = true; - } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { -@@ -2322,7 +2403,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) - if (!a) - break; - -- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); -+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_node_rewrite_worker, &a->work); - } - } -@@ -2352,7 +2433,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, - bool skip_triggers) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter2 = { NULL }; -+ struct btree_iter iter2 = {}; - struct btree *parent; - int ret; - -@@ -2376,7 +2457,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, - - parent = btree_node_parent(btree_iter_path(trans, iter), b); - if (parent) { -- bch2_trans_copy_iter(&iter2, iter); -+ bch2_trans_copy_iter(trans, &iter2, iter); - - iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_intent, -@@ -2390,7 +2471,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, - - trans->paths_sorted = false; - -- ret = bch2_btree_iter_traverse(&iter2) ?: -+ ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); - if (ret) - goto err; -@@ -2494,30 +2575,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, - unsigned commit_flags, bool skip_triggers) - { - struct btree_iter iter; -- int ret; -- -- bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, -- BTREE_MAX_DEPTH, b->c.level, -- BTREE_ITER_intent); -- ret = bch2_btree_iter_traverse(&iter); -+ int ret = get_iter_to_node(trans, &iter, b); - if (ret) -- goto out; -- -- /* has node been freed? */ -- if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { -- /* node has been freed: */ -- BUG_ON(!btree_node_dying(b)); -- goto out; -- } -- -- BUG_ON(!btree_node_hashed(b)); -+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - - bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, - !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); - - ret = bch2_btree_node_update_key(trans, &iter, b, new_key, - commit_flags, skip_triggers); --out: - bch2_trans_iter_exit(trans, &iter); - return ret; - } -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -index 26d646e1275c..7fe793788a79 100644 ---- a/fs/bcachefs/btree_update_interior.h -+++ b/fs/bcachefs/btree_update_interior.h -@@ -144,7 +144,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - - EBUG_ON(!btree_node_locked(path, level)); - -- if (bch2_btree_node_merging_disabled) -+ if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) - return 0; - - b = path->l[level].b; -@@ -168,8 +168,15 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - } - - int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, -- struct btree *, unsigned); -+ struct btree *, unsigned, unsigned); -+int bch2_btree_node_rewrite_pos(struct btree_trans *, -+ enum btree_id, unsigned, -+ struct bpos, unsigned, unsigned); -+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, -+ struct btree *, unsigned); -+ - void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); -+ - int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, - unsigned, bool); -diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c -index 2c09d19dd621..efb0c64d0aac 100644 ---- a/fs/bcachefs/btree_write_buffer.c -+++ b/fs/bcachefs/btree_write_buffer.c -@@ -7,6 +7,7 @@ - #include "btree_update_interior.h" - #include "btree_write_buffer.h" - #include "disk_accounting.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extents.h" - #include "journal.h" -@@ -144,7 +145,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite - EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - -- ret = bch2_btree_iter_traverse(iter); -+ ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - -@@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite - return wb_flush_one_slowpath(trans, iter, wb); - } - -+ EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); -+ - bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); - (*fast)++; - return 0; -@@ -208,7 +211,7 @@ btree_write_buffered_insert(struct btree_trans *trans, - - trans->journal_res.seq = wb->journal_seq; - -- ret = bch2_btree_iter_traverse(&iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -@@ -285,7 +288,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_write_buffer *wb = &c->btree_write_buffer; -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; - bool write_locked = false; - bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); -@@ -368,7 +371,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - write_locked = false; - - ret = lockrestart_do(trans, -- bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_iter_traverse(trans, &iter) ?: - bch2_foreground_maybe_merge(trans, iter.path, 0, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_journal_reclaim| -@@ -385,7 +388,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - } - -- bch2_btree_iter_set_pos(&iter, k->k.k.p); -+ bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); - btree_iter_path(trans, &iter)->preserve = false; - - bool accounting_accumulated = false; -@@ -428,10 +431,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) - */ - trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - -- sort(wb->flushing.keys.data, -- wb->flushing.keys.nr, -- sizeof(wb->flushing.keys.data[0]), -- wb_key_seq_cmp, NULL); -+ sort_nonatomic(wb->flushing.keys.data, -+ wb->flushing.keys.nr, -+ sizeof(wb->flushing.keys.data[0]), -+ wb_key_seq_cmp, NULL); - - darray_for_each(wb->flushing.keys, i) { - if (!i->journal_seq) -@@ -629,11 +632,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) - { - struct bch_fs *c = trans->c; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; - - int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); -- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - return ret; - } - -@@ -692,7 +695,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) - } while (!ret && bch2_btree_write_buffer_should_flush(c)); - mutex_unlock(&wb->flushing.lock); - -- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - } - - static void wb_accounting_sort(struct btree_write_buffer *wb) -@@ -821,9 +824,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ - bch2_journal_pin_drop(&c->journal, &dst->wb->pin); - - if (bch2_btree_write_buffer_should_flush(c) && -- __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && -+ __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) -- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - - if (dst->wb == &wb->flushing) - mutex_unlock(&wb->flushing.lock); -@@ -866,13 +869,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) - darray_exit(&wb->inc.keys); - } - --int bch2_fs_btree_write_buffer_init(struct bch_fs *c) -+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) - { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - mutex_init(&wb->inc.lock); - mutex_init(&wb->flushing.lock); - INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); -+} -+ -+int bch2_fs_btree_write_buffer_init(struct bch_fs *c) -+{ -+ struct btree_write_buffer *wb = &c->btree_write_buffer; - - /* Will be resized by journal as needed: */ - unsigned initial_size = 1 << 16; -diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h -index d535cea28bde..05f56fd1eed0 100644 ---- a/fs/bcachefs/btree_write_buffer.h -+++ b/fs/bcachefs/btree_write_buffer.h -@@ -101,6 +101,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t - - int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); - void bch2_fs_btree_write_buffer_exit(struct bch_fs *); -+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); - int bch2_fs_btree_write_buffer_init(struct bch_fs *); - - #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ -diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h -index e9e76e20f43b..d39d163c6ea9 100644 ---- a/fs/bcachefs/btree_write_buffer_types.h -+++ b/fs/bcachefs/btree_write_buffer_types.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H - #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H - --#include "darray.h" -+#include - #include "journal_types.h" - - #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 -diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c -index 345b117a4a4a..8bb6384190c5 100644 ---- a/fs/bcachefs/buckets.c -+++ b/fs/bcachefs/buckets.c -@@ -29,9 +29,16 @@ - #include - - void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) -+{ -+ for (unsigned i = 0; i < BCH_DATA_NR; i++) -+ usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); -+} -+ -+void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) - { - memset(usage, 0, sizeof(*usage)); -- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); -+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, -+ sizeof(struct bch_dev_usage_full) / sizeof(u64)); - } - - static u64 reserve_factor(u64 r) -@@ -75,7 +82,7 @@ bch2_fs_usage_read_short(struct bch_fs *c) - - void bch2_dev_usage_to_text(struct printbuf *out, - struct bch_dev *ca, -- struct bch_dev_usage *usage) -+ struct bch_dev_usage_full *usage) - { - if (out->nr_tabstops < 5) { - printbuf_tabstops_reset(out); -@@ -365,7 +372,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); -- ret = bch2_btree_iter_traverse(&iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); -@@ -381,6 +388,31 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, - return ret; - } - -+static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, -+ struct bkey_s_c k, bool insert, enum bch_sb_error_id id) -+{ -+ struct bch_fs *c = trans->c; -+ -+ prt_printf(buf, "\nwhile marking "); -+ bch2_bkey_val_to_text(buf, c, k); -+ prt_newline(buf); -+ -+ bool print = __bch2_count_fsck_err(c, id, buf); -+ -+ int ret = bch2_run_explicit_recovery_pass(c, buf, -+ BCH_RECOVERY_PASS_check_allocations, 0); -+ -+ if (insert) { -+ bch2_trans_updates_to_text(buf, trans); -+ __bch2_inconsistent_error(c, buf); -+ ret = -BCH_ERR_bucket_ref_update; -+ } -+ -+ if (print || insert) -+ bch2_print_str(c, KERN_ERR, buf->buf); -+ return ret; -+} -+ - int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, -@@ -396,32 +428,29 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - - BUG_ON(!sectors); - -- if (gen_after(ptr->gen, b_gen)) { -- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -- log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, -- "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" -- "while marking %s", -+ if (unlikely(gen_after(ptr->gen, b_gen))) { -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), -- ptr->gen, -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -- if (inserting) -- goto err; -+ ptr->gen); -+ -+ ret = bucket_ref_update_err(trans, &buf, k, inserting, -+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); - goto out; - } - -- if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { -- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -- log_fsck_err(trans, ptr_too_stale, -- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" -- "while marking %s", -+ if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), -- ptr->gen, -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -- if (inserting) -- goto err; -+ ptr->gen); -+ -+ ret = bucket_ref_update_err(trans, &buf, k, inserting, -+ BCH_FSCK_ERR_ptr_too_stale); - goto out; - } - -@@ -430,62 +459,50 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - goto out; - } - -- if (b_gen != ptr->gen) { -- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -- log_fsck_err(trans, stale_dirty_ptr, -- "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" -- "while marking %s", -+ if (unlikely(b_gen != ptr->gen)) { -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, -+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", - ptr->dev, bucket_nr, b_gen, - bucket_gen_get(ca, bucket_nr), - bch2_data_type_str(bucket_data_type ?: ptr_data_type), -- ptr->gen, -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -- if (inserting) -- goto err; -+ ptr->gen); -+ -+ ret = bucket_ref_update_err(trans, &buf, k, inserting, -+ BCH_FSCK_ERR_stale_dirty_ptr); - goto out; - } - -- if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { -- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -- log_fsck_err(trans, ptr_bucket_data_type_mismatch, -- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" -- "while marking %s", -- ptr->dev, bucket_nr, b_gen, -- bch2_data_type_str(bucket_data_type), -- bch2_data_type_str(ptr_data_type), -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -- if (inserting) -- goto err; -+ if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", -+ ptr->dev, bucket_nr, b_gen, -+ bch2_data_type_str(bucket_data_type), -+ bch2_data_type_str(ptr_data_type)); -+ -+ ret = bucket_ref_update_err(trans, &buf, k, inserting, -+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); - goto out; - } - -- if ((u64) *bucket_sectors + sectors > U32_MAX) { -- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -- log_fsck_err(trans, bucket_sector_count_overflow, -- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" -- "while marking %s", -+ if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), -- *bucket_sectors, sectors, -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -- if (inserting) -- goto err; -+ *bucket_sectors, sectors); -+ -+ ret = bucket_ref_update_err(trans, &buf, k, inserting, -+ BCH_FSCK_ERR_bucket_sector_count_overflow); - sectors = -*bucket_sectors; -+ goto out; - } - - *bucket_sectors += sectors; - out: - printbuf_exit(&buf); - return ret; --err: --fsck_err: -- bch2_dump_trans_updates(trans); -- bch2_inconsistent_error(c); -- ret = -BCH_ERR_bucket_ref_update; -- goto out; - } - - void bch2_trans_account_disk_usage_change(struct btree_trans *trans) -@@ -582,6 +599,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans, - } - - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); -+ if (!bucket_valid(ca, bucket.offset)) { -+ if (insert) { -+ bch2_dev_bucket_missing(ca, bucket.offset); -+ ret = -BCH_ERR_trigger_pointer; -+ } -+ goto err; -+ } - - if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); -@@ -590,11 +614,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, - if (ret) - goto err; - -- if (!p.ptr.cached) { -- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); -- if (ret) -- goto err; -- } -+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); -+ if (ret) -+ goto err; - } - - if (flags & BTREE_TRIGGER_gc) { -@@ -653,9 +675,9 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); - -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- }; -+ struct disk_accounting_pos acc; -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = data_type; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); -@@ -674,26 +696,28 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, - return -BCH_ERR_ENOMEM_mark_stripe_ptr; - } - -- mutex_lock(&c->ec_stripes_heap_lock); -+ gc_stripe_lock(m); - - if (!m || !m->alive) { -- mutex_unlock(&c->ec_stripes_heap_lock); -+ gc_stripe_unlock(m); - struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", -+ (u64) p.ec.idx); - bch2_bkey_val_to_text(&buf, c, k); -- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", -- (u64) p.ec.idx, buf.buf); -+ __bch2_inconsistent_error(c, &buf); -+ bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -- bch2_inconsistent_error(c); - return -BCH_ERR_trigger_stripe_pointer; - } - - m->block_sectors[p.ec.block] += sectors; - -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- }; -- memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); -- mutex_unlock(&c->ec_stripes_heap_lock); -+ struct disk_accounting_pos acc; -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; -+ unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); -+ gc_stripe_unlock(m); - - acc.replicas.data_type = data_type; - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true); -@@ -719,16 +743,14 @@ static int __trigger_extent(struct btree_trans *trans, - : BCH_DATA_user; - int ret = 0; - -- struct disk_accounting_pos acc_replicas_key = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- .replicas.data_type = data_type, -- .replicas.nr_devs = 0, -- .replicas.nr_required = 1, -- }; -+ struct disk_accounting_pos acc_replicas_key; -+ memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); -+ acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; -+ acc_replicas_key.replicas.data_type = data_type; -+ acc_replicas_key.replicas.nr_devs = 0; -+ acc_replicas_key.replicas.nr_required = 1; - -- struct disk_accounting_pos acct_compression_key = { -- .type = BCH_DISK_ACCOUNTING_compression, -- }; -+ unsigned cur_compression_type = 0; - u64 compression_acct[3] = { 1, 0, 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -@@ -762,13 +784,13 @@ static int __trigger_extent(struct btree_trans *trans, - acc_replicas_key.replicas.nr_required = 0; - } - -- if (acct_compression_key.compression.type && -- acct_compression_key.compression.type != p.crc.compression_type) { -+ if (cur_compression_type && -+ cur_compression_type != p.crc.compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - -- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, -- ARRAY_SIZE(compression_acct), gc); -+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, -+ compression, cur_compression_type); - if (ret) - return ret; - -@@ -777,7 +799,7 @@ static int __trigger_extent(struct btree_trans *trans, - compression_acct[2] = 0; - } - -- acct_compression_key.compression.type = p.crc.compression_type; -+ cur_compression_type = p.crc.compression_type; - if (p.crc.compression_type) { - compression_acct[1] += p.crc.uncompressed_size; - compression_acct[2] += p.crc.compressed_size; -@@ -791,45 +813,34 @@ static int __trigger_extent(struct btree_trans *trans, - } - - if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { -- struct disk_accounting_pos acc_snapshot_key = { -- .type = BCH_DISK_ACCOUNTING_snapshot, -- .snapshot.id = k.k->p.snapshot, -- }; -- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); -+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); - if (ret) - return ret; - } - -- if (acct_compression_key.compression.type) { -+ if (cur_compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - -- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, -- ARRAY_SIZE(compression_acct), gc); -+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, -+ compression, cur_compression_type); - if (ret) - return ret; - } - - if (level) { -- struct disk_accounting_pos acc_btree_key = { -- .type = BCH_DISK_ACCOUNTING_btree, -- .btree.id = btree_id, -- }; -- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); -+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); - if (ret) - return ret; - } else { - bool insert = !(flags & BTREE_TRIGGER_overwrite); -- struct disk_accounting_pos acc_inum_key = { -- .type = BCH_DISK_ACCOUNTING_inum, -- .inum.inum = k.k->p.inode, -- }; -+ - s64 v[3] = { - insert ? 1 : -1, - insert ? k.k->size : -((s64) k.k->size), - *replicas_sectors, - }; -- ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); -+ ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); - if (ret) - return ret; - } -@@ -878,15 +889,15 @@ int bch2_trigger_extent(struct btree_trans *trans, - } - - int need_rebalance_delta = 0; -- s64 need_rebalance_sectors_delta = 0; -+ s64 need_rebalance_sectors_delta[1] = { 0 }; - - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; -- need_rebalance_sectors_delta -= s; -+ need_rebalance_sectors_delta[0] -= s; - - s = bch2_bkey_sectors_need_rebalance(c, new.s_c); - need_rebalance_delta += s != 0; -- need_rebalance_sectors_delta += s; -+ need_rebalance_sectors_delta[0] += s; - - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, -@@ -895,12 +906,9 @@ int bch2_trigger_extent(struct btree_trans *trans, - return ret; - } - -- if (need_rebalance_sectors_delta) { -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_rebalance_work, -- }; -- int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, -- flags & BTREE_TRIGGER_gc); -+ if (need_rebalance_sectors_delta[0]) { -+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, -+ need_rebalance_sectors_delta, rebalance_work); - if (ret) - return ret; - } -@@ -916,17 +924,13 @@ static int __trigger_reservation(struct btree_trans *trans, - enum btree_iter_update_trigger_flags flags) - { - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { -- s64 sectors = k.k->size; -+ s64 sectors[1] = { k.k->size }; - - if (flags & BTREE_TRIGGER_overwrite) -- sectors = -sectors; -- -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_persistent_reserved, -- .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, -- }; -+ sectors[0] = -sectors[0]; - -- return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc); -+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, -+ persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); - } - - return 0; -@@ -957,14 +961,23 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - return PTR_ERR(a); - - if (a->v.data_type && type && a->v.data_type != type) { -- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); -- log_fsck_err(trans, bucket_metadata_type_mismatch, -- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" -- "while marking %s", -- iter.pos.inode, iter.pos.offset, a->v.gen, -- bch2_data_type_str(a->v.data_type), -- bch2_data_type_str(type), -- bch2_data_type_str(type)); -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" -+ "while marking %s\n", -+ iter.pos.inode, iter.pos.offset, a->v.gen, -+ bch2_data_type_str(a->v.data_type), -+ bch2_data_type_str(type), -+ bch2_data_type_str(type)); -+ -+ bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); -+ -+ bch2_run_explicit_recovery_pass(c, &buf, -+ BCH_RECOVERY_PASS_check_allocations, 0); -+ -+ if (print) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); - ret = -BCH_ERR_metadata_bucket_inconsistency; - goto err; - } -@@ -976,7 +989,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } - err: --fsck_err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -1134,10 +1146,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, - int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, - enum btree_iter_update_trigger_flags flags) - { -- for_each_online_member(c, ca) { -+ for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { - int ret = bch2_trans_mark_dev_sb(c, ca, flags); - if (ret) { -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); - return ret; - } - } -@@ -1305,15 +1317,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); - - if (resize) { -- bucket_gens->nbuckets = min(bucket_gens->nbuckets, -- old_bucket_gens->nbuckets); -- bucket_gens->nbuckets_minus_first = -- bucket_gens->nbuckets - bucket_gens->first_bucket; -+ u64 copy = min(bucket_gens->nbuckets, -+ old_bucket_gens->nbuckets); - memcpy(bucket_gens->b, - old_bucket_gens->b, -- bucket_gens->nbuckets); -+ sizeof(bucket_gens->b[0]) * copy); - } - -+ ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch, -+ ca->mi.nbuckets, nbuckets) ?: -+ bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty, -+ ca->mi.nbuckets, nbuckets); -+ - rcu_assign_pointer(ca->bucket_gens, bucket_gens); - bucket_gens = old_bucket_gens; - -@@ -1336,7 +1351,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca) - - int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) - { -- ca->usage = alloc_percpu(struct bch_dev_usage); -+ ca->usage = alloc_percpu(struct bch_dev_usage_full); - if (!ca->usage) - return -BCH_ERR_ENOMEM_usage_init; - -diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h -index a9acdd6c0c86..af1532de4a37 100644 ---- a/fs/bcachefs/buckets.h -+++ b/fs/bcachefs/buckets.h -@@ -39,38 +39,12 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t - for (_b = (_buckets)->b + (_buckets)->first_bucket; \ - _b < (_buckets)->b + (_buckets)->nbuckets; _b++) - --/* -- * Ugly hack alert: -- * -- * We need to cram a spinlock in a single byte, because that's what we have left -- * in struct bucket, and we care about the size of these - during fsck, we need -- * in memory state for every single bucket on every device. -- * -- * We used to do -- * while (xchg(&b->lock, 1) cpu_relax(); -- * but, it turns out not all architectures support xchg on a single byte. -- * -- * So now we use bit_spin_lock(), with fun games since we can't burn a whole -- * ulong for this - we just need to make sure the lock bit always ends up in the -- * first byte. -- */ -- --#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ --#define BUCKET_LOCK_BITNR 0 --#else --#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) --#endif -- --union ulong_byte_assert { -- ulong ulong; -- u8 byte; --}; -- - static inline void bucket_unlock(struct bucket *b) - { - BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - - clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); -+ smp_mb__after_atomic(); - wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); - } - -@@ -167,9 +141,7 @@ static inline int gen_cmp(u8 a, u8 b) - - static inline int gen_after(u8 a, u8 b) - { -- int r = gen_cmp(a, b); -- -- return r > 0 ? r : 0; -+ return max(0, gen_cmp(a, b)); - } - - static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -@@ -201,7 +173,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) - return ret; - } - --void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); -+void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); -+static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) -+{ -+ struct bch_dev_usage_full ret; -+ -+ bch2_dev_usage_full_read_fast(ca, &ret); -+ return ret; -+} -+ -+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); - - static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) - { -@@ -236,7 +217,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca, - enum bch_watermark watermark) - { - return max_t(s64, 0, -- usage.d[BCH_DATA_free].buckets - -+ usage.buckets[BCH_DATA_free]- - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); - } -@@ -246,10 +227,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, - enum bch_watermark watermark) - { - return max_t(s64, 0, -- usage.d[BCH_DATA_free].buckets -- + usage.d[BCH_DATA_cached].buckets -- + usage.d[BCH_DATA_need_gc_gens].buckets -- + usage.d[BCH_DATA_need_discard].buckets -+ usage.buckets[BCH_DATA_free] -+ + usage.buckets[BCH_DATA_cached] -+ + usage.buckets[BCH_DATA_need_gc_gens] -+ + usage.buckets[BCH_DATA_need_discard] - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); - } -@@ -262,11 +243,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, - - /* Filesystem usage: */ - --static inline unsigned dev_usage_u64s(void) --{ -- return sizeof(struct bch_dev_usage) / sizeof(u64); --} -- - struct bch_fs_usage_short - bch2_fs_usage_read_short(struct bch_fs *); - -diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h -index 7174047b8e92..0aed2500ade3 100644 ---- a/fs/bcachefs/buckets_types.h -+++ b/fs/bcachefs/buckets_types.h -@@ -7,6 +7,33 @@ - - #define BUCKET_JOURNAL_SEQ_BITS 16 - -+/* -+ * Ugly hack alert: -+ * -+ * We need to cram a spinlock in a single byte, because that's what we have left -+ * in struct bucket, and we care about the size of these - during fsck, we need -+ * in memory state for every single bucket on every device. -+ * -+ * We used to do -+ * while (xchg(&b->lock, 1) cpu_relax(); -+ * but, it turns out not all architectures support xchg on a single byte. -+ * -+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole -+ * ulong for this - we just need to make sure the lock bit always ends up in the -+ * first byte. -+ */ -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+#define BUCKET_LOCK_BITNR 0 -+#else -+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -+#endif -+ -+union ulong_byte_assert { -+ ulong ulong; -+ u8 byte; -+}; -+ - struct bucket { - u8 lock; - u8 gen_valid:1; -@@ -27,7 +54,12 @@ struct bucket_gens { - u8 b[] __counted_by(nbuckets); - }; - -+/* Only info on bucket countns: */ - struct bch_dev_usage { -+ u64 buckets[BCH_DATA_NR]; -+}; -+ -+struct bch_dev_usage_full { - struct bch_dev_usage_type { - u64 buckets; - u64 sectors; /* _compressed_ sectors: */ -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -index 46e9e32105a9..4066946b26bc 100644 ---- a/fs/bcachefs/chardev.c -+++ b/fs/bcachefs/chardev.c -@@ -11,6 +11,7 @@ - #include "move.h" - #include "recovery_passes.h" - #include "replicas.h" -+#include "sb-counters.h" - #include "super-io.h" - #include "thread_with_file.h" - -@@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg) - struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); - - ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); -- ctx->stats.data_type = U8_MAX; -+ if (ctx->thr.ret == -BCH_ERR_device_offline) -+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; -+ else { -+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; -+ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; -+ } - return 0; - } - -@@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - struct bch_fs *c = ctx->c; - struct bch_ioctl_data_event e = { -- .type = BCH_DATA_EVENT_PROGRESS, -- .p.data_type = ctx->stats.data_type, -- .p.btree_id = ctx->stats.pos.btree, -- .p.pos = ctx->stats.pos.pos, -- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -- .p.sectors_total = bch2_fs_usage_read_short(c).used, -+ .type = BCH_DATA_EVENT_PROGRESS, -+ .ret = ctx->stats.ret, -+ .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.pos.btree, -+ .p.pos = ctx->stats.pos.pos, -+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), -+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), - }; - -+ if (ctx->arg.op == BCH_DATA_OP_scrub) { -+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); -+ if (ca) { -+ struct bch_dev_usage_full u; -+ bch2_dev_usage_full_read_fast(ca, &u); -+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) -+ if (ctx->arg.scrub.data_types & BIT(i)) -+ e.p.sectors_total += u.d[i].sectors; -+ bch2_dev_put(ca); -+ } -+ } else { -+ e.p.sectors_total = bch2_fs_usage_read_short(c).used; -+ } -+ - if (len < sizeof(e)) - return -EINVAL; - -@@ -404,10 +426,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, - arg.replica_entries_bytes = replicas.nr; - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { -- struct disk_accounting_pos k = { -- .type = BCH_DISK_ACCOUNTING_persistent_reserved, -- .persistent_reserved.nr_replicas = i, -- }; -+ struct disk_accounting_pos k; -+ disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i); - - bch2_accounting_mem_read(c, - disk_accounting_pos_to_bpos(&k), -@@ -453,7 +473,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, - struct bch_ioctl_dev_usage __user *user_arg) - { - struct bch_ioctl_dev_usage arg; -- struct bch_dev_usage src; -+ struct bch_dev_usage_full src; - struct bch_dev *ca; - unsigned i; - -@@ -473,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, - if (IS_ERR(ca)) - return PTR_ERR(ca); - -- src = bch2_dev_usage_read(ca); -+ src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; -@@ -494,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, - struct bch_ioctl_dev_usage_v2 __user *user_arg) - { - struct bch_ioctl_dev_usage_v2 arg; -- struct bch_dev_usage src; -+ struct bch_dev_usage_full src; - struct bch_dev *ca; - int ret = 0; - -@@ -514,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, - if (IS_ERR(ca)) - return PTR_ERR(ca); - -- src = bch2_dev_usage_read(ca); -+ src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; -@@ -593,11 +613,13 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, - if (!dev) - return -EINVAL; - -- for_each_online_member(c, ca) -+ rcu_read_lock(); -+ for_each_online_member_rcu(c, ca) - if (ca->dev == dev) { -- percpu_ref_put(&ca->io_ref); -+ rcu_read_unlock(); - return ca->dev_idx; - } -+ rcu_read_unlock(); - - return -BCH_ERR_ENOENT_dev_idx_not_found; - } -@@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) - BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); - case BCH_IOCTL_QUERY_ACCOUNTING: - return bch2_ioctl_query_accounting(c, arg); -+ case BCH_IOCTL_QUERY_COUNTERS: -+ return bch2_ioctl_query_counters(c, arg); - default: - return -ENOTTY; - } -diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c -index 23a383577d4c..ff5ab8ada777 100644 ---- a/fs/bcachefs/checksum.c -+++ b/fs/bcachefs/checksum.c -@@ -7,17 +7,12 @@ - #include "super-io.h" - - #include --#include - #include - #include - #include - #include --#include --#include - #include --#include - #include --#include - #include - - /* -@@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * - } - } - --static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, -- struct nonce nonce, -- struct scatterlist *sg, size_t len) -+static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], -+ const struct bch_key *key, struct nonce nonce) - { -- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); -+ u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; - -- skcipher_request_set_sync_tfm(req, tfm); -- skcipher_request_set_callback(req, 0, NULL, NULL); -- skcipher_request_set_crypt(req, sg, sg, len, nonce.d); -+ BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); -+ memcpy(key_words, key, sizeof(key_words)); -+ le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); - -- int ret = crypto_skcipher_encrypt(req); -- if (ret) -- pr_err("got error %i from crypto_skcipher_encrypt()", ret); -- -- return ret; --} -- --static inline int do_encrypt(struct crypto_sync_skcipher *tfm, -- struct nonce nonce, -- void *buf, size_t len) --{ -- if (!is_vmalloc_addr(buf)) { -- struct scatterlist sg = {}; -- -- sg_mark_end(&sg); -- sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); -- return do_encrypt_sg(tfm, nonce, &sg, len); -- } else { -- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; -- size_t sgl_len = 0; -- int ret; -- -- darray_init(&sgl); -- -- while (len) { -- unsigned offset = offset_in_page(buf); -- struct scatterlist sg = { -- .page_link = (unsigned long) vmalloc_to_page(buf), -- .offset = offset, -- .length = min(len, PAGE_SIZE - offset), -- }; -- -- if (darray_push(&sgl, sg)) { -- sg_mark_end(&darray_last(sgl)); -- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); -- if (ret) -- goto err; -- -- nonce = nonce_add(nonce, sgl_len); -- sgl_len = 0; -- sgl.nr = 0; -- BUG_ON(darray_push(&sgl, sg)); -- } -- -- buf += sg.length; -- len -= sg.length; -- sgl_len += sg.length; -- } -+ BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); -+ chacha_init(state, key_words, (const u8 *)nonce.d); - -- sg_mark_end(&darray_last(sgl)); -- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); --err: -- darray_exit(&sgl); -- return ret; -- } -+ memzero_explicit(key_words, sizeof(key_words)); - } - --int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -- void *buf, size_t len) -+void bch2_chacha20(const struct bch_key *key, struct nonce nonce, -+ void *data, size_t len) - { -- struct crypto_sync_skcipher *chacha20 = -- crypto_alloc_sync_skcipher("chacha20", 0, 0); -- int ret; -- -- ret = PTR_ERR_OR_ZERO(chacha20); -- if (ret) { -- pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); -- return ret; -- } -- -- ret = crypto_skcipher_setkey(&chacha20->base, -- (void *) key, sizeof(*key)); -- if (ret) { -- pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); -- goto err; -- } -+ u32 state[CHACHA_STATE_WORDS]; - -- ret = do_encrypt(chacha20, nonce, buf, len); --err: -- crypto_free_sync_skcipher(chacha20); -- return ret; -+ bch2_chacha20_init(state, key, nonce); -+ chacha20_crypt(state, data, data, len); -+ memzero_explicit(state, sizeof(state)); - } - --static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -- struct nonce nonce) -+static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, -+ struct bch_fs *c, struct nonce nonce) - { -- u8 key[POLY1305_KEY_SIZE]; -- int ret; -+ u8 key[POLY1305_KEY_SIZE] = { 0 }; - - nonce.d[3] ^= BCH_NONCE_POLY; - -- memset(key, 0, sizeof(key)); -- ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); -- if (ret) -- return ret; -- -- desc->tfm = c->poly1305; -- crypto_shash_init(desc); -- crypto_shash_update(desc, key, sizeof(key)); -- return 0; -+ bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); -+ poly1305_init(desc, key); - } - - struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, -@@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { -- SHASH_DESC_ON_STACK(desc, c->poly1305); -+ struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - -- gen_poly_key(c, desc, nonce); -- -- crypto_shash_update(desc, data, len); -- crypto_shash_final(desc, digest); -+ bch2_poly1305_init(&dctx, c, nonce); -+ poly1305_update(&dctx, data, len); -+ poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; -@@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, - if (!bch2_csum_type_is_encryption(type)) - return 0; - -- if (bch2_fs_inconsistent_on(!c->chacha20, -+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; - -- return do_encrypt(c->chacha20, nonce, data, len); -+ bch2_chacha20(&c->chacha20_key, nonce, data, len); -+ return 0; - } - - static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, -@@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { -- SHASH_DESC_ON_STACK(desc, c->poly1305); -+ struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - -- gen_poly_key(c, desc, nonce); -+ bch2_poly1305_init(&dctx, c, nonce); - - #ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - -- crypto_shash_update(desc, p, bv.bv_len); -+ poly1305_update(&dctx, p, bv.bv_len); - kunmap_local(p); - } - #else - __bio_for_each_bvec(bv, bio, *iter, *iter) -- crypto_shash_update(desc, -+ poly1305_update(&dctx, - page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); - #endif -- crypto_shash_final(desc, digest); -+ poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; -@@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, - { - struct bio_vec bv; - struct bvec_iter iter; -- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; -- size_t sgl_len = 0; -+ u32 chacha_state[CHACHA_STATE_WORDS]; - int ret = 0; - -- if (bch2_fs_inconsistent_on(!c->chacha20, -+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; - -- darray_init(&sgl); -+ bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce); - - bio_for_each_segment(bv, bio, iter) { -- struct scatterlist sg = { -- .page_link = (unsigned long) bv.bv_page, -- .offset = bv.bv_offset, -- .length = bv.bv_len, -- }; -- -- if (darray_push(&sgl, sg)) { -- sg_mark_end(&darray_last(sgl)); -- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); -- if (ret) -- goto err; -- -- nonce = nonce_add(nonce, sgl_len); -- sgl_len = 0; -- sgl.nr = 0; -- -- BUG_ON(darray_push(&sgl, sg)); -+ void *p; -+ -+ /* -+ * chacha_crypt() assumes that the length is a multiple of -+ * CHACHA_BLOCK_SIZE on any non-final call. -+ */ -+ if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { -+ bch_err_ratelimited(c, "bio not aligned for encryption"); -+ ret = -EIO; -+ break; - } - -- sgl_len += sg.length; -+ p = bvec_kmap_local(&bv); -+ chacha20_crypt(chacha_state, p, p, bv.bv_len); -+ kunmap_local(p); - } -- -- sg_mark_end(&darray_last(sgl)); -- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); --err: -- darray_exit(&sgl); -+ memzero_explicit(chacha_state, sizeof(chacha_state)); - return ret; - } - -@@ -466,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, - prt_str(&buf, ")"); - WARN_RATELIMIT(1, "%s", buf.buf); - printbuf_exit(&buf); -- return -EIO; -+ return -BCH_ERR_recompute_checksum; - } - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -@@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, - } - - /* decrypt real key: */ -- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -- &sb_key, sizeof(sb_key)); -- if (ret) -- goto err; -+ bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); - - if (bch2_key_is_encrypted(&sb_key)) { - bch_err(c, "incorrect encryption key"); -@@ -668,31 +574,14 @@ int bch2_decrypt_sb_key(struct bch_fs *c, - return ret; - } - --static int bch2_alloc_ciphers(struct bch_fs *c) --{ -- if (c->chacha20) -- return 0; -- -- struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -- int ret = PTR_ERR_OR_ZERO(chacha20); -- if (ret) { -- bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); -- return ret; -- } -- -- struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0); -- ret = PTR_ERR_OR_ZERO(poly1305); -- if (ret) { -- bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); -- crypto_free_sync_skcipher(chacha20); -- return ret; -- } -- -- c->chacha20 = chacha20; -- c->poly1305 = poly1305; -- return 0; --} -+#if 0 - -+/* -+ * This seems to be duplicating code in cmd_remove_passphrase() in -+ * bcachefs-tools, but we might want to switch userspace to use this - and -+ * perhaps add an ioctl for calling this at runtime, so we can take the -+ * passphrase off of a mounted filesystem (which has come up). -+ */ - int bch2_disable_encryption(struct bch_fs *c) - { - struct bch_sb_field_crypt *crypt; -@@ -725,6 +614,10 @@ int bch2_disable_encryption(struct bch_fs *c) - return ret; - } - -+/* -+ * For enabling encryption on an existing filesystem: not hooked up yet, but it -+ * should be -+ */ - int bch2_enable_encryption(struct bch_fs *c, bool keyed) - { - struct bch_encrypted_key key; -@@ -781,48 +674,25 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) - memzero_explicit(&key, sizeof(key)); - return ret; - } -+#endif - - void bch2_fs_encryption_exit(struct bch_fs *c) - { -- if (c->poly1305) -- crypto_free_shash(c->poly1305); -- if (c->chacha20) -- crypto_free_sync_skcipher(c->chacha20); -- if (c->sha256) -- crypto_free_shash(c->sha256); -+ memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key)); - } - - int bch2_fs_encryption_init(struct bch_fs *c) - { - struct bch_sb_field_crypt *crypt; -- struct bch_key key; -- int ret = 0; -- -- c->sha256 = crypto_alloc_shash("sha256", 0, 0); -- ret = PTR_ERR_OR_ZERO(c->sha256); -- if (ret) { -- c->sha256 = NULL; -- bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); -- goto out; -- } -+ int ret; - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) -- goto out; -- -- ret = bch2_alloc_ciphers(c); -- if (ret) -- goto out; -- -- ret = bch2_decrypt_sb_key(c, crypt, &key); -- if (ret) -- goto out; -+ return 0; - -- ret = crypto_skcipher_setkey(&c->chacha20->base, -- (void *) &key.key, sizeof(key.key)); -+ ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key); - if (ret) -- goto out; --out: -- memzero_explicit(&key, sizeof(key)); -- return ret; -+ return ret; -+ c->chacha20_key_set = true; -+ return 0; - } -diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h -index 43b9d71f2f2b..7bd9cf6104ca 100644 ---- a/fs/bcachefs/checksum.h -+++ b/fs/bcachefs/checksum.h -@@ -69,7 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out, - bch2_csum_to_text(out, type, expected); - } - --int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); -+void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); -+ - int bch2_request_key(struct bch_sb *, struct bch_key *); - #ifndef __KERNEL__ - int bch2_revoke_key(struct bch_sb *); -@@ -103,8 +104,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; - int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, - struct bch_key *); - -+#if 0 - int bch2_disable_encryption(struct bch_fs *); - int bch2_enable_encryption(struct bch_fs *, bool); -+#endif - - void bch2_fs_encryption_exit(struct bch_fs *); - int bch2_fs_encryption_init(struct bch_fs *); -@@ -154,7 +157,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c, - if (type >= BCH_CSUM_NR) - return false; - -- if (bch2_csum_type_is_encryption(type) && !c->chacha20) -+ if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set) - return false; - - return true; -diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -index 114bf2f3879f..1bca61d17092 100644 ---- a/fs/bcachefs/compress.c -+++ b/fs/bcachefs/compress.c -@@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc.uncompressed_size << 9; - void *workspace; -- int ret; -+ int ret = 0, ret2; - - enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); - mempool_t *workspace_pool = &c->compress_workspace[opt]; -@@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - else - ret = -BCH_ERR_compression_workspace_not_initialized; - if (ret) -- goto out; -+ goto err; - } - - src_data = bio_map_or_bounce(c, src, READ); -@@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - switch (crc.compression_type) { - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: -- ret = LZ4_decompress_safe_partial(src_data.b, dst_data, -- src_len, dst_len, dst_len); -- if (ret != dst_len) -- goto err; -+ ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, -+ src_len, dst_len, dst_len); -+ if (ret2 != dst_len) -+ ret = -BCH_ERR_decompress_lz4; - break; - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { -@@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, - - zlib_set_workspace(&strm, workspace); - zlib_inflateInit2(&strm, -MAX_WBITS); -- ret = zlib_inflate(&strm, Z_FINISH); -+ ret2 = zlib_inflate(&strm, Z_FINISH); - - mempool_free(workspace, workspace_pool); - -- if (ret != Z_STREAM_END) -- goto err; -+ if (ret2 != Z_STREAM_END) -+ ret = -BCH_ERR_decompress_gzip; - break; - } - case BCH_COMPRESSION_TYPE_zstd: { - ZSTD_DCtx *ctx; - size_t real_src_len = le32_to_cpup(src_data.b); - -- if (real_src_len > src_len - 4) -+ if (real_src_len > src_len - 4) { -+ ret = -BCH_ERR_decompress_zstd_src_len_bad; - goto err; -+ } - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - -- ret = zstd_decompress_dctx(ctx, -+ ret2 = zstd_decompress_dctx(ctx, - dst_data, dst_len, - src_data.b + 4, real_src_len); - - mempool_free(workspace, workspace_pool); - -- if (ret != dst_len) -- goto err; -+ if (ret2 != dst_len) -+ ret = -BCH_ERR_decompress_zstd; - break; - } - default: - BUG(); - } -- ret = 0; -+err: - fsck_err: --out: - bio_unmap_or_unbounce(c, src_data); - return ret; --err: -- ret = -EIO; -- goto out; - } - - int bch2_bio_uncompress_inplace(struct bch_write_op *op, -@@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, - BUG_ON(!bio->bi_vcnt); - BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - -- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || -- crc->compressed_size << 9 > c->opts.encoded_extent_max) { -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error(&buf, op); -- prt_printf(&buf, "error rewriting existing data: extent too big"); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -- return -EIO; -+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { -+ bch2_write_op_error(op, op->pos.offset, -+ "extent too big to decompress (%u > %u)", -+ crc->uncompressed_size << 9, c->opts.encoded_extent_max); -+ return -BCH_ERR_decompress_exceeded_max_encoded_extent; - } - - data = __bounce_alloc(c, dst_len, WRITE); - -- if (__bio_uncompress(c, bio, data.b, *crc)) { -- if (!c->opts.no_data_io) { -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error(&buf, op); -- prt_printf(&buf, "error rewriting existing data: decompression error"); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } -- ret = -EIO; -+ ret = __bio_uncompress(c, bio, data.b, *crc); -+ -+ if (c->opts.no_data_io) -+ ret = 0; -+ -+ if (ret) { -+ bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); - goto err; - } - -@@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, - - if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || - crc.compressed_size << 9 > c->opts.encoded_extent_max) -- return -EIO; -+ return -BCH_ERR_decompress_exceeded_max_encoded_extent; - - dst_data = dst_len == dst_iter.bi_size - ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) -@@ -378,13 +371,14 @@ static int attempt_compress(struct bch_fs *c, - }; - - zlib_set_workspace(&strm, workspace); -- zlib_deflateInit2(&strm, -+ if (zlib_deflateInit2(&strm, - compression.level - ? clamp_t(unsigned, compression.level, - Z_BEST_SPEED, Z_BEST_COMPRESSION) - : Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, -- Z_DEFAULT_STRATEGY); -+ Z_DEFAULT_STRATEGY) != Z_OK) -+ return 0; - - if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) - return 0; -@@ -720,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, - - ret = match_string(bch2_compression_opts, -1, type_str); - if (ret < 0 && err) -- prt_str(err, "invalid compression type"); -+ prt_printf(err, "invalid compression type\n"); - if (ret < 0) - goto err; - -@@ -735,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, - if (!ret && level > 15) - ret = -EINVAL; - if (ret < 0 && err) -- prt_str(err, "invalid compression level"); -+ prt_printf(err, "invalid compression level\n"); - if (ret < 0) - goto err; - -diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c -index 642fbc60ecab..de096ca65b4b 100644 ---- a/fs/bcachefs/data_update.c -+++ b/fs/bcachefs/data_update.c -@@ -20,6 +20,15 @@ - #include "subvolume.h" - #include "trace.h" - -+#include -+ -+static const char * const bch2_data_update_type_strs[] = { -+#define x(t, n, ...) [n] = #t, -+ BCH_DATA_UPDATE_TYPES() -+#undef x -+ NULL -+}; -+ - static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -@@ -33,7 +42,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { -- if (!bch2_dev_tryget(c, ptr->dev)) { -+ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; -@@ -91,7 +100,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc - return true; - } - --static noinline void trace_move_extent_finish2(struct data_update *u, -+static noinline void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) - { -@@ -111,11 +120,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u, - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - -- trace_move_extent_finish(c, buf.buf); -+ trace_io_move_finish(c, buf.buf); - printbuf_exit(&buf); - } - --static void trace_move_extent_fail2(struct data_update *m, -+static void trace_io_move_fail2(struct data_update *m, - struct bkey_s_c new, - struct bkey_s_c wrote, - struct bkey_i *insert, -@@ -126,7 +135,7 @@ static void trace_move_extent_fail2(struct data_update *m, - struct printbuf buf = PRINTBUF; - unsigned rewrites_found = 0; - -- if (!trace_move_extent_fail_enabled()) -+ if (!trace_io_move_fail_enabled()) - return; - - prt_str(&buf, msg); -@@ -166,7 +175,7 @@ static void trace_move_extent_fail2(struct data_update *m, - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - } - -- trace_move_extent_fail(c, buf.buf); -+ trace_io_move_fail(c, buf.buf); - printbuf_exit(&buf); - } - -@@ -179,6 +188,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - container_of(op, struct data_update, op); - struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; -+ struct printbuf journal_msg = PRINTBUF; - int ret = 0; - - bch2_bkey_buf_init(&_new); -@@ -206,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - - bch2_trans_begin(trans); - -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -214,7 +224,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - new = bkey_i_to_extent(bch2_keylist_front(keys)); - - if (!bch2_extents_match(k, old)) { -- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), -+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); - goto nowork; - } -@@ -254,7 +264,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - if (m->data_opts.rewrite_ptrs && - !rewrites_found && - bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { -- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); -+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); - goto nowork; - } - -@@ -271,7 +281,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - } - - if (!bkey_val_u64s(&new->k)) { -- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); -+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); - goto nowork; - } - -@@ -336,8 +346,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - .btree = m->btree_id, - .flags = BCH_VALIDATE_commit, - }); -- if (invalid) { -+ if (unlikely(invalid)) { - struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); -@@ -348,14 +359,30 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - -- bch2_print_string_as_lines(KERN_ERR, buf.buf); -+ bch2_fs_emergency_read_only2(c, &buf); -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - -- bch2_fatal_error(c); -- ret = -EIO; -+ ret = -BCH_ERR_invalid_bkey; - goto out; - } - -+ printbuf_reset(&journal_msg); -+ prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); -+ -+ ret = bch2_trans_log_msg(trans, &journal_msg) ?: -+ bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: -+ bch2_insert_snapshot_whiteouts(trans, m->btree_id, -+ k.k->p, bkey_start_pos(&insert->k)) ?: -+ bch2_insert_snapshot_whiteouts(trans, m->btree_id, -+ k.k->p, insert->k.p) ?: -+ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: -+ bch2_trans_update(trans, &iter, insert, -+ BTREE_UPDATE_internal_snapshot_node); -+ if (ret) -+ goto err; -+ - if (trace_data_update_enabled()) { - struct printbuf buf = PRINTBUF; - -@@ -370,25 +397,38 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - printbuf_exit(&buf); - } - -- ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, -- k.k->p, bkey_start_pos(&insert->k)) ?: -- bch2_insert_snapshot_whiteouts(trans, m->btree_id, -- k.k->p, insert->k.p) ?: -- bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: -- bch2_trans_update(trans, &iter, insert, -- BTREE_UPDATE_internal_snapshot_node) ?: -- bch2_trans_commit(trans, &op->res, -+ if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > -+ bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); -+ -+ prt_str(&buf, "\nold: "); -+ bch2_bkey_val_to_text(&buf, c, old); -+ prt_str(&buf, "\nk: "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ prt_str(&buf, "\nnew: "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); -+ -+ trace_io_move_created_rebalance(c, buf.buf); -+ printbuf_exit(&buf); -+ -+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); -+ } -+ -+ ret = bch2_trans_commit(trans, &op->res, - NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - m->data_opts.btree_insert_flags); -- if (!ret) { -- bch2_btree_iter_set_pos(&iter, next_pos); -+ if (ret) -+ goto err; - -- this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); -- if (trace_move_extent_finish_enabled()) -- trace_move_extent_finish2(m, &new->k_i, insert); -- } -+ bch2_btree_iter_set_pos(trans, &iter, next_pos); -+ -+ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); -+ if (trace_io_move_finish_enabled()) -+ trace_io_move_finish2(m, &new->k_i, insert); - err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; -@@ -409,12 +449,13 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - &m->stats->sectors_raced); - } - -- count_event(c, move_extent_fail); -+ count_event(c, io_move_fail); - -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - goto next; - } - out: -+ printbuf_exit(&journal_msg); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); -@@ -427,14 +468,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) - return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); - } - --void bch2_data_update_read_done(struct data_update *m, -- struct bch_extent_crc_unpacked crc) -+void bch2_data_update_read_done(struct data_update *m) - { -+ m->read_done = true; -+ - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - -- m->op.crc = crc; -- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; -+ m->op.crc = m->rbio.pick.crc; -+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; -+ -+ this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); - - closure_call(&m->op.cl, bch2_write, NULL, NULL); - } -@@ -444,38 +488,41 @@ void bch2_data_update_exit(struct data_update *update) - struct bch_fs *c = update->op.c; - struct bkey_s_c k = bkey_i_to_s_c(update->k.k); - -+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); -+ kfree(update->bvecs); -+ update->bvecs = NULL; -+ - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); - bkey_put_dev_refs(c, k); -- bch2_bkey_buf_exit(&update->k, c); - bch2_disk_reservation_put(c, &update->op.res); -- bch2_bio_free_pages_pool(c, &update->op.wbio.bio); -+ bch2_bkey_buf_exit(&update->k, c); - } - --static void bch2_update_unwritten_extent(struct btree_trans *trans, -- struct data_update *update) -+static int bch2_update_unwritten_extent(struct btree_trans *trans, -+ struct data_update *update) - { - struct bch_fs *c = update->op.c; -- struct bio *bio = &update->op.wbio.bio; - struct bkey_i_extent *e; - struct write_point *wp; - struct closure cl; - struct btree_iter iter; - struct bkey_s_c k; -- int ret; -+ int ret = 0; - - closure_init_stack(&cl); - bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - -- while (bio_sectors(bio)) { -- unsigned sectors = bio_sectors(bio); -+ while (bpos_lt(update->op.pos, update->k.k->k.p)) { -+ unsigned sectors = update->k.k->k.p.offset - -+ update->op.pos.offset; - - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_slots); - ret = lockrestart_do(trans, ({ -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - bkey_err(k); - })); - bch2_trans_iter_exit(trans, &iter); -@@ -504,7 +551,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, - bch_err_fn_ratelimited(c, ret); - - if (ret) -- return; -+ break; - - sectors = min(sectors, wp->sectors_free); - -@@ -514,7 +561,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - -- bio_advance(bio, sectors << 9); - update->op.pos.offset += sectors; - - extent_for_each_ptr(extent_i_to_s(e), ptr) -@@ -533,13 +579,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, - bch2_trans_unlock(trans); - closure_sync(&cl); - } -+ -+ return ret; - } - - void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- printbuf_tabstop_push(out, 20); -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 20); - - prt_str_indented(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); -@@ -563,10 +612,17 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - - prt_str_indented(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); -+ prt_newline(out); -+ -+ prt_str_indented(out, "scrub:\t"); -+ prt_u64(out, data_opts->scrub); - } - - void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) - { -+ prt_str(out, bch2_data_update_type_strs[m->type]); -+ prt_newline(out); -+ - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_newline(out); - -@@ -574,6 +630,25 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - } - -+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) -+{ -+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); -+ -+ if (!m->read_done) { -+ prt_printf(out, "read:\n"); -+ printbuf_indent_add(out, 2); -+ bch2_read_bio_to_text(out, &m->rbio); -+ } else { -+ prt_printf(out, "write:\n"); -+ printbuf_indent_add(out, 2); -+ bch2_write_op_to_text(out, &m->op); -+ } -+ printbuf_indent_sub(out, 4); -+} -+ - int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, -@@ -617,12 +692,87 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - } - -+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, -+ struct bch_io_opts *io_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ /* write path might have to decompress data: */ -+ unsigned buf_bytes = 0; -+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) -+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); -+ -+ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); -+ -+ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); -+ if (!m->bvecs) -+ return -ENOMEM; -+ -+ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); -+ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); -+ -+ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { -+ kfree(m->bvecs); -+ m->bvecs = NULL; -+ return -ENOMEM; -+ } -+ -+ rbio_init(&m->rbio.bio, c, *io_opts, NULL); -+ m->rbio.data_update = true; -+ m->rbio.bio.bi_iter.bi_size = buf_bytes; -+ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); -+ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); -+ return 0; -+} -+ -+static int can_write_extent(struct bch_fs *c, struct data_update *m) -+{ -+ if ((m->op.flags & BCH_WRITE_alloc_nowait) && -+ unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) -+ return -BCH_ERR_data_update_done_would_block; -+ -+ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs -+ ? m->op.target -+ : 0; -+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); -+ -+ darray_for_each(m->op.devs_have, i) -+ __clear_bit(*i, devs.d); -+ -+ rcu_read_lock(); -+ unsigned nr_replicas = 0, i; -+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); -+ if (!ca) -+ continue; -+ -+ struct bch_dev_usage usage; -+ bch2_dev_usage_read_fast(ca, &usage); -+ -+ if (!dev_buckets_free(ca, usage, m->op.watermark)) -+ continue; -+ -+ nr_replicas += ca->mi.durability; -+ if (nr_replicas >= m->op.nr_replicas) -+ break; -+ } -+ rcu_read_unlock(); -+ -+ if (!nr_replicas) -+ return -BCH_ERR_data_update_done_no_rw_devs; -+ if (nr_replicas < m->op.nr_replicas) -+ return -BCH_ERR_insufficient_devices; -+ return 0; -+} -+ - int bch2_data_update_init(struct btree_trans *trans, - struct btree_iter *iter, - struct moving_context *ctxt, - struct data_update *m, - struct write_point_specifier wp, -- struct bch_io_opts io_opts, -+ struct bch_io_opts *io_opts, - struct data_update_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -@@ -640,36 +790,30 @@ int bch2_data_update_init(struct btree_trans *trans, - * snapshots table - just skip it, we can move it later. - */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) -- return -BCH_ERR_data_update_done; -- -- if (!bkey_get_dev_refs(c, k)) -- return -BCH_ERR_data_update_done; -- -- if (c->opts.nocow_enabled && -- !bkey_nocow_lock(c, ctxt, k)) { -- bkey_put_dev_refs(c, k); -- return -BCH_ERR_nocow_lock_blocked; -- } -+ return -BCH_ERR_data_update_done_no_snapshot; - - bch2_bkey_buf_init(&m->k); - bch2_bkey_buf_reassemble(&m->k, c, k); -+ m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc -+ ? BCH_DATA_UPDATE_copygc -+ : BCH_DATA_UPDATE_rebalance; - m->btree_id = btree_id; - m->data_opts = data_opts; - m->ctxt = ctxt; - m->stats = ctxt ? ctxt->stats : NULL; - -- bch2_write_op_init(&m->op, c, io_opts); -+ bch2_write_op_init(&m->op, c, *io_opts); - m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->bversion; - m->op.target = data_opts.target; - m->op.write_point = wp; - m->op.nr_replicas = 0; -- m->op.flags |= BCH_WRITE_PAGES_STABLE| -- BCH_WRITE_PAGES_OWNED| -- BCH_WRITE_DATA_ENCODED| -- BCH_WRITE_MOVE| -+ m->op.flags |= BCH_WRITE_pages_stable| -+ BCH_WRITE_pages_owned| -+ BCH_WRITE_data_encoded| -+ BCH_WRITE_move| - m->data_opts.write_flags; -- m->op.compression_opt = io_opts.background_compression; -+ m->op.compression_opt = io_opts->background_compression; - m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - - unsigned durability_have = 0, durability_removing = 0; -@@ -707,7 +851,7 @@ int bch2_data_update_init(struct btree_trans *trans, - ptr_bit <<= 1; - } - -- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); -+ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); - - /* - * If current extent durability is less than io_opts.data_replicas, -@@ -740,28 +884,70 @@ int bch2_data_update_init(struct btree_trans *trans, - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) -- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); -- goto out; -+ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); -+ if (!ret) -+ ret = -BCH_ERR_data_update_done_no_writes_needed; -+ goto out_bkey_buf_exit; - } - -+ /* -+ * Check if the allocation will succeed, to avoid getting an error later -+ * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless -+ * read: -+ * -+ * This guards against -+ * - BCH_WRITE_alloc_nowait allocations failing (promotes) -+ * - Destination target full -+ * - Device(s) in destination target offline -+ * - Insufficient durability available in destination target -+ * (i.e. trying to move a durability=2 replica to a target with a -+ * single durability=2 device) -+ */ -+ ret = can_write_extent(c, m); -+ if (ret) -+ goto out_bkey_buf_exit; -+ - if (reserve_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, - m->data_opts.extra_replicas - ? 0 - : BCH_DISK_RESERVATION_NOFAIL); - if (ret) -- goto out; -+ goto out_bkey_buf_exit; -+ } -+ -+ if (!bkey_get_dev_refs(c, k)) { -+ ret = -BCH_ERR_data_update_done_no_dev_refs; -+ goto out_put_disk_res; -+ } -+ -+ if (c->opts.nocow_enabled && -+ !bkey_nocow_lock(c, ctxt, k)) { -+ ret = -BCH_ERR_nocow_lock_blocked; -+ goto out_put_dev_refs; - } - - if (bkey_extent_is_unwritten(k)) { -- bch2_update_unwritten_extent(trans, m); -- goto out; -+ ret = bch2_update_unwritten_extent(trans, m) ?: -+ -BCH_ERR_data_update_done_unwritten; -+ goto out_nocow_unlock; - } - -+ ret = bch2_data_update_bios_init(m, c, io_opts); -+ if (ret) -+ goto out_nocow_unlock; -+ - return 0; --out: -- bch2_data_update_exit(m); -- return ret ?: -BCH_ERR_data_update_done; -+out_nocow_unlock: -+ if (c->opts.nocow_enabled) -+ bkey_nocow_unlock(c, k); -+out_put_dev_refs: -+ bkey_put_dev_refs(c, k); -+out_put_disk_res: -+ bch2_disk_reservation_put(c, &m->op.res); -+out_bkey_buf_exit: -+ bch2_bkey_buf_exit(&m->k, c); -+ return ret; - } - - void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h -index e4b50723428e..5e14d13568de 100644 ---- a/fs/bcachefs/data_update.h -+++ b/fs/bcachefs/data_update.h -@@ -4,6 +4,7 @@ - #define _BCACHEFS_DATA_UPDATE_H - - #include "bkey_buf.h" -+#include "io_read.h" - #include "io_write_types.h" - - struct moving_context; -@@ -15,27 +16,61 @@ struct data_update_opts { - u8 extra_replicas; - unsigned btree_insert_flags; - unsigned write_flags; -+ -+ int read_dev; -+ bool scrub; - }; - - void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_opts *, struct data_update_opts *); - -+#define BCH_DATA_UPDATE_TYPES() \ -+ x(copygc, 0) \ -+ x(rebalance, 1) \ -+ x(promote, 2) -+ -+enum bch_data_update_types { -+#define x(n, id) BCH_DATA_UPDATE_##n = id, -+ BCH_DATA_UPDATE_TYPES() -+#undef x -+}; -+ - struct data_update { -+ enum bch_data_update_types type; - /* extent being updated: */ -+ bool read_done; - enum btree_id btree_id; - struct bkey_buf k; - struct data_update_opts data_opts; - struct moving_context *ctxt; - struct bch_move_stats *stats; -+ -+ struct bch_read_bio rbio; - struct bch_write_op op; -+ struct bio_vec *bvecs; -+}; -+ -+struct promote_op { -+ struct rcu_head rcu; -+ u64 start_time; -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ unsigned list_idx; -+#endif -+ -+ struct rhash_head hash; -+ struct bpos pos; -+ -+ struct work_struct work; -+ struct data_update write; -+ struct bio_vec bi_inline_vecs[]; /* must be last */ - }; - - void bch2_data_update_to_text(struct printbuf *, struct data_update *); -+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); - - int bch2_data_update_index_update(struct bch_write_op *); - --void bch2_data_update_read_done(struct data_update *, -- struct bch_extent_crc_unpacked); -+void bch2_data_update_read_done(struct data_update *); - - int bch2_extent_drop_ptrs(struct btree_trans *, - struct btree_iter *, -@@ -43,12 +78,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, - struct bch_io_opts *, - struct data_update_opts *); - -+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, -+ struct bch_io_opts *); -+ - void bch2_data_update_exit(struct data_update *); - int bch2_data_update_init(struct btree_trans *, struct btree_iter *, - struct moving_context *, - struct data_update *, - struct write_point_specifier, -- struct bch_io_opts, struct data_update_opts, -+ struct bch_io_opts *, struct data_update_opts, - enum btree_id, struct bkey_s_c); - void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); - -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -index 55333e82d1fe..4fa70634c90e 100644 ---- a/fs/bcachefs/debug.c -+++ b/fs/bcachefs/debug.c -@@ -7,6 +7,8 @@ - */ - - #include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "async_objs.h" - #include "bkey_methods.h" - #include "btree_cache.h" - #include "btree_io.h" -@@ -15,6 +17,7 @@ - #include "btree_update.h" - #include "btree_update_interior.h" - #include "buckets.h" -+#include "data_update.h" - #include "debug.h" - #include "error.h" - #include "extents.h" -@@ -39,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, - struct btree_node *n_sorted = c->verify_data->data; - struct bset *sorted, *inmemory = &b->data->keys; - struct bio *bio; -- bool failed = false, saw_error = false; -+ bool failed = false; - -- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, -+ BCH_DEV_READ_REF_btree_verify_replicas); - if (!ca) - return false; - -@@ -56,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, - submit_bio_wait(bio); - - bio_put(bio); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_btree_verify_replicas); - - memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); - - v->written = 0; -- if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) -+ if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) - return false; - - n_sorted = c->verify_data->data; -@@ -190,12 +195,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, - unsigned offset = 0; - int ret; - -- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { -+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { - prt_printf(out, "error getting device to read from: invalid device\n"); - return; - } - -- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); -+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, -+ BCH_DEV_READ_REF_btree_node_ondisk_to_text); - if (!ca) { - prt_printf(out, "error getting device to read from: not online\n"); - return; -@@ -296,28 +302,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, - if (bio) - bio_put(bio); - kvfree(n_ondisk); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_btree_node_ondisk_to_text); - } - - #ifdef CONFIG_DEBUG_FS - --/* XXX: bch_fs refcounting */ -- --struct dump_iter { -- struct bch_fs *c; -- enum btree_id id; -- struct bpos from; -- struct bpos prev_node; -- u64 iter; -- -- struct printbuf buf; -- -- char __user *ubuf; /* destination user buffer */ -- size_t size; /* size of requested read */ -- ssize_t ret; /* bytes read so far */ --}; -- --static ssize_t flush_buf(struct dump_iter *i) -+ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) - { - if (i->buf.pos) { - size_t bytes = min_t(size_t, i->buf.pos, i->size); -@@ -329,6 +320,11 @@ static ssize_t flush_buf(struct dump_iter *i) - i->buf.pos -= copied; - memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - -+ if (i->buf.last_newline >= copied) -+ i->buf.last_newline -= copied; -+ if (i->buf.last_field >= copied) -+ i->buf.last_field -= copied; -+ - if (copied != bytes) - return -EFAULT; - } -@@ -355,7 +351,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) - return 0; - } - --static int bch2_dump_release(struct inode *inode, struct file *file) -+int bch2_dump_release(struct inode *inode, struct file *file) - { - struct dump_iter *i = file->private_data; - -@@ -373,7 +369,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, - i->size = size; - i->ret = 0; - -- return flush_buf(i) ?: -+ return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| -@@ -382,7 +378,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, - prt_newline(&i->buf); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); -- flush_buf(i); -+ bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; - } -@@ -403,7 +399,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, - i->size = size; - i->ret = 0; - -- ssize_t ret = flush_buf(i); -+ ssize_t ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - -@@ -417,7 +413,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, - ? bpos_successor(b->key.k.p) - : b->key.k.p; - -- drop_locks_do(trans, flush_buf(i)); -+ drop_locks_do(trans, bch2_debugfs_flush_buf(i)); - }))) ?: i->ret; - } - -@@ -437,7 +433,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, - i->size = size; - i->ret = 0; - -- return flush_buf(i) ?: -+ return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| -@@ -455,7 +451,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, - bch2_bfloat_to_text(&i->buf, l->b, _k); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); -- flush_buf(i); -+ bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; - } -@@ -516,7 +512,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, - struct rhash_head *pos; - struct btree *b; - -- ret = flush_buf(i); -+ ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - -@@ -539,7 +535,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, - ret = -ENOMEM; - - if (!ret) -- ret = flush_buf(i); -+ ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; - } -@@ -613,7 +609,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, - - closure_put(&trans->ref); - -- ret = flush_buf(i); -+ ret = bch2_debugfs_flush_buf(i); - if (ret) - goto unlocked; - -@@ -626,7 +622,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, - ret = -ENOMEM; - - if (!ret) -- ret = flush_buf(i); -+ ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; - } -@@ -651,7 +647,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, - i->ret = 0; - - while (1) { -- err = flush_buf(i); -+ err = bch2_debugfs_flush_buf(i); - if (err) - return err; - -@@ -694,7 +690,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, - i->iter++; - } - -- err = flush_buf(i); -+ err = bch2_debugfs_flush_buf(i); - if (err) - return err; - -@@ -752,7 +748,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, - while (1) { - struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - -- err = flush_buf(i); -+ err = bch2_debugfs_flush_buf(i); - if (err) - return err; - -@@ -769,6 +765,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, - mutex_lock(&s->lock); - - prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); -+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -+ printbuf_indent_add(&i->buf, 2); -+ bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); -+ printbuf_indent_sub(&i->buf, 2); -+#endif -+ - prt_printf(&i->buf, "Transaction duration:\n"); - - printbuf_indent_add(&i->buf, 2); -@@ -844,8 +846,11 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) - seqmutex_unlock(&c->btree_trans_lock); - } - --static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, -- size_t size, loff_t *ppos) -+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); -+ -+static ssize_t bch2_simple_print(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos, -+ fs_to_text_fn fn) - { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; -@@ -856,7 +861,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - i->ret = 0; - - if (!i->iter) { -- btree_deadlock_to_text(&i->buf, c); -+ fn(&i->buf, c); - i->iter++; - } - -@@ -864,11 +869,17 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - ret = -ENOMEM; - - if (!ret) -- ret = flush_buf(i); -+ ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; - } - -+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); -+} -+ - static const struct file_operations btree_deadlock_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, -@@ -876,6 +887,19 @@ static const struct file_operations btree_deadlock_ops = { - .read = bch2_btree_deadlock_read, - }; - -+static ssize_t bch2_write_points_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); -+} -+ -+static const struct file_operations write_points_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_write_points_read, -+}; -+ - void bch2_fs_debug_exit(struct bch_fs *c) - { - if (!IS_ERR_OR_NULL(c->fs_debug_dir)) -@@ -904,7 +928,11 @@ void bch2_fs_debug_init(struct bch_fs *c) - if (IS_ERR_OR_NULL(bch_debug)) - return; - -- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ if (c->sb.multi_device) -+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ else -+ strscpy(name, c->name, sizeof(name)); -+ - c->fs_debug_dir = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->fs_debug_dir)) - return; -@@ -927,6 +955,11 @@ void bch2_fs_debug_init(struct bch_fs *c) - debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, - c->btree_debug, &btree_deadlock_ops); - -+ debugfs_create_file("write_points", 0400, c->fs_debug_dir, -+ c->btree_debug, &write_points_ops); -+ -+ bch2_fs_async_obj_debugfs_init(c); -+ - c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); - if (IS_ERR_OR_NULL(c->btree_debug_dir)) - return; -diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h -index 2c37143b5fd1..d88b1194b8ac 100644 ---- a/fs/bcachefs/debug.h -+++ b/fs/bcachefs/debug.h -@@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, - - static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) - { -- if (bch2_verify_btree_ondisk) -+ if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - __bch2_btree_verify(c, b); - } - - #ifdef CONFIG_DEBUG_FS -+struct dump_iter { -+ struct bch_fs *c; -+ struct async_obj_list *list; -+ enum btree_id id; -+ struct bpos from; -+ struct bpos prev_node; -+ u64 iter; -+ -+ struct printbuf buf; -+ -+ char __user *ubuf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+ssize_t bch2_debugfs_flush_buf(struct dump_iter *); -+int bch2_dump_release(struct inode *, struct file *); -+ - void bch2_fs_debug_exit(struct bch_fs *); - void bch2_fs_debug_init(struct bch_fs *); - #else -diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c -index 600eee936f13..d198001838f3 100644 ---- a/fs/bcachefs/dirent.c -+++ b/fs/bcachefs/dirent.c -@@ -13,6 +13,28 @@ - - #include - -+int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, -+ const struct qstr *str, struct qstr *out_cf) -+{ -+ *out_cf = (struct qstr) QSTR_INIT(NULL, 0); -+ -+#ifdef CONFIG_UNICODE -+ unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); -+ int ret = PTR_ERR_OR_ZERO(buf); -+ if (ret) -+ return ret; -+ -+ ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); -+ if (ret <= 0) -+ return ret; -+ -+ *out_cf = (struct qstr) QSTR_INIT(buf, ret); -+ return 0; -+#else -+ return -EOPNOTSUPP; -+#endif -+} -+ - static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) - { - if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) -@@ -28,13 +50,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) - #endif - - return bkey_bytes - -- offsetof(struct bch_dirent, d_name) - -+ (d.v->d_casefold -+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names) -+ : offsetof(struct bch_dirent, d_name)) - - trailing_nuls; - } - - struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) - { -- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); -+ if (d.v->d_casefold) { -+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); -+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); -+ } else { -+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); -+ } -+} -+ -+static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) -+{ -+ if (d.v->d_casefold) { -+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); -+ unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); -+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); -+ } else { -+ return (struct qstr) QSTR_INIT(NULL, 0); -+ } -+} -+ -+static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) -+{ -+ return d.v->d_casefold -+ ? bch2_dirent_get_casefold_name(d) -+ : bch2_dirent_get_name(d); - } - - static u64 bch2_dirent_hash(const struct bch_hash_info *info, -@@ -57,7 +104,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) - static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) - { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -- struct qstr name = bch2_dirent_get_name(d); -+ struct qstr name = bch2_dirent_get_lookup_name(d); - - return bch2_dirent_hash(info, &name); - } -@@ -65,7 +112,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) - static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) - { - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -- const struct qstr l_name = bch2_dirent_get_name(l); -+ const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr *r_name = _r; - - return !qstr_eq(l_name, *r_name); -@@ -75,8 +122,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) - { - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -- const struct qstr l_name = bch2_dirent_get_name(l); -- const struct qstr r_name = bch2_dirent_get_name(r); -+ const struct qstr l_name = bch2_dirent_get_lookup_name(l); -+ const struct qstr r_name = bch2_dirent_get_lookup_name(r); - - return !qstr_eq(l_name, r_name); - } -@@ -104,17 +151,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) - { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ unsigned name_block_len = bch2_dirent_name_bytes(d); - struct qstr d_name = bch2_dirent_get_name(d); -+ struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); - int ret = 0; - - bkey_fsck_err_on(!d_name.len, - c, dirent_empty_name, - "empty name"); - -- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), -+ bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, - c, dirent_val_too_big, -- "value too big (%zu > %u)", -- bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); -+ "dirent names exceed bkey size (%d + %d > %d)", -+ d_name.len, d_cf_name.len, name_block_len); - - /* - * Check new keys don't exceed the max length -@@ -142,6 +191,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - le64_to_cpu(d.v->d_inum) == d.k->p.inode, - c, dirent_to_itself, - "dirent points to own directory"); -+ -+ if (d.v->d_casefold) { -+ bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && -+ d_cf_name.len > BCH_NAME_MAX, -+ c, dirent_cf_name_too_big, -+ "dirent w/ cf name too big (%u > %u)", -+ d_cf_name.len, BCH_NAME_MAX); -+ -+ bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), -+ c, dirent_stray_data_after_cf_name, -+ "dirent has stray data after cf name's NUL"); -+ } - fsck_err: - return ret; - } -@@ -151,27 +212,33 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr d_name = bch2_dirent_get_name(d); - -- prt_printf(out, "%.*s -> ", d_name.len, d_name.name); -+ prt_printf(out, "%.*s", d_name.len, d_name.name); -+ -+ if (d.v->d_casefold) { -+ struct qstr d_name = bch2_dirent_get_lookup_name(d); -+ prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); -+ } -+ -+ prt_str(out, " ->"); - - if (d.v->d_type != DT_SUBVOL) -- prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); -+ prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); - else -- prt_printf(out, "%u -> %u", -+ prt_printf(out, " %u -> %u", - le32_to_cpu(d.v->d_parent_subvol), - le32_to_cpu(d.v->d_child_subvol)); - - prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); - } - --static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -- subvol_inum dir, u8 type, -- const struct qstr *name, u64 dst) -+static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, -+ subvol_inum dir, -+ u8 type, -+ int name_len, int cf_name_len, -+ u64 dst) - { - struct bkey_i_dirent *dirent; -- unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); -- -- if (name->len > BCH_NAME_MAX) -- return ERR_PTR(-ENAMETOOLONG); -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); - - BUG_ON(u64s > U8_MAX); - -@@ -190,14 +257,75 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - } - - dirent->v.d_type = type; -+ dirent->v.d_unused = 0; -+ dirent->v.d_casefold = cf_name_len ? 1 : 0; -+ -+ return dirent; -+} -+ -+static void dirent_init_regular_name(struct bkey_i_dirent *dirent, -+ const struct qstr *name) -+{ -+ EBUG_ON(dirent->v.d_casefold); -+ -+ memcpy(&dirent->v.d_name[0], name->name, name->len); -+ memset(&dirent->v.d_name[name->len], 0, -+ bkey_val_bytes(&dirent->k) - -+ offsetof(struct bch_dirent, d_name) - -+ name->len); -+} -+ -+static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, -+ const struct qstr *name, -+ const struct qstr *cf_name) -+{ -+ EBUG_ON(!dirent->v.d_casefold); -+ EBUG_ON(!cf_name->len); -+ -+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); -+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); -+ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); -+ memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); -+ memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, -+ bkey_val_bytes(&dirent->k) - -+ offsetof(struct bch_dirent, d_cf_name_block.d_names) - -+ name->len + cf_name->len); -+ -+ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); -+} -+ -+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -+ const struct bch_hash_info *hash_info, -+ subvol_inum dir, -+ u8 type, -+ const struct qstr *name, -+ const struct qstr *cf_name, -+ u64 dst) -+{ -+ struct bkey_i_dirent *dirent; -+ struct qstr _cf_name; - -- memcpy(dirent->v.d_name, name->name, name->len); -- memset(dirent->v.d_name + name->len, 0, -- bkey_val_bytes(&dirent->k) - -- offsetof(struct bch_dirent, d_name) - -- name->len); -+ if (name->len > BCH_NAME_MAX) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ if (hash_info->cf_encoding && !cf_name) { -+ int ret = bch2_casefold(trans, hash_info, name, &_cf_name); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ cf_name = &_cf_name; -+ } - -- EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); -+ dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst); -+ if (IS_ERR(dirent)) -+ return dirent; -+ -+ if (cf_name) -+ dirent_init_casefolded_name(dirent, name, cf_name); -+ else -+ dirent_init_regular_name(dirent, name); -+ -+ EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - - return dirent; - } -@@ -213,7 +341,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, - struct bkey_i_dirent *dirent; - int ret; - -- dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); -+ dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; -@@ -222,8 +350,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, - dirent->k.p.snapshot = snapshot; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, -- dir_inum, snapshot, &dirent->k_i, -- flags|BTREE_UPDATE_internal_snapshot_node); -+ dir_inum, snapshot, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -@@ -238,7 +365,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, - struct bkey_i_dirent *dirent; - int ret; - -- dirent = dirent_create_key(trans, dir, type, name, dst_inum); -+ dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; -@@ -275,14 +402,15 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - } - - int bch2_dirent_rename(struct btree_trans *trans, -- subvol_inum src_dir, struct bch_hash_info *src_hash, -- subvol_inum dst_dir, struct bch_hash_info *dst_hash, -+ subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, -+ subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, - const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, - const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) - { -- struct btree_iter src_iter = { NULL }; -- struct btree_iter dst_iter = { NULL }; -+ struct qstr src_name_lookup, dst_name_lookup; -+ struct btree_iter src_iter = {}; -+ struct btree_iter dst_iter = {}; - struct bkey_s_c old_src, old_dst = bkey_s_c_null; - struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = -@@ -295,8 +423,11 @@ int bch2_dirent_rename(struct btree_trans *trans, - memset(dst_inum, 0, sizeof(*dst_inum)); - - /* Lookup src: */ -+ ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); -+ if (ret) -+ goto out; - old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, -- src_hash, src_dir, src_name, -+ src_hash, src_dir, &src_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_src); - if (ret) -@@ -308,6 +439,9 @@ int bch2_dirent_rename(struct btree_trans *trans, - goto out; - - /* Lookup dst: */ -+ ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); -+ if (ret) -+ goto out; - if (mode == BCH_RENAME) { - /* - * Note that we're _not_ checking if the target already exists - -@@ -315,12 +449,12 @@ int bch2_dirent_rename(struct btree_trans *trans, - * correctness: - */ - ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, -- dst_hash, dst_dir, dst_name); -+ dst_hash, dst_dir, &dst_name_lookup); - if (ret) - goto out; - } else { - old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, -- dst_hash, dst_dir, dst_name, -+ dst_hash, dst_dir, &dst_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_dst); - if (ret) -@@ -336,7 +470,8 @@ int bch2_dirent_rename(struct btree_trans *trans, - *src_offset = dst_iter.pos.offset; - - /* Create new dst key: */ -- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); -+ new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, -+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_dst); - if (ret) - goto out; -@@ -346,7 +481,8 @@ int bch2_dirent_rename(struct btree_trans *trans, - - /* Create new src key: */ - if (mode == BCH_RENAME_EXCHANGE) { -- new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); -+ new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, -+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; -@@ -406,6 +542,14 @@ int bch2_dirent_rename(struct btree_trans *trans, - new_src->v.d_type == DT_SUBVOL) - new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - -+ if (old_dst.k) -+ *dst_dir_i_size -= bkey_bytes(old_dst.k); -+ *src_dir_i_size -= bkey_bytes(old_src.k); -+ -+ if (mode == BCH_RENAME_EXCHANGE) -+ *src_dir_i_size += bkey_bytes(&new_src->k); -+ *dst_dir_i_size += bkey_bytes(&new_dst->k); -+ - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); - if (ret) - goto out; -@@ -434,16 +578,16 @@ int bch2_dirent_rename(struct btree_trans *trans, - } - - if (delete_src) { -- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); -- ret = bch2_btree_iter_traverse(&src_iter) ?: -+ bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); -+ ret = bch2_btree_iter_traverse(trans, &src_iter) ?: - bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto out; - } - - if (delete_dst) { -- bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); -- ret = bch2_btree_iter_traverse(&dst_iter) ?: -+ bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); -+ ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: - bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto out; -@@ -465,9 +609,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, - const struct qstr *name, subvol_inum *inum, - unsigned flags) - { -+ struct qstr lookup_name; -+ int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); -+ if (ret) -+ return ret; -+ - struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, -- hash_info, dir, name, flags); -- int ret = bkey_err(k); -+ hash_info, dir, &lookup_name, flags); -+ ret = bkey_err(k); - if (ret) - goto err; - -@@ -485,7 +634,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, - const struct qstr *name, subvol_inum *inum) - { - struct btree_trans *trans = bch2_trans_get(c); -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - - int ret = lockrestart_do(trans, - bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); -@@ -540,7 +689,7 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv - vfs_d_type(d.v->d_type)); - if (ret) - ctx->pos = d.k->p.offset + 1; -- return ret; -+ return !ret; - } - - int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) -@@ -565,10 +714,61 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) - if (ret2 > 0) - continue; - -- ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); -+ ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); - }))); - - bch2_bkey_buf_exit(&sk, c); - - return ret < 0 ? ret : 0; - } -+ -+/* fsck */ -+ -+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), -+ BTREE_ITER_all_snapshots, k, ret) { -+ if (k.k->p.offset != inode_nr) -+ break; -+ if (!bkey_is_inode(k.k)) -+ continue; -+ ret = bch2_inode_unpack(k, inode); -+ goto found; -+ } -+ ret = -BCH_ERR_ENOENT_inode; -+found: -+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bch_inode_unpacked dir_inode; -+ struct bch_hash_info dir_hash_info; -+ int ret; -+ -+ ret = lookup_first_inode(trans, pos.inode, &dir_inode); -+ if (ret) -+ goto err; -+ -+ dir_hash_info = bch2_hash_info_init(c, &dir_inode); -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); -+ -+ ret = bch2_btree_iter_traverse(trans, &iter) ?: -+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ &dir_hash_info, &iter, -+ BTREE_UPDATE_internal_snapshot_node); -+ bch2_trans_iter_exit(trans, &iter); -+err: -+ bch_err_fn(c, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -index 362b3b2f2f2e..d3e7ae669575 100644 ---- a/fs/bcachefs/dirent.h -+++ b/fs/bcachefs/dirent.h -@@ -23,12 +23,30 @@ struct bch_fs; - struct bch_hash_info; - struct bch_inode_info; - -+int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, -+ const struct qstr *, struct qstr *); -+ -+static inline int bch2_maybe_casefold(struct btree_trans *trans, -+ const struct bch_hash_info *info, -+ const struct qstr *str, struct qstr *out_cf) -+{ -+ if (likely(!info->cf_encoding)) { -+ *out_cf = *str; -+ return 0; -+ } else { -+ return bch2_casefold(trans, info, str, out_cf); -+ } -+} -+ - struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); - --static inline unsigned dirent_val_u64s(unsigned len) -+static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) - { -- return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, -- sizeof(u64)); -+ unsigned bytes = cf_len -+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len -+ : offsetof(struct bch_dirent, d_name) + len; -+ -+ return DIV_ROUND_UP(bytes, sizeof(u64)); - } - - int bch2_dirent_read_target(struct btree_trans *, subvol_inum, -@@ -62,8 +80,8 @@ enum bch_rename_mode { - }; - - int bch2_dirent_rename(struct btree_trans *, -- subvol_inum, struct bch_hash_info *, -- subvol_inum, struct bch_hash_info *, -+ subvol_inum, struct bch_hash_info *, u64 *, -+ subvol_inum, struct bch_hash_info *, u64 *, - const struct qstr *, subvol_inum *, u64 *, - const struct qstr *, subvol_inum *, u64 *, - enum bch_rename_mode); -@@ -79,4 +97,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); - int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); - int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); - -+int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); -+ - #endif /* _BCACHEFS_DIRENT_H */ -diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h -index 5e116b88e814..a46dbddd21aa 100644 ---- a/fs/bcachefs/dirent_format.h -+++ b/fs/bcachefs/dirent_format.h -@@ -29,9 +29,25 @@ struct bch_dirent { - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ -- __u8 d_type; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 d_type:5, -+ d_unused:2, -+ d_casefold:1; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 d_casefold:1, -+ d_unused:2, -+ d_type:5; -+#endif - -- __u8 d_name[]; -+ union { -+ struct { -+ __u8 d_pad; -+ __le16 d_name_len; -+ __le16 d_cf_name_len; -+ __u8 d_names[]; -+ } d_cf_name_block __packed; -+ __DECLARE_FLEX_ARRAY(__u8, d_name); -+ } __packed; - } __packed __aligned(8); - - #define DT_SUBVOL 16 -diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c -index b32e91ba8be8..b3840ff7c407 100644 ---- a/fs/bcachefs/disk_accounting.c -+++ b/fs/bcachefs/disk_accounting.c -@@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = { - NULL - }; - --static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, -- s64 *d, unsigned nr) -+static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, -+ s64 *d, unsigned nr) - { - struct bkey_i_accounting *acc = bkey_accounting_init(k); - -- acc->k.p = disk_accounting_pos_to_bpos(pos); -+ acc->k.p = pos; - set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); - - memcpy_u64s_small(acc->v.d, d, nr); - } - -+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, -+ s64 *d, unsigned nr) -+{ -+ return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); -+} -+ - static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); - - int bch2_disk_accounting_mod(struct btree_trans *trans, - struct disk_accounting_pos *k, - s64 *d, unsigned nr, bool gc) - { -+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); -+ - /* Normalize: */ - switch (k->type) { - case BCH_DISK_ACCOUNTING_replicas: -@@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, - break; - } - -- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); -+ struct bpos pos = disk_accounting_pos_to_bpos(k); -+ -+ if (likely(!gc)) { -+ struct bkey_i_accounting *a; -+#if 0 -+ for (a = btree_trans_subbuf_base(trans, &trans->accounting); -+ a != btree_trans_subbuf_top(trans, &trans->accounting); -+ a = (void *) bkey_next(&a->k_i)) -+ if (bpos_eq(a->k.p, pos)) { -+ BUG_ON(nr != bch2_accounting_counters(&a->k)); -+ acc_u64s(a->v.d, d, nr); -+ -+ if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { -+ unsigned offset = (u64 *) a - -+ (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); -+ -+ trans->accounting.u64s -= a->k.u64s; -+ memmove_u64s_down(a, -+ bkey_next(&a->k_i), -+ trans->accounting.u64s - offset); -+ } -+ return 0; -+ } -+#endif -+ unsigned u64s = sizeof(*a) / sizeof(u64) + nr; -+ a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); -+ int ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ return ret; - -- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; -+ __accounting_key_init(&a->k_i, pos, d, nr); -+ return 0; -+ } else { -+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - -- accounting_key_init(&k_i.k, k, d, nr); -+ __accounting_key_init(&k_i.k, pos, d, nr); - -- if (unlikely(gc)) { - int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); - if (ret == -BCH_ERR_btree_insert_need_mark_replicas) - ret = drop_locks_do(trans, - bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: - bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); - return ret; -- } else { -- return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); - } - } - -@@ -114,10 +150,9 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, - unsigned dev, s64 sectors, - bool gc) - { -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- }; -- -+ struct disk_accounting_pos acc; -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_replicas_entry_cached(&acc.replicas, dev); - - return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); -@@ -135,6 +170,12 @@ static inline bool is_zero(char *start, char *end) - - #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) - -+static const unsigned bch2_accounting_type_nr_counters[] = { -+#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, -+ BCH_DISK_ACCOUNTING_TYPES() -+#undef x -+}; -+ - int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) - { -@@ -193,6 +234,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, accounting_key_junk_at_end, - "junk at end of accounting key"); -+ -+ bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], -+ c, accounting_key_nr_counters_wrong, -+ "accounting key with %u counters, should be %u", -+ bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); - fsck_err: - return ret; - } -@@ -277,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc - - static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) - { -- struct bch_replicas_padded r; -+ union bch_replicas_padded r; - return accounting_to_replicas(&r.e, p) - ? bch2_mark_replicas(c, &r.e) - : 0; -@@ -289,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) - */ - int bch2_accounting_update_sb(struct btree_trans *trans) - { -- for (struct jset_entry *i = trans->journal_entries; -- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); -- i = vstruct_next(i)) -- if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { -- int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); -- if (ret) -- return ret; -- } -+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); -+ i != btree_trans_subbuf_top(trans, &trans->accounting); -+ i = bkey_next(i)) { -+ int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); -+ if (ret) -+ return ret; -+ } - - return 0; - } -@@ -351,7 +396,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun - int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) - { -- struct bch_replicas_padded r; -+ union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && -@@ -366,6 +411,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, - return ret; - } - -+int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, -+ enum bch_accounting_mode mode) -+{ -+ union bch_replicas_padded r; -+ -+ if (mode != BCH_ACCOUNTING_read && -+ accounting_to_replicas(&r.e, a.k->p) && -+ !bch2_replicas_marked_locked(c, &r.e)) -+ return -BCH_ERR_btree_insert_need_mark_replicas; -+ -+ return __bch2_accounting_mem_insert(c, a); -+} -+ - static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) - { - for (unsigned i = 0; i < e->nr_counters; i++) -@@ -415,10 +473,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) - - percpu_down_read(&c->mark_lock); - darray_for_each(acc->k, i) { -- struct { -+ union { -+ u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, -+ BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_usage r; -- u8 pad[BCH_BKEY_PTRS_MAX]; - } u; -+ u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; - - if (!accounting_to_replicas(&u.r.r, i->pos)) - continue; -@@ -547,11 +607,11 @@ int bch2_gc_accounting_done(struct bch_fs *c) - prt_str(&buf, "accounting mismatch for "); - bch2_accounting_key_to_text(&buf, &acc_k); - -- prt_str(&buf, ": got"); -+ prt_str(&buf, ":\n got"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", dst_v[j]); - -- prt_str(&buf, " should be"); -+ prt_str(&buf, "\nshould be"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", src_v[j]); - -@@ -573,7 +633,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) - accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, - bkey_i_to_s_c_accounting(&k_i.k), -- BCH_ACCOUNTING_normal); -+ BCH_ACCOUNTING_normal, true); - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); -@@ -602,23 +662,23 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) - - percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), -- BCH_ACCOUNTING_read); -+ BCH_ACCOUNTING_read, false); - percpu_up_read(&c->mark_lock); - return ret; - } - - static int bch2_disk_accounting_validate_late(struct btree_trans *trans, -- struct disk_accounting_pos acc, -+ struct disk_accounting_pos *acc, - u64 *v, unsigned nr) - { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0, invalid_dev = -1; - -- switch (acc.type) { -+ switch (acc->type) { - case BCH_DISK_ACCOUNTING_replicas: { -- struct bch_replicas_padded r; -- __accounting_to_replicas(&r.e, &acc); -+ union bch_replicas_padded r; -+ __accounting_to_replicas(&r.e, acc); - - for (unsigned i = 0; i < r.e.nr_devs; i++) - if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && -@@ -635,9 +695,9 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, -- "accounting not marked in superblock replicas\n %s", -+ "accounting not marked in superblock replicas\n%s", - (printbuf_reset(&buf), -- bch2_accounting_key_to_text(&buf, &acc), -+ bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping -@@ -653,8 +713,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - } - - case BCH_DISK_ACCOUNTING_dev_data_type: -- if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { -- invalid_dev = acc.dev_data_type.dev; -+ if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { -+ invalid_dev = acc->dev_data_type.dev; - goto invalid_device; - } - break; -@@ -665,16 +725,16 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - return ret; - invalid_device: - if (fsck_err(trans, accounting_to_invalid_device, -- "accounting entry points to invalid device %i\n %s", -+ "accounting entry points to invalid device %i\n%s", - invalid_dev, - (printbuf_reset(&buf), -- bch2_accounting_key_to_text(&buf, &acc), -+ bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - for (unsigned i = 0; i < nr; i++) - v[i] = -v[i]; - - ret = commit_do(trans, NULL, NULL, 0, -- bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: -+ bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: - -BCH_ERR_remove_disk_accounting_entry; - } else { - ret = -BCH_ERR_remove_disk_accounting_entry; -@@ -725,9 +785,11 @@ int bch2_accounting_read(struct bch_fs *c) - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - -- if (!bch2_accounting_is_mem(acc_k)) { -- struct disk_accounting_pos next = { .type = acc_k.type + 1 }; -- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); -+ if (!bch2_accounting_is_mem(&acc_k)) { -+ struct disk_accounting_pos next; -+ memset(&next, 0, sizeof(next)); -+ next.type = acc_k.type + 1; -+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - -@@ -745,7 +807,7 @@ int bch2_accounting_read(struct bch_fs *c) - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - -- if (!bch2_accounting_is_mem(acc_k)) -+ if (!bch2_accounting_is_mem(&acc_k)) - continue; - - struct bkey_s_c k = bkey_i_to_s_c(i->k); -@@ -801,7 +863,7 @@ int bch2_accounting_read(struct bch_fs *c) - */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry -- : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); -+ : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); - - if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(i->v[0]); -@@ -882,15 +944,13 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) - int bch2_dev_usage_init(struct bch_dev *ca, bool gc) - { - struct bch_fs *c = ca->fs; -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_dev_data_type, -- .dev_data_type.dev = ca->dev_idx, -- .dev_data_type.data_type = BCH_DATA_free, -- }; - u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - - int ret = bch2_trans_do(c, ({ -- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: -+ bch2_disk_accounting_mod2(trans, gc, -+ v, dev_data_type, -+ .dev = ca->dev_idx, -+ .data_type = BCH_DATA_free) ?: - (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0); - })); - bch_err_fn(c, ret); -@@ -916,9 +976,11 @@ void bch2_verify_accounting_clean(struct bch_fs *c) - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - -- if (!bch2_accounting_is_mem(acc_k)) { -- struct disk_accounting_pos next = { .type = acc_k.type + 1 }; -- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); -+ if (!bch2_accounting_is_mem(&acc_k)) { -+ struct disk_accounting_pos next; -+ memset(&next, 0, sizeof(next)); -+ next.type = acc_k.type + 1; -+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - -diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h -index f4372cafea2e..f6098e33ab30 100644 ---- a/fs/bcachefs/disk_accounting.h -+++ b/fs/bcachefs/disk_accounting.h -@@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) - static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, - struct bkey_s_c_accounting src) - { -- EBUG_ON(dst->k.u64s != src.k->u64s); -- -- for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) -+ for (unsigned i = 0; -+ i < min(bch2_accounting_counters(&dst->k), -+ bch2_accounting_counters(src.k)); -+ i++) - dst->v.d[i] += src.v->d[i]; -+ - if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) - dst->k.bversion = src.k->bversion; - } -@@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos - - int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, - s64 *, unsigned, bool); -+ -+#define disk_accounting_key_init(_k, _type, ...) \ -+do { \ -+ memset(&(_k), 0, sizeof(_k)); \ -+ (_k).type = BCH_DISK_ACCOUNTING_##_type; \ -+ (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ -+} while (0) -+ -+#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ -+({ \ -+ struct disk_accounting_pos pos; \ -+ disk_accounting_key_init(pos, __VA_ARGS__); \ -+ bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ -+}) -+ -+#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ -+ bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) -+ - int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); - - int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, -@@ -116,12 +136,13 @@ enum bch_accounting_mode { - }; - - int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -+int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); - void bch2_accounting_mem_gc(struct bch_fs *); - --static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) -+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) - { -- return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && -- acc.type != BCH_DISK_ACCOUNTING_inum; -+ return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && -+ acc->type != BCH_DISK_ACCOUNTING_inum; - } - - /* -@@ -130,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) - */ - static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - struct bkey_s_c_accounting a, -- enum bch_accounting_mode mode) -+ enum bch_accounting_mode mode, -+ bool write_locked) - { - struct bch_fs *c = trans->c; - struct bch_accounting_mem *acc = &c->accounting; -@@ -141,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - if (gc && !acc->gc_running) - return 0; - -- if (!bch2_accounting_is_mem(acc_k)) -+ if (!bch2_accounting_is_mem(&acc_k)) - return 0; - - if (mode == BCH_ACCOUNTING_normal) { -@@ -169,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { -- int ret = bch2_accounting_mem_insert(c, a, mode); -+ int ret = 0; -+ if (unlikely(write_locked)) -+ ret = bch2_accounting_mem_insert_locked(c, a, mode); -+ else -+ ret = bch2_accounting_mem_insert(c, a, mode); - if (ret) - return ret; - } -@@ -186,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) - { - percpu_down_read(&trans->c->mark_lock); -- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); -+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); - percpu_up_read(&trans->c->mark_lock); - return ret; - } -@@ -233,13 +259,13 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, - struct bkey_i_accounting *a, - unsigned commit_flags) - { -- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, -- (u64 *) a - (u64 *) trans->journal_entries); -+ u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); -+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); - - EBUG_ON(bversion_zero(a->k.bversion)); - - return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) -- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) -+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false) - : 0; - } - -@@ -251,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans - struct bkey_s_accounting a = accounting_i_to_s(a_i); - - bch2_accounting_neg(a); -- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); -+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false); - bch2_accounting_neg(a); - } - } -diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h -index 7b6e6c97e6aa..8269af1dbe2a 100644 ---- a/fs/bcachefs/disk_accounting_format.h -+++ b/fs/bcachefs/disk_accounting_format.h -@@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type) - } - } - -+/* -+ * field 1: name -+ * field 2: id -+ * field 3: number of counters (max 3) -+ */ -+ - #define BCH_DISK_ACCOUNTING_TYPES() \ -- x(nr_inodes, 0) \ -- x(persistent_reserved, 1) \ -- x(replicas, 2) \ -- x(dev_data_type, 3) \ -- x(compression, 4) \ -- x(snapshot, 5) \ -- x(btree, 6) \ -- x(rebalance_work, 7) \ -- x(inum, 8) -+ x(nr_inodes, 0, 1) \ -+ x(persistent_reserved, 1, 1) \ -+ x(replicas, 2, 1) \ -+ x(dev_data_type, 3, 3) \ -+ x(compression, 4, 3) \ -+ x(snapshot, 5, 1) \ -+ x(btree, 6, 1) \ -+ x(rebalance_work, 7, 1) \ -+ x(inum, 8, 3) - - enum disk_accounting_type { --#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, -+#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, - BCH_DISK_ACCOUNTING_TYPES() - #undef x - BCH_DISK_ACCOUNTING_TYPE_NR, - }; - --struct bch_nr_inodes { -+/* -+ * No subtypes - number of inodes in the entire filesystem -+ * -+ * XXX: perhaps we could add a per-subvolume counter? -+ */ -+struct bch_acct_nr_inodes { - }; - --struct bch_persistent_reserved { -+/* -+ * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the -+ * reservation: -+ */ -+struct bch_acct_persistent_reserved { - __u8 nr_replicas; - }; - --struct bch_dev_data_type { -+/* -+ * device, data type counter fields: -+ * [ -+ * nr_buckets -+ * live sectors (in buckets of that data type) -+ * sectors of internal fragmentation -+ * ] -+ * -+ * XXX: live sectors should've been done differently, you can have multiple data -+ * types in the same bucket (user, stripe, cached) and this collapses them to -+ * the bucket data type, and makes the internal fragmentation counter redundant -+ */ -+struct bch_acct_dev_data_type { - __u8 dev; - __u8 data_type; - }; - -+/* -+ * Compression type fields: -+ * [ -+ * number of extents -+ * uncompressed size -+ * compressed size -+ * ] -+ * -+ * Compression ratio, average extent size (fragmentation). -+ */ - struct bch_acct_compression { - __u8 type; - }; - -+/* -+ * On disk usage by snapshot id; counts same values as replicas counter, but -+ * aggregated differently -+ */ - struct bch_acct_snapshot { - __u32 id; - } __packed; -@@ -137,10 +178,27 @@ struct bch_acct_btree { - __u32 id; - } __packed; - -+/* -+ * inum counter fields: -+ * [ -+ * number of extents -+ * sum of extent sizes - bkey size -+ * this field is similar to inode.bi_sectors, except here extents in -+ * different snapshots but the same inode number are all collapsed to the -+ * same counter -+ * sum of on disk size - same values tracked by replicas counters -+ * ] -+ * -+ * This tracks on disk fragmentation. -+ */ - struct bch_acct_inum { - __u64 inum; - } __packed; - -+/* -+ * Simple counter of the amount of data (on disk sectors) rebalance needs to -+ * move, extents counted here are also in the rebalance_work btree. -+ */ - struct bch_acct_rebalance_work { - }; - -@@ -149,10 +207,10 @@ struct disk_accounting_pos { - struct { - __u8 type; - union { -- struct bch_nr_inodes nr_inodes; -- struct bch_persistent_reserved persistent_reserved; -+ struct bch_acct_nr_inodes nr_inodes; -+ struct bch_acct_persistent_reserved persistent_reserved; - struct bch_replicas_entry_v1 replicas; -- struct bch_dev_data_type dev_data_type; -+ struct bch_acct_dev_data_type dev_data_type; - struct bch_acct_compression compression; - struct bch_acct_snapshot snapshot; - struct bch_acct_btree btree; -diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h -index b1982131b206..242b3270cd5c 100644 ---- a/fs/bcachefs/disk_accounting_types.h -+++ b/fs/bcachefs/disk_accounting_types.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H - #define _BCACHEFS_DISK_ACCOUNTING_TYPES_H - --#include "darray.h" -+#include - - struct accounting_mem_entry { - struct bpos pos; -diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c -index 5df8de0b8c02..c20ecf5e5381 100644 ---- a/fs/bcachefs/disk_groups.c -+++ b/fs/bcachefs/disk_groups.c -@@ -86,35 +86,6 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field * - return ret; - } - --void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) --{ -- out->atomic++; -- rcu_read_lock(); -- -- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -- if (!g) -- goto out; -- -- for (unsigned i = 0; i < g->nr; i++) { -- if (i) -- prt_printf(out, " "); -- -- if (g->entries[i].deleted) { -- prt_printf(out, "[deleted]"); -- continue; -- } -- -- prt_printf(out, "[parent %d devs", g->entries[i].parent); -- for_each_member_device_rcu(c, ca, &g->entries[i].devs) -- prt_printf(out, " %s", ca->name); -- prt_printf(out, "]"); -- } -- --out: -- rcu_read_unlock(); -- out->atomic--; --} -- - static void bch2_sb_disk_groups_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -@@ -241,20 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) - case TARGET_DEV: - return dev == t.dev; - case TARGET_GROUP: { -- struct bch_disk_groups_cpu *g; -- const struct bch_devs_mask *m; -- bool ret; -- -- rcu_read_lock(); -- g = rcu_dereference(c->disk_groups); -- m = g && t.group < g->nr && !g->entries[t.group].deleted -+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -+ const struct bch_devs_mask *m = -+ g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - -- ret = m ? test_bit(dev, m->d) : false; -- rcu_read_unlock(); -- -- return ret; -+ return m ? test_bit(dev, m->d) : false; - } - default: - BUG(); -@@ -377,54 +341,81 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) - return v; - } - --void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -+static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, -+ unsigned v) - { -- struct bch_disk_groups_cpu *groups; -- struct bch_disk_group_cpu *g; -- unsigned nr = 0; - u16 path[32]; -- -- out->atomic++; -- rcu_read_lock(); -- groups = rcu_dereference(c->disk_groups); -- if (!groups) -- goto invalid; -+ unsigned nr = 0; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto invalid; - -- if (v >= groups->nr) -+ if (v >= (g ? g->nr : 0)) - goto invalid; - -- g = groups->entries + v; -+ struct bch_disk_group_cpu *e = g->entries + v; - -- if (g->deleted) -+ if (e->deleted) - goto invalid; - - path[nr++] = v; - -- if (!g->parent) -+ if (!e->parent) - break; - -- v = g->parent - 1; -+ v = e->parent - 1; - } - - while (nr) { -- v = path[--nr]; -- g = groups->entries + v; -+ struct bch_disk_group_cpu *e = g->entries + path[--nr]; - -- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); -+ prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); - if (nr) - prt_printf(out, "."); - } --out: -- rcu_read_unlock(); -- out->atomic--; - return; - invalid: - prt_printf(out, "invalid label %u", v); -- goto out; -+} -+ -+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ bch2_printbuf_make_room(out, 4096); -+ -+ out->atomic++; -+ rcu_read_lock(); -+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -+ -+ for (unsigned i = 0; i < (g ? g->nr : 0); i++) { -+ prt_printf(out, "%2u: ", i); -+ -+ if (g->entries[i].deleted) { -+ prt_printf(out, "[deleted]"); -+ goto next; -+ } -+ -+ __bch2_disk_path_to_text(out, g, i); -+ -+ prt_printf(out, " devs"); -+ -+ for_each_member_device_rcu(c, ca, &g->entries[i].devs) -+ prt_printf(out, " %s", ca->name); -+next: -+ prt_newline(out); -+ } -+ -+ rcu_read_unlock(); -+ out->atomic--; -+} -+ -+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -+{ -+ out->atomic++; -+ rcu_read_lock(); -+ __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), -+ rcu_read_unlock(); -+ --out->atomic; - } - - void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -@@ -470,23 +461,22 @@ void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned - - int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) - { -- struct bch_member *mi; -- int ret, v = -1; -+ lockdep_assert_held(&c->sb_lock); - -- if (!strlen(name) || !strcmp(name, "none")) -- return 0; - -- v = bch2_disk_path_find_or_create(&c->disk_sb, name); -- if (v < 0) -- return v; -+ if (!strlen(name) || !strcmp(name, "none")) { -+ struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); -+ SET_BCH_MEMBER_GROUP(mi, 0); -+ } else { -+ int v = bch2_disk_path_find_or_create(&c->disk_sb, name); -+ if (v < 0) -+ return v; - -- ret = bch2_sb_disk_groups_to_cpu(c); -- if (ret) -- return ret; -+ struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); -+ SET_BCH_MEMBER_GROUP(mi, v + 1); -+ } - -- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); -- SET_BCH_MEMBER_GROUP(mi, v + 1); -- return 0; -+ return bch2_sb_disk_groups_to_cpu(c); - } - - int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -@@ -555,14 +545,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) - ? rcu_dereference(c->devs[t.dev]) - : NULL; - -- if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ if (ca && ca->disk_sb.bdev) - prt_printf(out, "/dev/%s", ca->name); -- percpu_ref_put(&ca->io_ref); -- } else if (ca) { -+ else if (ca) - prt_printf(out, "offline device %u", t.dev); -- } else { -+ else - prt_printf(out, "invalid device %u", t.dev); -- } - - rcu_read_unlock(); - out->atomic--; -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -index d2a5e76e6479..c581426e3894 100644 ---- a/fs/bcachefs/ec.c -+++ b/fs/bcachefs/ec.c -@@ -16,10 +16,12 @@ - #include "disk_accounting.h" - #include "disk_groups.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "io_read.h" - #include "io_write.h" - #include "keylist.h" -+#include "lru.h" - #include "recovery.h" - #include "replicas.h" - #include "super-io.h" -@@ -104,6 +106,8 @@ struct ec_bio { - struct bch_dev *ca; - struct ec_stripe_buf *buf; - size_t idx; -+ int rw; -+ u64 submit_time; - struct bio bio; - }; - -@@ -298,15 +302,27 @@ static int mark_stripe_bucket(struct btree_trans *trans, - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (flags & BTREE_TRIGGER_transactional) { -+ struct extent_ptr_decoded p = { -+ .ptr = *ptr, -+ .crc = bch2_extent_crc_unpack(s.k, NULL), -+ }; -+ struct bkey_i_backpointer bp; -+ bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, -+ (const union bch_extent_entry *) ptr, &bp); -+ - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update(trans, bucket, 0); -- ret = PTR_ERR_OR_ZERO(a) ?: -- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); -+ ret = PTR_ERR_OR_ZERO(a) ?: -+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: -+ bch2_bucket_backpointer_mod(trans, s.s_c, &bp, -+ !(flags & BTREE_TRIGGER_overwrite)); -+ if (ret) -+ goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); -- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", -+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", - ptr->dev, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; -@@ -366,19 +382,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, - return 0; - } - --static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) --{ -- m->sectors = le16_to_cpu(s->sectors); -- m->algorithm = s->algorithm; -- m->nr_blocks = s->nr_blocks; -- m->nr_redundant = s->nr_redundant; -- m->disk_label = s->disk_label; -- m->blocks_nonempty = 0; -- -- for (unsigned i = 0; i < s->nr_blocks; i++) -- m->blocks_nonempty += !!stripe_blockcount_get(s, i); --} -- - int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s _new, -@@ -399,6 +402,15 @@ int bch2_trigger_stripe(struct btree_trans *trans, - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); - -+ if (flags & BTREE_TRIGGER_transactional) { -+ int ret = bch2_lru_change(trans, -+ BCH_LRU_STRIPE_FRAGMENTATION, -+ idx, -+ stripe_lru_pos(old_s), -+ stripe_lru_pos(new_s)); -+ if (ret) -+ return ret; -+ } - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - /* -@@ -443,24 +455,25 @@ int bch2_trigger_stripe(struct btree_trans *trans, - if (new_s) { - s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- }; -+ struct disk_accounting_pos acc; -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, new); - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); - if (ret) - return ret; - - if (gc) -- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); -+ unsafe_memcpy(&gc->r.e, &acc.replicas, -+ replicas_entry_bytes(&acc.replicas), "VLA"); - } - - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- }; -+ struct disk_accounting_pos acc; -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, old); - int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); - if (ret) -@@ -472,38 +485,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, - return ret; - } - -- if (flags & BTREE_TRIGGER_atomic) { -- struct stripe *m = genradix_ptr(&c->stripes, idx); -- -- if (!m) { -- struct printbuf buf1 = PRINTBUF; -- struct printbuf buf2 = PRINTBUF; -- -- bch2_bkey_val_to_text(&buf1, c, old); -- bch2_bkey_val_to_text(&buf2, c, new); -- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" -- "old %s\n" -- "new %s", idx, buf1.buf, buf2.buf); -- printbuf_exit(&buf2); -- printbuf_exit(&buf1); -- bch2_inconsistent_error(c); -- return -1; -- } -- -- if (!new_s) { -- bch2_stripes_heap_del(c, m, idx); -- -- memset(m, 0, sizeof(*m)); -- } else { -- stripe_to_mem(m, new_s); -- -- if (!old_s) -- bch2_stripes_heap_insert(c, m, idx); -- else -- bch2_stripes_heap_update(c, m, idx); -- } -- } -- - return 0; - } - -@@ -527,20 +508,14 @@ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, - - static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) - { -- switch (k.k->type) { -- case KEY_TYPE_extent: { -- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -- const union bch_extent_entry *entry; -- -- extent_for_each_entry(e, entry) -- if (extent_entry_type(entry) == -- BCH_EXTENT_ENTRY_stripe_ptr && -- entry->stripe_ptr.idx == idx) -- return true; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; - -- break; -- } -- } -+ bkey_extent_entry_for_each(ptrs, entry) -+ if (extent_entry_type(entry) == -+ BCH_EXTENT_ENTRY_stripe_ptr && -+ entry->stripe_ptr.idx == idx) -+ return true; - - return false; - } -@@ -725,15 +700,20 @@ static void ec_block_endio(struct bio *bio) - struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; - struct bch_dev *ca = ec_bio->ca; - struct closure *cl = bio->bi_private; -+ int rw = ec_bio->rw; -+ unsigned ref = rw == READ -+ ? BCH_DEV_READ_REF_ec_block -+ : BCH_DEV_WRITE_REF_ec_block; - -- if (bch2_dev_io_err_on(bio->bi_status, ca, -- bio_data_dir(bio) -- ? BCH_MEMBER_ERROR_write -- : BCH_MEMBER_ERROR_read, -- "erasure coding %s error: %s", -+ bch2_account_io_completion(ca, bio_data_dir(bio), -+ ec_bio->submit_time, !bio->bi_status); -+ -+ if (bio->bi_status) { -+ bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", - str_write_read(bio_data_dir(bio)), -- bch2_blk_status_to_str(bio->bi_status))) -+ bch2_blk_status_to_str(bio->bi_status)); - clear_bit(ec_bio->idx, ec_bio->buf->valid); -+ } - - int stale = dev_ptr_stale(ca, ptr); - if (stale) { -@@ -745,7 +725,7 @@ static void ec_block_endio(struct bio *bio) - } - - bio_put(&ec_bio->bio); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[rw], ref); - closure_put(cl); - } - -@@ -759,8 +739,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - ? BCH_DATA_user - : BCH_DATA_parity; - int rw = op_is_write(opf); -+ unsigned ref = rw == READ -+ ? BCH_DEV_READ_REF_ec_block -+ : BCH_DEV_WRITE_REF_ec_block; - -- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); - if (!ca) { - clear_bit(idx, buf->valid); - return; -@@ -796,6 +779,8 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - ec_bio->ca = ca; - ec_bio->buf = buf; - ec_bio->idx = idx; -+ ec_bio->rw = rw; -+ ec_bio->submit_time = local_clock(); - - ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); - ec_bio->bio.bi_end_io = ec_block_endio; -@@ -804,14 +789,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); - - closure_get(cl); -- percpu_ref_get(&ca->io_ref); -+ enumerated_ref_get(&ca->io_ref[rw], ref); - - submit_bio(&ec_bio->bio); - - offset += b; - } - -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[rw], ref); - } - - static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, -@@ -917,26 +902,6 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - - static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) - { -- ec_stripes_heap n, *h = &c->ec_stripes_heap; -- -- if (idx >= h->size) { -- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) -- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; -- -- mutex_lock(&c->ec_stripes_heap_lock); -- if (n.size > h->size) { -- memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); -- n.nr = h->nr; -- swap(*h, n); -- } -- mutex_unlock(&c->ec_stripes_heap_lock); -- -- free_heap(&n); -- } -- -- if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) -- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; -- - if (c->gc_pos.phase != GC_PHASE_not_running && - !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; -@@ -1009,188 +974,58 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) - s->idx = 0; - } - --/* Heap of all existing stripes, ordered by blocks_nonempty */ -- --static u64 stripe_idx_to_delete(struct bch_fs *c) --{ -- ec_stripes_heap *h = &c->ec_stripes_heap; -- -- lockdep_assert_held(&c->ec_stripes_heap_lock); -- -- if (h->nr && -- h->data[0].blocks_nonempty == 0 && -- !bch2_stripe_is_open(c, h->data[0].idx)) -- return h->data[0].idx; -- -- return 0; --} -- --static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, -- size_t i) --{ -- struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); -- -- genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; --} -- --static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) --{ -- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; -- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; -- -- return ((_l->blocks_nonempty > _r->blocks_nonempty) < -- (_l->blocks_nonempty < _r->blocks_nonempty)); --} -- --static inline void ec_stripes_heap_swap(void *l, void *r, void *h) --{ -- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; -- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; -- ec_stripes_heap *_h = (ec_stripes_heap *)h; -- size_t i = _l - _h->data; -- size_t j = _r - _h->data; -- -- swap(*_l, *_r); -- -- ec_stripes_heap_set_backpointer(_h, i); -- ec_stripes_heap_set_backpointer(_h, j); --} -- --static const struct min_heap_callbacks callbacks = { -- .less = ec_stripes_heap_cmp, -- .swp = ec_stripes_heap_swap, --}; -- --static void heap_verify_backpointer(struct bch_fs *c, size_t idx) --{ -- ec_stripes_heap *h = &c->ec_stripes_heap; -- struct stripe *m = genradix_ptr(&c->stripes, idx); -- -- BUG_ON(m->heap_idx >= h->nr); -- BUG_ON(h->data[m->heap_idx].idx != idx); --} -- --void bch2_stripes_heap_del(struct bch_fs *c, -- struct stripe *m, size_t idx) --{ -- mutex_lock(&c->ec_stripes_heap_lock); -- heap_verify_backpointer(c, idx); -- -- min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); -- mutex_unlock(&c->ec_stripes_heap_lock); --} -- --void bch2_stripes_heap_insert(struct bch_fs *c, -- struct stripe *m, size_t idx) --{ -- mutex_lock(&c->ec_stripes_heap_lock); -- BUG_ON(min_heap_full(&c->ec_stripes_heap)); -- -- genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; -- min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { -- .idx = idx, -- .blocks_nonempty = m->blocks_nonempty, -- }), -- &callbacks, -- &c->ec_stripes_heap); -- -- heap_verify_backpointer(c, idx); -- mutex_unlock(&c->ec_stripes_heap_lock); --} -- --void bch2_stripes_heap_update(struct bch_fs *c, -- struct stripe *m, size_t idx) --{ -- ec_stripes_heap *h = &c->ec_stripes_heap; -- bool do_deletes; -- size_t i; -- -- mutex_lock(&c->ec_stripes_heap_lock); -- heap_verify_backpointer(c, idx); -- -- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; -- -- i = m->heap_idx; -- min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); -- min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); -- -- heap_verify_backpointer(c, idx); -- -- do_deletes = stripe_idx_to_delete(c) != 0; -- mutex_unlock(&c->ec_stripes_heap_lock); -- -- if (do_deletes) -- bch2_do_stripe_deletes(c); --} -- - /* stripe deletion */ - - static int ec_stripe_delete(struct btree_trans *trans, u64 idx) - { -- struct bch_fs *c = trans->c; - struct btree_iter iter; -- struct bkey_s_c k; -- struct bkey_s_c_stripe s; -- int ret; -- -- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), -- BTREE_ITER_intent); -- ret = bkey_err(k); -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, -+ BTREE_ID_stripes, POS(0, idx), -+ BTREE_ITER_intent); -+ int ret = bkey_err(k); - if (ret) - goto err; - -- if (k.k->type != KEY_TYPE_stripe) { -- bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); -- ret = -EINVAL; -- goto err; -- } -- -- s = bkey_s_c_to_stripe(k); -- for (unsigned i = 0; i < s.v->nr_blocks; i++) -- if (stripe_blockcount_get(s.v, i)) { -- struct printbuf buf = PRINTBUF; -- -- bch2_bkey_val_to_text(&buf, c, k); -- bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); -- printbuf_exit(&buf); -- ret = -EINVAL; -- goto err; -- } -- -- ret = bch2_btree_delete_at(trans, &iter, 0); -+ /* -+ * We expect write buffer races here -+ * Important: check stripe_is_open with stripe key locked: -+ */ -+ if (k.k->type == KEY_TYPE_stripe && -+ !bch2_stripe_is_open(trans->c, idx) && -+ stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) -+ ret = bch2_btree_delete_at(trans, &iter, 0); - err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } - -+/* -+ * XXX -+ * can we kill this and delete stripes from the trigger? -+ */ - static void ec_stripe_delete_work(struct work_struct *work) - { - struct bch_fs *c = - container_of(work, struct bch_fs, ec_stripe_delete_work); - -- while (1) { -- mutex_lock(&c->ec_stripes_heap_lock); -- u64 idx = stripe_idx_to_delete(c); -- mutex_unlock(&c->ec_stripes_heap_lock); -- -- if (!idx) -- break; -- -- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- ec_stripe_delete(trans, idx)); -- bch_err_fn(c, ret); -- if (ret) -- break; -- } -- -- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); -+ bch2_trans_run(c, -+ bch2_btree_write_buffer_tryflush(trans) ?: -+ for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, -+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), -+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), -+ 0, lru_k, -+ NULL, NULL, -+ BCH_TRANS_COMMIT_no_enospc, ({ -+ ec_stripe_delete(trans, lru_k.k->p.offset); -+ }))); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); - } - - void bch2_do_stripe_deletes(struct bch_fs *c) - { -- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && -+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && - !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) -- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); - } - - /* stripe creation: */ -@@ -1294,7 +1129,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); -- return -EIO; -+ return -BCH_ERR_erasure_coding_found_btree_node; - } - - k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); -@@ -1360,7 +1195,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b - - struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); - if (!ca) -- return -EIO; -+ return -BCH_ERR_ENOENT_dev_not_found; - - struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - -@@ -1380,8 +1215,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); -+ if (bp.v->btree_id == BTREE_ID_stripes) -+ continue; -+ - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, -- bkey_s_c_to_backpointer(bp_k), &last_flushed); -+ bp, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); -@@ -1393,21 +1232,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) - { - struct btree_trans *trans = bch2_trans_get(c); - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; -- unsigned i, nr_data = v->nr_blocks - v->nr_redundant; -- int ret = 0; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; - -- ret = bch2_btree_write_buffer_flush_sync(trans); -+ int ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - -- for (i = 0; i < nr_data; i++) { -+ for (unsigned i = 0; i < nr_data; i++) { - ret = ec_stripe_update_bucket(trans, s, i); - if (ret) - break; - } - err: - bch2_trans_put(trans); -- - return ret; - } - -@@ -1416,7 +1253,8 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, - unsigned block, - struct open_bucket *ob) - { -- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, -+ BCH_DEV_WRITE_REF_ec_bucket_zero); - if (!ca) { - s->err = -BCH_ERR_erofs_no_writes; - return; -@@ -1432,7 +1270,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, - ob->sectors_free, - GFP_KERNEL, 0); - -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); - - if (ret) - s->err = ret; -@@ -1473,6 +1311,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) - if (s->err) { - if (!bch2_err_matches(s->err, EROFS)) - bch_err(c, "error creating stripe: error writing data buckets"); -+ ret = s->err; - goto err; - } - -@@ -1481,6 +1320,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) - - if (ec_do_recov(c, &s->existing_stripe)) { - bch_err(c, "error creating stripe: error reading existing stripe"); -+ ret = -BCH_ERR_ec_block_read; - goto err; - } - -@@ -1506,6 +1346,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) - - if (ec_nr_failed(&s->new_stripe)) { - bch_err(c, "error creating stripe: error writing redundancy buckets"); -+ ret = -BCH_ERR_ec_block_write; - goto err; - } - -@@ -1527,6 +1368,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) - if (ret) - goto err; - err: -+ trace_stripe_create(c, s->idx, ret); -+ - bch2_disk_reservation_put(c, &s->res); - - for (i = 0; i < v->nr_blocks; i++) -@@ -1577,15 +1420,15 @@ static void ec_stripe_create_work(struct work_struct *work) - while ((s = get_pending_stripe(c))) - ec_stripe_create(s); - -- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); - } - - void bch2_ec_do_stripe_creates(struct bch_fs *c) - { -- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); -+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); - - if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) -- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); - } - - static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -@@ -1612,11 +1455,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int - ec_stripe_new_set_pending(c, h); - } - --void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) -+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) - { - struct ec_stripe_new *s = ob->ec; - -- s->err = -EIO; -+ s->err = err; - } - - void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -@@ -1875,23 +1718,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, - } - - static int new_stripe_alloc_buckets(struct btree_trans *trans, -+ struct alloc_request *req, - struct ec_stripe_head *h, struct ec_stripe_new *s, -- enum bch_watermark watermark, struct closure *cl) -+ struct closure *cl) - { - struct bch_fs *c = trans->c; -- struct bch_devs_mask devs = h->devs; - struct open_bucket *ob; -- struct open_buckets buckets; - struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i, j, nr_have_parity = 0, nr_have_data = 0; -- bool have_cache = true; - int ret = 0; - -+ req->scratch_data_type = req->data_type; -+ req->scratch_ptrs = req->ptrs; -+ req->scratch_nr_replicas = req->nr_replicas; -+ req->scratch_nr_effective = req->nr_effective; -+ req->scratch_have_cache = req->have_cache; -+ req->scratch_devs_may_alloc = req->devs_may_alloc; -+ -+ req->devs_may_alloc = h->devs; -+ req->have_cache = true; -+ - BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); - BUG_ON(v->nr_redundant != s->nr_parity); - - /* * We bypass the sector allocator which normally does this: */ -- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); -+ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, -+ c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - - for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { - /* -@@ -1901,7 +1753,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, - * block when updating the stripe - */ - if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) -- __clear_bit(v->ptrs[i].dev, devs.d); -+ __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); - - if (i < s->nr_data) - nr_have_data++; -@@ -1912,95 +1764,94 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, - BUG_ON(nr_have_data > s->nr_data); - BUG_ON(nr_have_parity > s->nr_parity); - -- buckets.nr = 0; -+ req->ptrs.nr = 0; - if (nr_have_parity < s->nr_parity) { -- ret = bch2_bucket_alloc_set_trans(trans, &buckets, -- &h->parity_stripe, -- &devs, -- s->nr_parity, -- &nr_have_parity, -- &have_cache, 0, -- BCH_DATA_parity, -- watermark, -- cl); -- -- open_bucket_for_each(c, &buckets, ob, i) { -+ req->nr_replicas = s->nr_parity; -+ req->nr_effective = nr_have_parity; -+ req->data_type = BCH_DATA_parity; -+ -+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); -+ -+ open_bucket_for_each(c, &req->ptrs, ob, i) { - j = find_next_zero_bit(s->blocks_gotten, - s->nr_data + s->nr_parity, - s->nr_data); - BUG_ON(j >= s->nr_data + s->nr_parity); - -- s->blocks[j] = buckets.v[i]; -+ s->blocks[j] = req->ptrs.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, s->blocks_gotten); - } - - if (ret) -- return ret; -+ goto err; - } - -- buckets.nr = 0; -+ req->ptrs.nr = 0; - if (nr_have_data < s->nr_data) { -- ret = bch2_bucket_alloc_set_trans(trans, &buckets, -- &h->block_stripe, -- &devs, -- s->nr_data, -- &nr_have_data, -- &have_cache, 0, -- BCH_DATA_user, -- watermark, -- cl); -- -- open_bucket_for_each(c, &buckets, ob, i) { -+ req->nr_replicas = s->nr_data; -+ req->nr_effective = nr_have_data; -+ req->data_type = BCH_DATA_user; -+ -+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); -+ -+ open_bucket_for_each(c, &req->ptrs, ob, i) { - j = find_next_zero_bit(s->blocks_gotten, - s->nr_data, 0); - BUG_ON(j >= s->nr_data); - -- s->blocks[j] = buckets.v[i]; -+ s->blocks[j] = req->ptrs.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, s->blocks_gotten); - } - - if (ret) -- return ret; -+ goto err; - } -- -- return 0; -+err: -+ req->data_type = req->scratch_data_type; -+ req->ptrs = req->scratch_ptrs; -+ req->nr_replicas = req->scratch_nr_replicas; -+ req->nr_effective = req->scratch_nr_effective; -+ req->have_cache = req->scratch_have_cache; -+ req->devs_may_alloc = req->scratch_devs_may_alloc; -+ return ret; - } - --static s64 get_existing_stripe(struct bch_fs *c, -- struct ec_stripe_head *head) -+static int __get_existing_stripe(struct btree_trans *trans, -+ struct ec_stripe_head *head, -+ struct ec_stripe_buf *stripe, -+ u64 idx) - { -- ec_stripes_heap *h = &c->ec_stripes_heap; -- struct stripe *m; -- size_t heap_idx; -- u64 stripe_idx; -- s64 ret = -1; -- -- if (may_create_new_stripe(c)) -- return -1; -+ struct bch_fs *c = trans->c; - -- mutex_lock(&c->ec_stripes_heap_lock); -- for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { -- /* No blocks worth reusing, stripe will just be deleted: */ -- if (!h->data[heap_idx].blocks_nonempty) -- continue; -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, -+ BTREE_ID_stripes, POS(0, idx), 0); -+ int ret = bkey_err(k); -+ if (ret) -+ goto err; - -- stripe_idx = h->data[heap_idx].idx; -+ /* We expect write buffer races here */ -+ if (k.k->type != KEY_TYPE_stripe) -+ goto out; - -- m = genradix_ptr(&c->stripes, stripe_idx); -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ if (stripe_lru_pos(s.v) <= 1) -+ goto out; - -- if (m->disk_label == head->disk_label && -- m->algorithm == head->algo && -- m->nr_redundant == head->redundancy && -- m->sectors == head->blocksize && -- m->blocks_nonempty < m->nr_blocks - m->nr_redundant && -- bch2_try_open_stripe(c, head->s, stripe_idx)) { -- ret = stripe_idx; -- break; -- } -+ if (s.v->disk_label == head->disk_label && -+ s.v->algorithm == head->algo && -+ s.v->nr_redundant == head->redundancy && -+ le16_to_cpu(s.v->sectors) == head->blocksize && -+ bch2_try_open_stripe(c, head->s, idx)) { -+ bkey_reassemble(&stripe->key, k); -+ ret = 1; - } -- mutex_unlock(&c->ec_stripes_heap_lock); -+out: -+ bch2_set_btree_iter_dontneed(trans, &iter); -+err: -+ bch2_trans_iter_exit(trans, &iter); - return ret; - } - -@@ -2052,24 +1903,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri - struct ec_stripe_new *s) - { - struct bch_fs *c = trans->c; -- s64 idx; -- int ret; - - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ -- idx = get_existing_stripe(c, h); -- if (idx < 0) -- return -BCH_ERR_stripe_alloc_blocked; -+ if (may_create_new_stripe(c)) -+ return -1; - -- ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); -- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, -- "reading stripe key: %s", bch2_err_str(ret)); -- if (ret) { -- bch2_stripe_close(c, s); -- return ret; -+ struct btree_iter lru_iter; -+ struct bkey_s_c lru_k; -+ int ret = 0; -+ -+ for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, -+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), -+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), -+ 0, lru_k, ret) { -+ ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); -+ if (ret) -+ break; - } -+ bch2_trans_iter_exit(trans, &lru_iter); -+ if (!ret) -+ ret = -BCH_ERR_stripe_alloc_blocked; -+ if (ret == 1) -+ ret = 0; -+ if (ret) -+ return ret; - - return init_new_stripe_from_existing(c, s); - } -@@ -2102,7 +1962,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st - if (bkey_gt(k.k->p, POS(0, U32_MAX))) { - if (start_pos.offset) { - start_pos = min_pos; -- bch2_btree_iter_set_pos(&iter, start_pos); -+ bch2_btree_iter_set_pos(trans, &iter, start_pos); - continue; - } - -@@ -2136,17 +1996,15 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st - } - - struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, -- unsigned target, -+ struct alloc_request *req, - unsigned algo, -- unsigned redundancy, -- enum bch_watermark watermark, - struct closure *cl) - { - struct bch_fs *c = trans->c; -- struct ec_stripe_head *h; -- bool waiting = false; -+ unsigned redundancy = req->nr_replicas - 1; - unsigned disk_label = 0; -- struct target t = target_decode(target); -+ struct target t = target_decode(req->target); -+ bool waiting = false; - int ret; - - if (t.type == TARGET_GROUP) { -@@ -2157,7 +2015,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - disk_label = t.group + 1; /* 0 == no label */ - } - -- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); -+ struct ec_stripe_head *h = -+ __bch2_ec_stripe_head_get(trans, disk_label, algo, -+ redundancy, req->watermark); - if (IS_ERR_OR_NULL(h)) - return h; - -@@ -2181,8 +2041,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - goto alloc_existing; - - /* First, try to allocate a full stripe: */ -- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: -+ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; -+ swap(req->watermark, saved_watermark); -+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h, s); -+ swap(req->watermark, saved_watermark); -+ - if (!ret) - goto allocate_buf; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -@@ -2200,8 +2064,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) - goto err; - -- if (watermark == BCH_WATERMARK_copygc) { -- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: -+ if (req->watermark == BCH_WATERMARK_copygc) { -+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h, s); - if (ret) - goto err; -@@ -2220,7 +2084,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - * Retry allocating buckets, with the watermark for this - * particular write: - */ -- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); -+ ret = new_stripe_alloc_buckets(trans, req, h, s, cl); - if (ret) - goto err; - -@@ -2242,67 +2106,106 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - - /* device removal */ - --static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) -+int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ unsigned dev_idx, -+ unsigned flags) - { -- struct bch_alloc_v4 a_convert; -- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); -- -- if (!a->stripe) -+ if (k.k->type != KEY_TYPE_stripe) - return 0; - -- if (a->stripe_sectors) { -- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); -- return -BCH_ERR_invalidate_stripe_to_dev; -- } -- -- struct btree_iter iter; - struct bkey_i_stripe *s = -- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), -- BTREE_ITER_slots, stripe); -+ bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); - int ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_replicas, -- }; -+ struct disk_accounting_pos acc; - - s64 sectors = 0; - for (unsigned i = 0; i < s->v.nr_blocks; i++) - sectors -= stripe_blockcount_get(&s->v, i); - -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); - if (ret) -- goto err; -+ return ret; - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); -- bkey_for_each_ptr(ptrs, ptr) -- if (ptr->dev == k_a.k->p.inode) -+ -+ /* XXX: how much redundancy do we still have? check degraded flags */ -+ -+ unsigned nr_good = 0; -+ -+ rcu_read_lock(); -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (ptr->dev == dev_idx) - ptr->dev = BCH_SB_MEMBER_INVALID; - -+ struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev); -+ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; -+ } -+ rcu_read_unlock(); -+ -+ if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) -+ return -BCH_ERR_remove_would_lose_data; -+ -+ unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; -+ -+ if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) -+ return -BCH_ERR_remove_would_lose_data; -+ - sectors = -sectors; - -+ memset(&acc, 0, sizeof(acc)); -+ acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = BCH_DATA_user; -- ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); -+ return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); -+} -+ -+static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, -+ unsigned flags) -+{ -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); -+ -+ if (!a->stripe) -+ return 0; -+ -+ if (a->stripe_sectors) { -+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); -+ return -BCH_ERR_invalidate_stripe_to_dev; -+ } -+ -+ struct btree_iter iter; -+ struct bkey_s_c_stripe s = -+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), -+ BTREE_ITER_slots, stripe); -+ int ret = bkey_err(s); - if (ret) -- goto err; --err: -+ return ret; -+ -+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; - } - --int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) -+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) - { -- return bch2_trans_run(c, -+ int ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, - BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ -- bch2_invalidate_stripe_to_dev(trans, k); -+ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); - }))); -+ bch_err_fn(c, ret); -+ return ret; - } - - /* startup/shutdown */ -@@ -2351,10 +2254,10 @@ void bch2_fs_ec_stop(struct bch_fs *c) - - static bool bch2_fs_ec_flush_done(struct bch_fs *c) - { -- bool ret; -+ sched_annotate_sleep(); - - mutex_lock(&c->ec_stripe_new_lock); -- ret = list_empty(&c->ec_stripe_new_list); -+ bool ret = list_empty(&c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); - - return ret; -@@ -2367,46 +2270,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) - - int bch2_stripes_read(struct bch_fs *c) - { -- int ret = bch2_trans_run(c, -- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, -- BTREE_ITER_prefetch, k, ({ -- if (k.k->type != KEY_TYPE_stripe) -- continue; -- -- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); -- if (ret) -- break; -- -- struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); -- -- stripe_to_mem(m, bkey_s_c_to_stripe(k).v); -- -- bch2_stripes_heap_insert(c, m, k.k->p.offset); -- 0; -- }))); -- bch_err_fn(c, ret); -- return ret; --} -- --void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) --{ -- ec_stripes_heap *h = &c->ec_stripes_heap; -- struct stripe *m; -- size_t i; -- -- mutex_lock(&c->ec_stripes_heap_lock); -- for (i = 0; i < min_t(size_t, h->nr, 50); i++) { -- m = genradix_ptr(&c->stripes, h->data[i].idx); -- -- prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, -- h->data[i].blocks_nonempty, -- m->nr_blocks - m->nr_redundant, -- m->nr_redundant); -- if (bch2_stripe_is_open(c, h->data[i].idx)) -- prt_str(out, " open"); -- prt_newline(out); -- } -- mutex_unlock(&c->ec_stripes_heap_lock); -+ return 0; - } - - static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, -@@ -2477,15 +2341,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) - - BUG_ON(!list_empty(&c->ec_stripe_new_list)); - -- free_heap(&c->ec_stripes_heap); -- genradix_free(&c->stripes); - bioset_exit(&c->ec_bioset); - } - - void bch2_fs_ec_init_early(struct bch_fs *c) - { - spin_lock_init(&c->ec_stripes_new_lock); -- mutex_init(&c->ec_stripes_heap_lock); - - INIT_LIST_HEAD(&c->ec_stripe_head_list); - mutex_init(&c->ec_stripe_head_lock); -@@ -2503,3 +2364,40 @@ int bch2_fs_ec_init(struct bch_fs *c) - return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), - BIOSET_NEED_BVECS); - } -+ -+static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, -+ struct bkey_s_c k, -+ struct bkey_buf *last_flushed) -+{ -+ if (k.k->type != KEY_TYPE_stripe) -+ return 0; -+ -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ -+ u64 lru_idx = stripe_lru_pos(s.v); -+ if (lru_idx) { -+ int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, -+ k.k->p.offset, lru_idx, k, last_flushed); -+ if (ret) -+ return ret; -+ } -+ return 0; -+} -+ -+int bch2_check_stripe_to_lru_refs(struct bch_fs *c) -+{ -+ struct bkey_buf last_flushed; -+ -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ -+ int ret = bch2_trans_run(c, -+ for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, -+ POS_MIN, BTREE_ITER_prefetch, k, -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -+ bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); -+ -+ bch2_bkey_buf_exit(&last_flushed, c); -+ bch_err_fn(c, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h -index 583ca6a226da..548048adf0d5 100644 ---- a/fs/bcachefs/ec.h -+++ b/fs/bcachefs/ec.h -@@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s, - memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); - } - -+#define STRIPE_LRU_POS_EMPTY 1 -+ -+static inline u64 stripe_lru_pos(const struct bch_stripe *s) -+{ -+ if (!s) -+ return 0; -+ -+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; -+ -+ for (unsigned i = 0; i < nr_data; i++) -+ blocks_empty += !stripe_blockcount_get(s, i); -+ -+ /* Will be picked up by the stripe_delete worker */ -+ if (blocks_empty == nr_data) -+ return STRIPE_LRU_POS_EMPTY; -+ -+ if (!blocks_empty) -+ return 0; -+ -+ /* invert: more blocks empty = reuse first */ -+ return LRU_TIME_MAX - blocks_empty; -+} -+ - static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, - const struct bch_extent_ptr *data_ptr, - unsigned sectors) -@@ -132,6 +155,21 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, - m->sectors); - } - -+static inline void gc_stripe_unlock(struct gc_stripe *s) -+{ -+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); -+ -+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); -+ smp_mb__after_atomic(); -+ wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); -+} -+ -+static inline void gc_stripe_lock(struct gc_stripe *s) -+{ -+ wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, -+ TASK_UNINTERRUPTIBLE); -+} -+ - struct bch_read_bio; - - struct ec_stripe_buf { -@@ -212,18 +250,15 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey - - void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); - --void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); -+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); - - int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); - - void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); --struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, -- unsigned, unsigned, unsigned, -- enum bch_watermark, struct closure *); - --void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); --void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); --void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); -+struct alloc_request; -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, -+ struct alloc_request *, unsigned, struct closure *); - - void bch2_do_stripe_deletes(struct bch_fs *); - void bch2_ec_do_stripe_creates(struct bch_fs *); -@@ -253,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, - } - } - --int bch2_dev_remove_stripes(struct bch_fs *, unsigned); -+int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, -+ struct bkey_s_c, unsigned, unsigned); -+int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); - - void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); - void bch2_fs_ec_stop(struct bch_fs *); -@@ -261,11 +298,12 @@ void bch2_fs_ec_flush(struct bch_fs *); - - int bch2_stripes_read(struct bch_fs *); - --void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); - void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); - - void bch2_fs_ec_exit(struct bch_fs *); - void bch2_fs_ec_init_early(struct bch_fs *); - int bch2_fs_ec_init(struct bch_fs *); - -+int bch2_check_stripe_to_lru_refs(struct bch_fs *); -+ - #endif /* _BCACHEFS_EC_H */ -diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h -index 8d1e70e830ac..809446c78951 100644 ---- a/fs/bcachefs/ec_types.h -+++ b/fs/bcachefs/ec_types.h -@@ -4,9 +4,10 @@ - - #include "bcachefs_format.h" - --struct bch_replicas_padded { -+union bch_replicas_padded { -+ u8 bytes[struct_size_t(struct bch_replicas_entry_v1, -+ devs, BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_entry_v1 e; -- u8 pad[BCH_BKEY_PTRS_MAX]; - }; - - struct stripe { -@@ -20,23 +21,15 @@ struct stripe { - }; - - struct gc_stripe { -+ u8 lock; -+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */ - u16 sectors; -- - u8 nr_blocks; - u8 nr_redundant; -- -- unsigned alive:1; /* does a corresponding key exist in stripes btree? */ - u16 block_sectors[BCH_BKEY_PTRS_MAX]; - struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - -- struct bch_replicas_padded r; -+ union bch_replicas_padded r; - }; - --struct ec_stripe_heap_entry { -- size_t idx; -- unsigned blocks_nonempty; --}; -- --typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; -- - #endif /* _BCACHEFS_EC_TYPES_H */ -diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c -new file mode 100644 -index 000000000000..56ab430f209f ---- /dev/null -+++ b/fs/bcachefs/enumerated_ref.c -@@ -0,0 +1,144 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "enumerated_ref.h" -+#include "util.h" -+ -+#include -+ -+#ifdef ENUMERATED_REF_DEBUG -+void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) -+{ -+ BUG_ON(idx >= ref->nr); -+ atomic_long_inc(&ref->refs[idx]); -+} -+ -+bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -+{ -+ BUG_ON(idx >= ref->nr); -+ return atomic_long_inc_not_zero(&ref->refs[idx]); -+} -+ -+bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -+{ -+ BUG_ON(idx >= ref->nr); -+ return !ref->dying && -+ atomic_long_inc_not_zero(&ref->refs[idx]); -+} -+ -+void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) -+{ -+ BUG_ON(idx >= ref->nr); -+ long v = atomic_long_dec_return(&ref->refs[idx]); -+ -+ BUG_ON(v < 0); -+ if (v) -+ return; -+ -+ for (unsigned i = 0; i < ref->nr; i++) -+ if (atomic_long_read(&ref->refs[i])) -+ return; -+ -+ if (ref->stop_fn) -+ ref->stop_fn(ref); -+ complete(&ref->stop_complete); -+} -+#endif -+ -+#ifndef ENUMERATED_REF_DEBUG -+static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) -+{ -+ struct enumerated_ref *ref = -+ container_of(percpu_ref, struct enumerated_ref, ref); -+ -+ if (ref->stop_fn) -+ ref->stop_fn(ref); -+ complete(&ref->stop_complete); -+} -+#endif -+ -+void enumerated_ref_stop_async(struct enumerated_ref *ref) -+{ -+ reinit_completion(&ref->stop_complete); -+ -+#ifndef ENUMERATED_REF_DEBUG -+ percpu_ref_kill(&ref->ref); -+#else -+ ref->dying = true; -+ for (unsigned i = 0; i < ref->nr; i++) -+ enumerated_ref_put(ref, i); -+#endif -+} -+ -+void enumerated_ref_stop(struct enumerated_ref *ref, -+ const char * const names[]) -+{ -+ enumerated_ref_stop_async(ref); -+ while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); -+ prt_str(&buf, "Outstanding refs:\n"); -+ enumerated_ref_to_text(&buf, ref, names); -+ printk(KERN_ERR "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+} -+ -+void enumerated_ref_start(struct enumerated_ref *ref) -+{ -+#ifndef ENUMERATED_REF_DEBUG -+ percpu_ref_reinit(&ref->ref); -+#else -+ ref->dying = false; -+ for (unsigned i = 0; i < ref->nr; i++) { -+ BUG_ON(atomic_long_read(&ref->refs[i])); -+ atomic_long_inc(&ref->refs[i]); -+ } -+#endif -+} -+ -+void enumerated_ref_exit(struct enumerated_ref *ref) -+{ -+#ifndef ENUMERATED_REF_DEBUG -+ percpu_ref_exit(&ref->ref); -+#else -+ kfree(ref->refs); -+ ref->refs = NULL; -+ ref->nr = 0; -+#endif -+} -+ -+int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr, -+ void (*stop_fn)(struct enumerated_ref *)) -+{ -+ init_completion(&ref->stop_complete); -+ ref->stop_fn = stop_fn; -+ -+#ifndef ENUMERATED_REF_DEBUG -+ return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL); -+#else -+ ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL); -+ if (!ref->refs) -+ return -ENOMEM; -+ -+ ref->nr = nr; -+ return 0; -+#endif -+} -+ -+void enumerated_ref_to_text(struct printbuf *out, -+ struct enumerated_ref *ref, -+ const char * const names[]) -+{ -+#ifdef ENUMERATED_REF_DEBUG -+ bch2_printbuf_tabstop_push(out, 32); -+ -+ for (unsigned i = 0; i < ref->nr; i++) -+ prt_printf(out, "%s\t%li\n", names[i], -+ atomic_long_read(&ref->refs[i])); -+#else -+ prt_str(out, "(not in debug mode)\n"); -+#endif -+} -diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h -new file mode 100644 -index 000000000000..ec01cf59ef80 ---- /dev/null -+++ b/fs/bcachefs/enumerated_ref.h -@@ -0,0 +1,66 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ENUMERATED_REF_H -+#define _BCACHEFS_ENUMERATED_REF_H -+ -+#include "enumerated_ref_types.h" -+ -+/* -+ * A refcount where the users are enumerated: in debug mode, we create sepate -+ * refcounts for each user, to make leaks and refcount errors easy to track -+ * down: -+ */ -+ -+#ifdef ENUMERATED_REF_DEBUG -+void enumerated_ref_get(struct enumerated_ref *, unsigned); -+bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned); -+bool enumerated_ref_tryget(struct enumerated_ref *, unsigned); -+void enumerated_ref_put(struct enumerated_ref *, unsigned); -+#else -+ -+static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) -+{ -+ percpu_ref_get(&ref->ref); -+} -+ -+static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -+{ -+ return percpu_ref_tryget(&ref->ref); -+} -+ -+static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -+{ -+ return percpu_ref_tryget_live(&ref->ref); -+} -+ -+static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) -+{ -+ percpu_ref_put(&ref->ref); -+} -+#endif -+ -+static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref) -+{ -+#ifndef ENUMERATED_REF_DEBUG -+ return percpu_ref_is_zero(&ref->ref); -+#else -+ for (unsigned i = 0; i < ref->nr; i++) -+ if (atomic_long_read(&ref->refs[i])) -+ return false; -+ return true; -+#endif -+} -+ -+void enumerated_ref_stop_async(struct enumerated_ref *); -+void enumerated_ref_stop(struct enumerated_ref *, const char * const[]); -+void enumerated_ref_start(struct enumerated_ref *); -+ -+void enumerated_ref_exit(struct enumerated_ref *); -+int enumerated_ref_init(struct enumerated_ref *, unsigned, -+ void (*stop_fn)(struct enumerated_ref *)); -+ -+struct printbuf; -+void enumerated_ref_to_text(struct printbuf *, -+ struct enumerated_ref *, -+ const char * const[]); -+ -+#endif /* _BCACHEFS_ENUMERATED_REF_H */ -diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h -new file mode 100644 -index 000000000000..0e6076f466d3 ---- /dev/null -+++ b/fs/bcachefs/enumerated_ref_types.h -@@ -0,0 +1,19 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H -+#define _BCACHEFS_ENUMERATED_REF_TYPES_H -+ -+#include -+ -+struct enumerated_ref { -+#ifdef ENUMERATED_REF_DEBUG -+ unsigned nr; -+ bool dying; -+ atomic_long_t *refs; -+#else -+ struct percpu_ref ref; -+#endif -+ void (*stop_fn)(struct enumerated_ref *); -+ struct completion stop_complete; -+}; -+ -+#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */ -diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h -index 4590cd0c7c90..62843e772b2c 100644 ---- a/fs/bcachefs/errcode.h -+++ b/fs/bcachefs/errcode.h -@@ -5,6 +5,8 @@ - #define BCH_ERRCODES() \ - x(ERANGE, ERANGE_option_too_small) \ - x(ERANGE, ERANGE_option_too_big) \ -+ x(EINVAL, injected) \ -+ x(BCH_ERR_injected, injected_fs_start) \ - x(EINVAL, mount_option) \ - x(BCH_ERR_mount_option, option_name) \ - x(BCH_ERR_mount_option, option_value) \ -@@ -51,6 +53,7 @@ - x(ENOMEM, ENOMEM_dio_write_bioset_init) \ - x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ - x(ENOMEM, ENOMEM_promote_table_init) \ -+ x(ENOMEM, ENOMEM_async_obj_init) \ - x(ENOMEM, ENOMEM_compression_bounce_read_init) \ - x(ENOMEM, ENOMEM_compression_bounce_write_init) \ - x(ENOMEM, ENOMEM_compression_workspace_init) \ -@@ -116,9 +119,11 @@ - x(ENOENT, ENOENT_snapshot_tree) \ - x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ - x(ENOENT, ENOENT_dev_not_found) \ -+ x(ENOENT, ENOENT_dev_bucket_not_found) \ - x(ENOENT, ENOENT_dev_idx_not_found) \ - x(ENOENT, ENOENT_inode_no_backpointer) \ - x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ -+ x(ENOENT, btree_node_dying) \ - x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ - x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ - x(EEXIST, EEXIST_str_hash_set) \ -@@ -170,6 +175,7 @@ - x(0, backpointer_to_overwritten_btree_node) \ - x(0, journal_reclaim_would_deadlock) \ - x(EINVAL, fsck) \ -+ x(BCH_ERR_fsck, fsck_ask) \ - x(BCH_ERR_fsck, fsck_fix) \ - x(BCH_ERR_fsck, fsck_delete_bkey) \ - x(BCH_ERR_fsck, fsck_ignore) \ -@@ -177,9 +183,14 @@ - x(BCH_ERR_fsck, fsck_repair_unimplemented) \ - x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, restart_recovery) \ -- x(EINVAL, not_in_recovery) \ - x(EINVAL, cannot_rewind_recovery) \ - x(0, data_update_done) \ -+ x(BCH_ERR_data_update_done, data_update_done_would_block) \ -+ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ - x(EINVAL, device_state_not_allowed) \ - x(EINVAL, member_info_missing) \ - x(EINVAL, mismatched_block_size) \ -@@ -191,6 +202,7 @@ - x(EINVAL, device_has_been_removed) \ - x(EINVAL, device_splitbrain) \ - x(EINVAL, device_already_online) \ -+ x(EINVAL, filesystem_uuid_already_open) \ - x(EINVAL, insufficient_devices_to_start) \ - x(EINVAL, invalid) \ - x(EINVAL, internal_fsck_err) \ -@@ -200,6 +212,9 @@ - x(EINVAL, no_resize_with_buckets_nouse) \ - x(EINVAL, inode_unpack_error) \ - x(EINVAL, varint_decode_error) \ -+ x(EINVAL, erasure_coding_found_btree_node) \ -+ x(EINVAL, option_negative) \ -+ x(EOPNOTSUPP, may_not_use_incompat_feature) \ - x(EROFS, erofs_trans_commit) \ - x(EROFS, erofs_no_writes) \ - x(EROFS, erofs_journal_err) \ -@@ -207,13 +222,23 @@ - x(EROFS, erofs_unfixed_errors) \ - x(EROFS, erofs_norecovery) \ - x(EROFS, erofs_nochanges) \ -+ x(EROFS, erofs_no_alloc_info) \ -+ x(EROFS, erofs_filesystem_full) \ - x(EROFS, insufficient_devices) \ - x(0, operation_blocked) \ - x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ -- x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ -- x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ -- x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ -- x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ -+ x(BCH_ERR_operation_blocked, journal_res_blocked) \ -+ x(BCH_ERR_journal_res_blocked, journal_blocked) \ -+ x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ -+ x(BCH_ERR_journal_res_blocked, journal_max_open) \ -+ x(BCH_ERR_journal_res_blocked, journal_full) \ -+ x(BCH_ERR_journal_res_blocked, journal_pin_full) \ -+ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ -+ x(BCH_ERR_journal_res_blocked, journal_stuck) \ -+ x(BCH_ERR_journal_res_blocked, journal_retry_open) \ -+ x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ -+ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ -+ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ - x(BCH_ERR_invalid, invalid_sb) \ - x(BCH_ERR_invalid_sb, invalid_sb_magic) \ - x(BCH_ERR_invalid_sb, invalid_sb_version) \ -@@ -223,6 +248,7 @@ - x(BCH_ERR_invalid_sb, invalid_sb_csum) \ - x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_offset) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ - x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ -@@ -248,8 +274,9 @@ - x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ - x(BCH_ERR_invalid, invalid_bkey) \ - x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ -- x(EIO, journal_shutdown) \ -+ x(EROFS, journal_shutdown) \ - x(EIO, journal_flush_err) \ -+ x(EIO, journal_write_err) \ - x(EIO, btree_node_read_err) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ - x(EIO, sb_not_downgraded) \ -@@ -258,17 +285,53 @@ - x(EIO, btree_node_read_validate_error) \ - x(EIO, btree_need_topology_repair) \ - x(EIO, bucket_ref_update) \ -+ x(EIO, trigger_alloc) \ - x(EIO, trigger_pointer) \ - x(EIO, trigger_stripe_pointer) \ - x(EIO, metadata_bucket_inconsistency) \ - x(EIO, mark_stripe) \ - x(EIO, stripe_reconstruct) \ - x(EIO, key_type_error) \ -- x(EIO, no_device_to_read_from) \ -+ x(EIO, extent_poisoned) \ - x(EIO, missing_indirect_extent) \ - x(EIO, invalidate_stripe_to_dev) \ - x(EIO, no_encryption_key) \ - x(EIO, insufficient_journal_devices) \ -+ x(EIO, device_offline) \ -+ x(EIO, EIO_fault_injected) \ -+ x(EIO, ec_block_read) \ -+ x(EIO, ec_block_write) \ -+ x(EIO, recompute_checksum) \ -+ x(EIO, decompress) \ -+ x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ -+ x(BCH_ERR_decompress, decompress_lz4) \ -+ x(BCH_ERR_decompress, decompress_gzip) \ -+ x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ -+ x(BCH_ERR_decompress, decompress_zstd) \ -+ x(EIO, data_write) \ -+ x(BCH_ERR_data_write, data_write_io) \ -+ x(BCH_ERR_data_write, data_write_csum) \ -+ x(BCH_ERR_data_write, data_write_invalid_ptr) \ -+ x(BCH_ERR_data_write, data_write_misaligned) \ -+ x(BCH_ERR_decompress, data_read) \ -+ x(BCH_ERR_data_read, no_device_to_read_from) \ -+ x(BCH_ERR_data_read, no_devices_valid) \ -+ x(BCH_ERR_data_read, data_read_io_err) \ -+ x(BCH_ERR_data_read, data_read_csum_err) \ -+ x(BCH_ERR_data_read, data_read_retry) \ -+ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ -+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ -+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ -+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ -+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ -+ x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ -+ x(BCH_ERR_data_read, data_read_decompress_err) \ -+ x(BCH_ERR_data_read, data_read_decrypt_err) \ -+ x(BCH_ERR_data_read, data_read_ptr_stale_race) \ -+ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ -+ x(BCH_ERR_data_read, data_read_no_encryption_key) \ -+ x(BCH_ERR_data_read, data_read_buffer_too_small) \ -+ x(BCH_ERR_data_read, data_read_key_overwritten) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -index 038da6a61f6b..c2cad28635bf 100644 ---- a/fs/bcachefs/error.c -+++ b/fs/bcachefs/error.c -@@ -3,15 +3,24 @@ - #include "btree_cache.h" - #include "btree_iter.h" - #include "error.h" --#include "fs-common.h" - #include "journal.h" -+#include "namei.h" - #include "recovery_passes.h" - #include "super.h" - #include "thread_with_file.h" - - #define FSCK_ERR_RATELIMIT_NR 10 - --bool bch2_inconsistent_error(struct bch_fs *c) -+void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) -+{ -+ printbuf_indent_add_nextline(out, 2); -+ -+#ifdef BCACHEFS_LOG_PREFIX -+ prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); -+#endif -+} -+ -+bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) - { - set_bit(BCH_FS_error, &c->flags); - -@@ -20,11 +29,10 @@ bool bch2_inconsistent_error(struct bch_fs *c) - return false; - case BCH_ON_ERROR_fix_safe: - case BCH_ON_ERROR_ro: -- if (bch2_fs_emergency_read_only(c)) -- bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", -- journal_cur_seq(&c->journal)); -+ bch2_fs_emergency_read_only2(c, out); - return true; - case BCH_ON_ERROR_panic: -+ bch2_print_str(c, KERN_ERR, out->buf); - panic(bch2_fmt(c, "panic after error")); - return true; - default: -@@ -32,18 +40,91 @@ bool bch2_inconsistent_error(struct bch_fs *c) - } - } - --int bch2_topology_error(struct bch_fs *c) -+bool bch2_inconsistent_error(struct bch_fs *c) -+{ -+ struct printbuf buf = PRINTBUF; -+ buf.atomic++; -+ -+ printbuf_indent_add_nextline(&buf, 2); -+ -+ bool ret = __bch2_inconsistent_error(c, &buf); -+ if (ret) -+ bch_err(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+__printf(3, 0) -+static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, -+ const char *fmt, va_list args) -+{ -+ struct printbuf buf = PRINTBUF; -+ buf.atomic++; -+ -+ bch2_log_msg_start(c, &buf); -+ -+ prt_vprintf(&buf, fmt, args); -+ prt_newline(&buf); -+ -+ if (trans) -+ bch2_trans_updates_to_text(&buf, trans); -+ bool ret = __bch2_inconsistent_error(c, &buf); -+ bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); -+ -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...) -+{ -+ va_list args; -+ va_start(args, fmt); -+ bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args); -+ va_end(args); -+ return ret; -+} -+ -+bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...) -+{ -+ va_list args; -+ va_start(args, fmt); -+ bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args); -+ va_end(args); -+ return ret; -+} -+ -+int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) - { -+ prt_printf(out, "btree topology error: "); -+ - set_bit(BCH_FS_topology_error, &c->flags); -- if (!test_bit(BCH_FS_recovery_running, &c->flags)) { -- bch2_inconsistent_error(c); -+ if (!test_bit(BCH_FS_in_recovery, &c->flags)) { -+ __bch2_inconsistent_error(c, out); - return -BCH_ERR_btree_need_topology_repair; - } else { -- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -+ return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: - -BCH_ERR_btree_node_read_validate_error; - } - } - -+int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) -+{ -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_log_msg_start(c, &buf); -+ -+ va_list args; -+ va_start(args, fmt); -+ prt_vprintf(&buf, fmt, args); -+ va_end(args); -+ -+ int ret = __bch2_topology_error(c, &buf); -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ -+ printbuf_exit(&buf); -+ return ret; -+} -+ - void bch2_fatal_error(struct bch_fs *c) - { - if (bch2_fs_emergency_read_only(c)) -@@ -54,25 +135,44 @@ void bch2_io_error_work(struct work_struct *work) - { - struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); - struct bch_fs *c = ca->fs; -- bool dev; -+ -+ /* XXX: if it's reads or checksums that are failing, set it to failed */ - - down_write(&c->state_lock); -- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, -- BCH_FORCE_IF_DEGRADED); -- if (dev -- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, -- BCH_FORCE_IF_DEGRADED) -- : bch2_fs_emergency_read_only(c)) -- bch_err(ca, -- "too many IO errors, setting %s RO", -+ unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); -+ -+ if (write_errors_start && -+ time_after(jiffies, -+ write_errors_start + c->opts.write_error_timeout * HZ)) { -+ if (ca->mi.state >= BCH_MEMBER_STATE_ro) -+ goto out; -+ -+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, -+ BCH_FORCE_IF_DEGRADED); -+ struct printbuf buf = PRINTBUF; -+ __bch2_log_msg_start(ca->name, &buf); -+ -+ prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", -+ c->opts.write_error_timeout, - dev ? "device" : "filesystem"); -+ if (!dev) -+ bch2_fs_emergency_read_only2(c, &buf); -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ } -+out: - up_write(&c->state_lock); - } - - void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) - { - atomic64_inc(&ca->errors[type]); -- //queue_work(system_long_wq, &ca->io_error_work); -+ -+ if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) -+ ca->write_errors_start = jiffies; -+ -+ queue_work(system_long_wq, &ca->io_error_work); - } - - enum ask_yn { -@@ -168,15 +268,13 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) - - #endif - --static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) -+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, -+ enum bch_sb_error_id id) - { - struct fsck_err_state *s; - -- if (!test_bit(BCH_FS_fsck_running, &c->flags)) -- return NULL; -- - list_for_each_entry(s, &c->fsck_error_msgs, list) -- if (s->fmt == fmt) { -+ if (s->id == id) { - /* - * move it to the head of the list: repeated fsck errors - * are common -@@ -194,7 +292,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) - } - - INIT_LIST_HEAD(&s->list); -- s->fmt = fmt; -+ s->id = id; - list_add(&s->list, &c->fsck_error_msgs); - return s; - } -@@ -231,7 +329,7 @@ static int do_fsck_ask_yn(struct bch_fs *c, - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", question->buf); - else -- bch2_print_string_as_lines(KERN_ERR, question->buf); -+ bch2_print_str(c, KERN_ERR, question->buf); - - int ask = bch2_fsck_ask_yn(c, trans); - -@@ -244,15 +342,107 @@ static int do_fsck_ask_yn(struct bch_fs *c, - return ask; - } - -+static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, -+ enum bch_sb_error_id id, const char *msg, -+ bool *repeat, bool *print, bool *suppress) -+{ -+ bch2_sb_error_count(c, id); -+ -+ struct fsck_err_state *s = fsck_err_get(c, id); -+ if (s) { -+ /* -+ * We may be called multiple times for the same error on -+ * transaction restart - this memoizes instead of asking the user -+ * multiple times for the same error: -+ */ -+ if (s->last_msg && !strcmp(msg, s->last_msg)) { -+ *repeat = true; -+ *print = false; -+ return s; -+ } -+ -+ kfree(s->last_msg); -+ s->last_msg = kstrdup(msg, GFP_KERNEL); -+ -+ if (c->opts.ratelimit_errors && -+ s->nr >= FSCK_ERR_RATELIMIT_NR) { -+ if (s->nr == FSCK_ERR_RATELIMIT_NR) -+ *suppress = true; -+ else -+ *print = false; -+ } -+ -+ s->nr++; -+ } -+ return s; -+} -+ -+bool __bch2_count_fsck_err(struct bch_fs *c, -+ enum bch_sb_error_id id, struct printbuf *msg) -+{ -+ bch2_sb_error_count(c, id); -+ -+ mutex_lock(&c->fsck_error_msgs_lock); -+ bool print = true, repeat = false, suppress = false; -+ -+ count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); -+ mutex_unlock(&c->fsck_error_msgs_lock); -+ -+ if (suppress) -+ prt_printf(msg, "Ratelimiting new instances of previous error\n"); -+ -+ return print && !repeat; -+} -+ -+int bch2_fsck_err_opt(struct bch_fs *c, -+ enum bch_fsck_flags flags, -+ enum bch_sb_error_id err) -+{ -+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) -+ flags |= fsck_flags_extra[err]; -+ -+ if (test_bit(BCH_FS_in_fsck, &c->flags)) { -+ if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) -+ return -BCH_ERR_fsck_repair_unimplemented; -+ -+ switch (c->opts.fix_errors) { -+ case FSCK_FIX_exit: -+ return -BCH_ERR_fsck_errors_not_fixed; -+ case FSCK_FIX_yes: -+ if (flags & FSCK_CAN_FIX) -+ return -BCH_ERR_fsck_fix; -+ fallthrough; -+ case FSCK_FIX_no: -+ if (flags & FSCK_CAN_IGNORE) -+ return -BCH_ERR_fsck_ignore; -+ return -BCH_ERR_fsck_errors_not_fixed; -+ case FSCK_FIX_ask: -+ if (flags & FSCK_AUTOFIX) -+ return -BCH_ERR_fsck_fix; -+ return -BCH_ERR_fsck_ask; -+ default: -+ BUG(); -+ } -+ } else { -+ if ((flags & FSCK_AUTOFIX) && -+ (c->opts.errors == BCH_ON_ERROR_continue || -+ c->opts.errors == BCH_ON_ERROR_fix_safe)) -+ return -BCH_ERR_fsck_fix; -+ -+ if (c->opts.errors == BCH_ON_ERROR_continue && -+ (flags & FSCK_CAN_IGNORE)) -+ return -BCH_ERR_fsck_ignore; -+ return -BCH_ERR_fsck_errors_not_fixed; -+ } -+} -+ - int __bch2_fsck_err(struct bch_fs *c, - struct btree_trans *trans, - enum bch_fsck_flags flags, - enum bch_sb_error_id err, - const char *fmt, ...) - { -- struct fsck_err_state *s = NULL; - va_list args; -- bool print = true, suppressing = false, inconsistent = false, exiting = false; - struct printbuf buf = PRINTBUF, *out = &buf; - int ret = -BCH_ERR_fsck_ignore; - const char *action_orig = "fix?", *action = action_orig; -@@ -287,7 +477,12 @@ int __bch2_fsck_err(struct bch_fs *c, - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; - -- bch2_sb_error_count(c, err); -+ printbuf_indent_add_nextline(out, 2); -+ -+#ifdef BCACHEFS_LOG_PREFIX -+ if (strncmp(fmt, "bcachefs", 8)) -+ prt_printf(out, bch2_log_msg(c, "")); -+#endif - - va_start(args, fmt); - prt_vprintf(out, fmt, args); -@@ -307,42 +502,15 @@ int __bch2_fsck_err(struct bch_fs *c, - } - - mutex_lock(&c->fsck_error_msgs_lock); -- s = fsck_err_get(c, fmt); -- if (s) { -- /* -- * We may be called multiple times for the same error on -- * transaction restart - this memoizes instead of asking the user -- * multiple times for the same error: -- */ -- if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { -- ret = s->ret; -- goto err_unlock; -- } -- -- kfree(s->last_msg); -- s->last_msg = kstrdup(buf.buf, GFP_KERNEL); -- if (!s->last_msg) { -- ret = -ENOMEM; -- goto err_unlock; -- } -- -- if (c->opts.ratelimit_errors && -- !(flags & FSCK_NO_RATELIMIT) && -- s->nr >= FSCK_ERR_RATELIMIT_NR) { -- if (s->nr == FSCK_ERR_RATELIMIT_NR) -- suppressing = true; -- else -- print = false; -- } -- -- s->nr++; -+ bool repeat = false, print = true, suppress = false; -+ bool inconsistent = false, exiting = false; -+ struct fsck_err_state *s = -+ count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); -+ if (repeat) { -+ ret = s->ret; -+ goto err_unlock; - } - --#ifdef BCACHEFS_LOG_PREFIX -- if (!strncmp(fmt, "bcachefs:", 9)) -- prt_printf(out, bch2_log_msg(c, "")); --#endif -- - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) { -@@ -356,11 +524,14 @@ int __bch2_fsck_err(struct bch_fs *c, - } - - goto print; -- } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { -+ } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { - if (c->opts.errors != BCH_ON_ERROR_continue || - !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { -- prt_str(out, ", shutting down"); -+ prt_str_indented(out, ", shutting down\n" -+ "error not marked as autofix and not in fsck\n" -+ "run fsck, and forward to devs so error can be marked for self-healing"); - inconsistent = true; -+ print = true; - ret = -BCH_ERR_fsck_errors_not_fixed; - } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", "); -@@ -412,31 +583,37 @@ int __bch2_fsck_err(struct bch_fs *c, - !(flags & FSCK_CAN_IGNORE))) - ret = -BCH_ERR_fsck_errors_not_fixed; - -- if (test_bit(BCH_FS_fsck_running, &c->flags) && -+ if (test_bit(BCH_FS_in_fsck, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore)) { - exiting = true; - print = true; - } - print: -+ prt_newline(out); -+ -+ if (inconsistent) -+ __bch2_inconsistent_error(c, out); -+ else if (exiting) -+ prt_printf(out, "Unable to continue, halting\n"); -+ else if (suppress) -+ prt_printf(out, "Ratelimiting new instances of previous error\n"); -+ - if (print) { -+ /* possibly strip an empty line, from printbuf_indent_add */ -+ while (out->pos && out->buf[out->pos - 1] == ' ') -+ --out->pos; -+ printbuf_nul_terminate(out); -+ - if (bch2_fs_stdio_redirect(c)) -- bch2_print(c, "%s\n", out->buf); -+ bch2_print(c, "%s", out->buf); - else -- bch2_print_string_as_lines(KERN_ERR, out->buf); -+ bch2_print_str(c, KERN_ERR, out->buf); - } - -- if (exiting) -- bch_err(c, "Unable to continue, halting"); -- else if (suppressing) -- bch_err(c, "Ratelimiting new instances of previous error"); -- - if (s) - s->ret = ret; - -- if (inconsistent) -- bch2_inconsistent_error(c); -- - /* - * We don't yet track whether the filesystem currently has errors, for - * log_fsck_err()s: that would require us to track for every error type -@@ -498,29 +675,27 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, - prt_printf(&buf, " level=%u: ", from.level); - - bch2_bkey_val_to_text(&buf, c, k); -- prt_str(&buf, "\n "); -+ prt_newline(&buf); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - -- prt_str(&buf, ": delete?"); -- -- int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); -+ int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); - printbuf_exit(&buf); - return ret; - } - --void bch2_flush_fsck_errs(struct bch_fs *c) -+static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) - { - struct fsck_err_state *s, *n; - - mutex_lock(&c->fsck_error_msgs_lock); - - list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { -- if (s->ratelimited && s->last_msg) -- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); -+ if (print && s->ratelimited && s->last_msg) -+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); - - list_del(&s->list); - kfree(s->last_msg); -@@ -530,35 +705,53 @@ void bch2_flush_fsck_errs(struct bch_fs *c) - mutex_unlock(&c->fsck_error_msgs_lock); - } - --int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) -+void bch2_flush_fsck_errs(struct bch_fs *c) -+{ -+ __bch2_flush_fsck_errs(c, true); -+} -+ -+void bch2_free_fsck_errs(struct bch_fs *c) -+{ -+ __bch2_flush_fsck_errs(c, false); -+} -+ -+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -+ subvol_inum inum, u64 offset) - { - u32 restart_count = trans->restart_count; - int ret = 0; - -- /* XXX: we don't yet attempt to print paths when we don't know the subvol */ -- if (inum.subvol) -- ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); -+ if (inum.subvol) { -+ ret = bch2_inum_to_path(trans, inum, out); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ret; -+ } - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); -+ prt_printf(out, " offset %llu: ", offset); - - return trans_was_restarted(trans, restart_count); - } - --int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -- subvol_inum inum, u64 offset) -+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, -+ subvol_inum inum, u64 offset) - { -- int ret = bch2_inum_err_msg_trans(trans, out, inum); -- prt_printf(out, " offset %llu: ", offset); -- return ret; -+ bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); - } - --void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) -+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -+ struct bpos pos) - { -- bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); -+ int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); -+ if (ret) -+ return ret; -+ -+ prt_printf(out, " offset %llu: ", pos.offset << 8); -+ return 0; - } - --void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, -- subvol_inum inum, u64 offset) -+void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, -+ struct bpos pos) - { -- bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); -+ bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); - } -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -index 7acf2a27ca28..5123d4c86770 100644 ---- a/fs/bcachefs/error.h -+++ b/fs/bcachefs/error.h -@@ -18,6 +18,13 @@ struct work_struct; - - /* Error messages: */ - -+void __bch2_log_msg_start(const char *, struct printbuf *); -+ -+static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) -+{ -+ __bch2_log_msg_start(c->name, out); -+} -+ - /* - * Inconsistency errors: The on disk data is inconsistent. If these occur during - * initial recovery, they don't indicate a bug in the running code - we walk all -@@ -29,21 +36,10 @@ struct work_struct; - * BCH_ON_ERROR_CONTINUE mode - */ - -+bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *); - bool bch2_inconsistent_error(struct bch_fs *); -- --int bch2_topology_error(struct bch_fs *); -- --#define bch2_fs_topology_error(c, ...) \ --({ \ -- bch_err(c, "btree topology error: " __VA_ARGS__); \ -- bch2_topology_error(c); \ --}) -- --#define bch2_fs_inconsistent(c, ...) \ --({ \ -- bch_err(c, __VA_ARGS__); \ -- bch2_inconsistent_error(c); \ --}) -+__printf(2, 3) -+bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...); - - #define bch2_fs_inconsistent_on(cond, ...) \ - ({ \ -@@ -53,26 +49,21 @@ int bch2_topology_error(struct bch_fs *); - _ret; \ - }) - --/* -- * When a transaction update discovers or is causing a fs inconsistency, it's -- * helpful to also dump the pending updates: -- */ --#define bch2_trans_inconsistent(trans, ...) \ --({ \ -- bch_err(trans->c, __VA_ARGS__); \ -- bch2_dump_trans_updates(trans); \ -- bch2_inconsistent_error(trans->c); \ --}) -+__printf(2, 3) -+bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...); - --#define bch2_trans_inconsistent_on(cond, trans, ...) \ -+#define bch2_trans_inconsistent_on(cond, ...) \ - ({ \ - bool _ret = unlikely(!!(cond)); \ -- \ - if (_ret) \ -- bch2_trans_inconsistent(trans, __VA_ARGS__); \ -+ bch2_trans_inconsistent(__VA_ARGS__); \ - _ret; \ - }) - -+int __bch2_topology_error(struct bch_fs *, struct printbuf *); -+__printf(2, 3) -+int bch2_fs_topology_error(struct bch_fs *, const char *, ...); -+ - /* - * Fsck errors: inconsistency errors we detect at mount time, and should ideally - * be able to repair: -@@ -80,7 +71,7 @@ int bch2_topology_error(struct bch_fs *); - - struct fsck_err_state { - struct list_head list; -- const char *fmt; -+ enum bch_sb_error_id id; - u64 nr; - bool ratelimited; - int ret; -@@ -90,6 +81,14 @@ struct fsck_err_state { - - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) - -+bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); -+#define bch2_count_fsck_err(_c, _err, ...) \ -+ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) -+ -+int bch2_fsck_err_opt(struct bch_fs *, -+ enum bch_fsck_flags, -+ enum bch_sb_error_id); -+ - __printf(5, 6) __cold - int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, - enum bch_fsck_flags, -@@ -101,6 +100,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, - _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) - - void bch2_flush_fsck_errs(struct bch_fs *); -+void bch2_free_fsck_errs(struct bch_fs *); - - #define fsck_err_wrap(_do) \ - ({ \ -@@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *); - /* Does the error handling without logging a message */ - void bch2_io_error(struct bch_dev *, enum bch_member_error_type); - --#define bch2_dev_io_err_on(cond, ca, _type, ...) \ --({ \ -- bool _ret = (cond); \ -- \ -- if (_ret) { \ -- bch_err_dev_ratelimited(ca, __VA_ARGS__); \ -- bch2_io_error(ca, _type); \ -- } \ -- _ret; \ --}) -- --#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ --({ \ -- bool _ret = (cond); \ -- \ -- if (_ret) { \ -- bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ -- bch2_io_error(ca, _type); \ -- } \ -- _ret; \ --}) -+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -+void bch2_latency_acct(struct bch_dev *, u64, int); -+#else -+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -+#endif -+ -+static inline void bch2_account_io_success_fail(struct bch_dev *ca, -+ enum bch_member_error_type type, -+ bool success) -+{ -+ if (likely(success)) { -+ if (type == BCH_MEMBER_ERROR_write && -+ ca->write_errors_start) -+ ca->write_errors_start = 0; -+ } else { -+ bch2_io_error(ca, type); -+ } -+} -+ -+static inline void bch2_account_io_completion(struct bch_dev *ca, -+ enum bch_member_error_type type, -+ u64 submit_time, bool success) -+{ -+ if (unlikely(!ca)) -+ return; -+ -+ if (type != BCH_MEMBER_ERROR_checksum) -+ bch2_latency_acct(ca, submit_time, type); -+ -+ bch2_account_io_success_fail(ca, type, success); -+} - --int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); - int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); - --void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); - void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); - -+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); -+void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); -+ - #endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c -index 6aac579a692a..b899ee75f5b9 100644 ---- a/fs/bcachefs/extent_update.c -+++ b/fs/bcachefs/extent_update.c -@@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) - return lru + ret * 2; - } - -+#define EXTENT_ITERS_MAX 64 -+ - static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, -- unsigned *nr_iters, -- unsigned max_iters) -+ unsigned *nr_iters) - { - int ret = 0, ret2 = 0; - -- if (*nr_iters >= max_iters) { -+ if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } -@@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans, - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - -- if (*nr_iters >= max_iters) { -+ if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } -@@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans, - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - -- if (*nr_iters >= max_iters) { -+ if (*nr_iters >= EXTENT_ITERS_MAX) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += min_t(u64, k.k->size, - r_k.k->p.offset - idx); -@@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans, - return ret2 ?: ret; - } - --#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) -- - int bch2_extent_atomic_end(struct btree_trans *trans, - struct btree_iter *iter, -- struct bkey_i *insert, - struct bpos *end) - { -- struct btree_iter copy; -- struct bkey_s_c k; - unsigned nr_iters = 0; -- int ret; -- -- ret = bch2_btree_iter_traverse(iter); -- if (ret) -- return ret; -- -- *end = insert->k.p; - -- /* extent_update_to_keys(): */ -- nr_iters += 1; -- -- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, -- &nr_iters, EXTENT_ITERS_MAX / 2); -- if (ret < 0) -- return ret; -+ struct btree_iter copy; -+ bch2_trans_copy_iter(trans, ©, iter); - -- bch2_trans_copy_iter(©, iter); -+ int ret = bch2_btree_iter_traverse(trans, ©); -+ if (ret) -+ goto err; - -- for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { -+ struct bkey_s_c k; -+ for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { - unsigned offset = 0; - -- if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) -- offset = bkey_start_offset(&insert->k) - -- bkey_start_offset(k.k); -+ if (bkey_gt(iter->pos, bkey_start_pos(k.k))) -+ offset = iter->pos.offset - bkey_start_offset(k.k); - -- /* extent_handle_overwrites(): */ -- switch (bch2_extent_overlap(&insert->k, k.k)) { -- case BCH_EXTENT_OVERLAP_ALL: -- case BCH_EXTENT_OVERLAP_FRONT: -- nr_iters += 1; -- break; -- case BCH_EXTENT_OVERLAP_BACK: -- case BCH_EXTENT_OVERLAP_MIDDLE: -- nr_iters += 2; -- break; -- } -- -- ret = count_iters_for_insert(trans, k, offset, end, -- &nr_iters, EXTENT_ITERS_MAX); -+ ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); - if (ret) - break; - } -- -+err: - bch2_trans_iter_exit(trans, ©); - return ret < 0 ? ret : 0; - } -@@ -161,10 +134,8 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k) - { -- struct bpos end; -- int ret; -- -- ret = bch2_extent_atomic_end(trans, iter, k, &end); -+ struct bpos end = k->k.p; -+ int ret = bch2_extent_atomic_end(trans, iter, &end); - if (ret) - return ret; - -diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h -index 6f5cf449361a..34467db53f45 100644 ---- a/fs/bcachefs/extent_update.h -+++ b/fs/bcachefs/extent_update.h -@@ -5,7 +5,7 @@ - #include "bcachefs.h" - - int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, -- struct bkey_i *, struct bpos *); -+ struct bpos *); - int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, - struct bkey_i *); - -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -index 2d8042f853dc..1ac9897f189d 100644 ---- a/fs/bcachefs/extents.c -+++ b/fs/bcachefs/extents.c -@@ -28,6 +28,13 @@ - #include "trace.h" - #include "util.h" - -+static const char * const bch2_extent_flags_strs[] = { -+#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, -+ BCH_EXTENT_FLAGS() -+#undef x -+ NULL, -+}; -+ - static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, -@@ -38,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, - struct bch_extent_crc_unpacked, - enum bch_extent_entry_type); - -+void bch2_io_failures_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_io_failures *failed) -+{ -+ static const char * const error_types[] = { -+ "io", "checksum", "ec reconstruct", NULL -+ }; -+ -+ for (struct bch_dev_io_failures *f = failed->devs; -+ f < failed->devs + failed->nr; -+ f++) { -+ unsigned errflags = -+ ((!!f->failed_io) << 0) | -+ ((!!f->failed_csum_nr) << 1) | -+ ((!!f->failed_ec) << 2); -+ -+ if (!errflags) -+ continue; -+ -+ bch2_printbuf_make_room(out, 1024); -+ rcu_read_lock(); -+ out->atomic++; -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); -+ if (ca) -+ prt_str(out, ca->name); -+ else -+ prt_printf(out, "(invalid device %u)", f->dev); -+ --out->atomic; -+ rcu_read_unlock(); -+ -+ prt_char(out, ' '); -+ -+ if (is_power_of_2(errflags)) { -+ prt_bitflags(out, error_types, errflags); -+ prt_str(out, " error"); -+ } else { -+ prt_str(out, "errors: "); -+ prt_bitflags(out, error_types, errflags); -+ } -+ prt_newline(out); -+ } -+} -+ - struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, - unsigned dev) - { -@@ -51,7 +101,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, - } - - void bch2_mark_io_failure(struct bch_io_failures *failed, -- struct extent_ptr_decoded *p) -+ struct extent_ptr_decoded *p, -+ bool csum_error) - { - struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); - -@@ -59,53 +110,73 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, - BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - - f = &failed->devs[failed->nr++]; -- f->dev = p->ptr.dev; -- f->idx = p->idx; -- f->nr_failed = 1; -- f->nr_retries = 0; -- } else if (p->idx != f->idx) { -- f->idx = p->idx; -- f->nr_failed = 1; -- f->nr_retries = 0; -- } else { -- f->nr_failed++; -+ memset(f, 0, sizeof(*f)); -+ f->dev = p->ptr.dev; -+ } -+ -+ if (p->do_ec_reconstruct) -+ f->failed_ec = true; -+ else if (!csum_error) -+ f->failed_io = true; -+ else -+ f->failed_csum_nr++; -+} -+ -+void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, -+ unsigned dev) -+{ -+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); -+ -+ if (!f) { -+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); -+ -+ f = &failed->devs[failed->nr++]; -+ memset(f, 0, sizeof(*f)); -+ f->dev = dev; - } -+ -+ f->failed_btree_validate = true; - } - --static inline u64 dev_latency(struct bch_fs *c, unsigned dev) -+static inline u64 dev_latency(struct bch_dev *ca) - { -- struct bch_dev *ca = bch2_dev_rcu(c, dev); - return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; - } - -+static inline int dev_failed(struct bch_dev *ca) -+{ -+ return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; -+} -+ - /* - * returns true if p1 is better than p2: - */ - static inline bool ptr_better(struct bch_fs *c, - const struct extent_ptr_decoded p1, -- const struct extent_ptr_decoded p2) -+ u64 p1_latency, -+ struct bch_dev *ca1, -+ const struct extent_ptr_decoded p2, -+ u64 p2_latency) - { -- if (likely(!p1.idx && !p2.idx)) { -- u64 l1 = dev_latency(c, p1.ptr.dev); -- u64 l2 = dev_latency(c, p2.ptr.dev); -+ struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - -- /* -- * Square the latencies, to bias more in favor of the faster -- * device - we never want to stop issuing reads to the slower -- * device altogether, so that we can update our latency numbers: -- */ -- l1 *= l1; -- l2 *= l2; -+ int failed_delta = dev_failed(ca1) - dev_failed(ca2); -+ if (unlikely(failed_delta)) -+ return failed_delta < 0; - -- /* Pick at random, biased in favor of the faster device: */ -+ if (static_branch_unlikely(&bch2_force_reconstruct_read)) -+ return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - -- return bch2_get_random_u64_below(l1 + l2) > l1; -- } -+ if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) -+ return p1.do_ec_reconstruct < p2.do_ec_reconstruct; - -- if (bch2_force_reconstruct_read) -- return p1.idx > p2.idx; -+ int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; -+ if (unlikely(crc_retry_delta)) -+ return crc_retry_delta < 0; - -- return p1.idx < p2.idx; -+ /* Pick at random, biased in favor of the faster device: */ -+ -+ return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; - } - - /* -@@ -115,64 +186,117 @@ static inline bool ptr_better(struct bch_fs *c, - */ - int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, -- struct extent_ptr_decoded *pick) -+ struct extent_ptr_decoded *pick, -+ int dev) - { -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- struct bch_dev_io_failures *f; -- int ret = 0; -+ bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; -+ bool have_dirty_ptrs = false, have_pick = false; - - if (k.k->type == KEY_TYPE_error) - return -BCH_ERR_key_type_error; - - rcu_read_lock(); -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ u64 pick_latency; -+ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ have_dirty_ptrs |= !p.ptr.cached; -+ - /* - * Unwritten extent: no need to actually read, treat it as a - * hole and return 0s: - */ - if (p.ptr.unwritten) { -- ret = 0; -- break; -+ rcu_read_unlock(); -+ return 0; - } - -- /* -- * If there are any dirty pointers it's an error if we can't -- * read: -- */ -- if (!ret && !p.ptr.cached) -- ret = -BCH_ERR_no_device_to_read_from; -+ /* Are we being asked to read from a specific device? */ -+ if (dev >= 0 && p.ptr.dev != dev) -+ continue; -+ -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - -- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); -+ if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { -+ rcu_read_unlock(); -+ int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); -+ if (ret) -+ return ret; -+ rcu_read_lock(); -+ } - - if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) - continue; - -- f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; -- if (f) -- p.idx = f->nr_failed < f->nr_retries -- ? f->idx -- : f->idx + 1; -+ struct bch_dev_io_failures *f = -+ unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; -+ if (unlikely(f)) { -+ p.crc_retry_nr = f->failed_csum_nr; -+ p.has_ec &= ~f->failed_ec; - -- if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) -- p.idx++; -+ if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { -+ have_io_errors |= f->failed_io; -+ have_io_errors |= f->failed_btree_validate; -+ have_io_errors |= f->failed_ec; -+ } -+ have_csum_errors |= !!f->failed_csum_nr; -+ -+ if (p.has_ec && (f->failed_io || f->failed_csum_nr)) -+ p.do_ec_reconstruct = true; -+ else if (f->failed_io || -+ f->failed_btree_validate || -+ f->failed_csum_nr > c->opts.checksum_err_retry_nr) -+ continue; -+ } - -- if (!p.idx && p.has_ec && bch2_force_reconstruct_read) -- p.idx++; -+ have_missing_devs |= ca && !bch2_dev_is_online(ca); - -- if (p.idx > (unsigned) p.has_ec) -- continue; -+ if (!ca || !bch2_dev_is_online(ca)) { -+ if (!p.has_ec) -+ continue; -+ p.do_ec_reconstruct = true; -+ } - -- if (ret > 0 && !ptr_better(c, p, *pick)) -- continue; -+ if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) -+ p.do_ec_reconstruct = true; - -- *pick = p; -- ret = 1; -+ u64 p_latency = dev_latency(ca); -+ /* -+ * Square the latencies, to bias more in favor of the faster -+ * device - we never want to stop issuing reads to the slower -+ * device altogether, so that we can update our latency numbers: -+ */ -+ p_latency *= p_latency; -+ -+ if (!have_pick || -+ ptr_better(c, -+ p, p_latency, ca, -+ *pick, pick_latency)) { -+ *pick = p; -+ pick_latency = p_latency; -+ have_pick = true; -+ } - } - rcu_read_unlock(); - -- return ret; -+ if (have_pick) -+ return 1; -+ if (!have_dirty_ptrs) -+ return 0; -+ if (have_missing_devs) -+ return -BCH_ERR_no_device_to_read_from; -+ if (have_csum_errors) -+ return -BCH_ERR_data_read_csum_err; -+ if (have_io_errors) -+ return -BCH_ERR_data_read_io_err; -+ -+ /* -+ * If we get here, we have pointers (bkey_ptrs_validate() ensures that), -+ * but they don't point to valid devices: -+ */ -+ return -BCH_ERR_no_devices_valid; - } - - /* KEY_TYPE_btree_ptr: */ -@@ -536,29 +660,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) - { --#define set_common_fields(_dst, _src) \ -- _dst.type = 1 << type; \ -- _dst.csum_type = _src.csum_type, \ -- _dst.compression_type = _src.compression_type, \ -- _dst._compressed_size = _src.compressed_size - 1, \ -- _dst._uncompressed_size = _src.uncompressed_size - 1, \ -- _dst.offset = _src.offset -+#define common_fields(_src) \ -+ .type = BIT(type), \ -+ .csum_type = _src.csum_type, \ -+ .compression_type = _src.compression_type, \ -+ ._compressed_size = _src.compressed_size - 1, \ -+ ._uncompressed_size = _src.uncompressed_size - 1, \ -+ .offset = _src.offset - - switch (type) { - case BCH_EXTENT_ENTRY_crc32: -- set_common_fields(dst->crc32, src); -- dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); -+ dst->crc32 = (struct bch_extent_crc32) { -+ common_fields(src), -+ .csum = (u32 __force) *((__le32 *) &src.csum.lo), -+ }; - break; - case BCH_EXTENT_ENTRY_crc64: -- set_common_fields(dst->crc64, src); -- dst->crc64.nonce = src.nonce; -- dst->crc64.csum_lo = (u64 __force) src.csum.lo; -- dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); -+ dst->crc64 = (struct bch_extent_crc64) { -+ common_fields(src), -+ .nonce = src.nonce, -+ .csum_lo = (u64 __force) src.csum.lo, -+ .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), -+ }; - break; - case BCH_EXTENT_ENTRY_crc128: -- set_common_fields(dst->crc128, src); -- dst->crc128.nonce = src.nonce; -- dst->crc128.csum = src.csum; -+ dst->crc128 = (struct bch_extent_crc128) { -+ common_fields(src), -+ .nonce = src.nonce, -+ .csum = src.csum, -+ }; - break; - default: - BUG(); -@@ -991,13 +1121,14 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke - static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, - struct bch_extent_ptr *ptr) - { -- if (!opts->promote_target || -- !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) -+ unsigned target = opts->promote_target ?: opts->foreground_target; -+ -+ if (target && !bch2_dev_in_target(c, ptr->dev, target)) - return false; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - -- return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); -+ return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); - } - - void bch2_extent_ptr_set_cached(struct bch_fs *c, -@@ -1005,33 +1136,50 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, - struct bkey_s k, - struct bch_extent_ptr *ptr) - { -- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; -+ bool have_cached_ptr; -+ unsigned drop_dev = ptr->dev; - - rcu_read_lock(); -- if (!want_cached_ptr(c, opts, ptr)) { -- bch2_bkey_drop_ptr_noerror(k, ptr); -- goto out; -- } -+restart_drop_ptrs: -+ ptrs = bch2_bkey_ptrs(k); -+ have_cached_ptr = false; - -- /* -- * Stripes can't contain cached data, for - reasons. -- * -- * Possibly something we can fix in the future? -- */ -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -- if (&entry->ptr == ptr) { -- if (p.has_ec) -- bch2_bkey_drop_ptr_noerror(k, ptr); -- else -- ptr->cached = true; -- goto out; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ /* -+ * Check if it's erasure coded - stripes can't contain cached -+ * data. Possibly something we can fix in the future? -+ */ -+ if (&entry->ptr == ptr && p.has_ec) -+ goto drop; -+ -+ if (p.ptr.cached) { -+ if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { -+ bch2_bkey_drop_ptr_noerror(k, &entry->ptr); -+ ptr = NULL; -+ goto restart_drop_ptrs; -+ } -+ -+ have_cached_ptr = true; - } -+ } -+ -+ if (!ptr) -+ bkey_for_each_ptr(ptrs, ptr2) -+ if (ptr2->dev == drop_dev) -+ ptr = ptr2; - -- BUG(); --out: -+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) -+ goto drop; -+ -+ ptr->cached = true; -+ rcu_read_unlock(); -+ return; -+drop: - rcu_read_unlock(); -+ bch2_bkey_drop_ptr_noerror(k, ptr); - } - - /* -@@ -1220,6 +1368,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - bch2_extent_rebalance_to_text(out, c, &entry->rebalance); - break; - -+ case BCH_EXTENT_ENTRY_flags: -+ prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); -+ break; -+ - default: - prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; -@@ -1381,6 +1533,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - #endif - break; - } -+ case BCH_EXTENT_ENTRY_flags: -+ bkey_fsck_err_on(entry != ptrs.start, -+ c, extent_flags_not_at_start, -+ "extent flags entry not at start"); -+ break; - } - } - -@@ -1447,6 +1604,28 @@ void bch2_ptr_swab(struct bkey_s k) - } - } - -+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) -+{ -+ int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); -+ if (ret) -+ return ret; -+ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ -+ if (ptrs.start != ptrs.end && -+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { -+ ptrs.start->flags.flags = flags; -+ } else { -+ struct bch_extent_flags f = { -+ .type = BIT(BCH_EXTENT_ENTRY_flags), -+ .flags = flags, -+ }; -+ __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); -+ } -+ -+ return 0; -+} -+ - /* Generic extent code: */ - - int bch2_cut_front_s(struct bpos where, struct bkey_s k) -@@ -1492,8 +1671,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) - entry->crc128.offset += sub; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: -- break; - case BCH_EXTENT_ENTRY_rebalance: -+ case BCH_EXTENT_ENTRY_flags: - break; - } - -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -index 204d765dd74c..b8590e51b76e 100644 ---- a/fs/bcachefs/extents.h -+++ b/fs/bcachefs/extents.h -@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) - ({ \ - __label__ out; \ - \ -- (_ptr).idx = 0; \ -- (_ptr).has_ec = false; \ -+ (_ptr).has_ec = false; \ -+ (_ptr).do_ec_reconstruct = false; \ -+ (_ptr).crc_retry_nr = 0; \ - \ - __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (__extent_entry_type(_entry)) { \ -@@ -379,13 +380,6 @@ out: \ - - /* Iterate over pointers in KEY_TYPE_extent: */ - --#define extent_for_each_entry_from(_e, _entry, _start) \ -- __bkey_extent_entry_for_each_from(_start, \ -- extent_entry_last(_e), _entry) -- --#define extent_for_each_entry(_e, _entry) \ -- extent_for_each_entry_from(_e, _entry, (_e).v->start) -- - #define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) - -@@ -398,13 +392,16 @@ out: \ - - /* utility code common to all keys with pointers: */ - -+void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, -+ struct bch_io_failures *); - struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, - unsigned); - void bch2_mark_io_failure(struct bch_io_failures *, -- struct extent_ptr_decoded *); -+ struct extent_ptr_decoded *, bool); -+void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); - int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, -- struct extent_ptr_decoded *); -+ struct extent_ptr_decoded *, int); - - /* KEY_TYPE_btree_ptr: */ - -@@ -753,4 +750,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) - k->size = new_size; - } - -+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) -+{ -+ if (ptrs.start != ptrs.end && -+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) -+ return ptrs.start->flags.flags; -+ return 0; -+} -+ -+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) -+{ -+ return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); -+} -+ -+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); -+ - #endif /* _BCACHEFS_EXTENTS_H */ -diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h -index c198dfc376d6..74c0252cbd98 100644 ---- a/fs/bcachefs/extents_format.h -+++ b/fs/bcachefs/extents_format.h -@@ -79,8 +79,9 @@ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ -- x(rebalance, 5) --#define BCH_EXTENT_ENTRY_MAX 6 -+ x(rebalance, 5) \ -+ x(flags, 6) -+#define BCH_EXTENT_ENTRY_MAX 7 - - enum bch_extent_entry_type { - #define x(f, n) BCH_EXTENT_ENTRY_##f = n, -@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr { - #endif - }; - -+#define BCH_EXTENT_FLAGS() \ -+ x(poisoned, 0) -+ -+enum bch_extent_flags_e { -+#define x(n, v) BCH_EXTENT_FLAG_##n = v, -+ BCH_EXTENT_FLAGS() -+#undef x -+}; -+ -+struct bch_extent_flags { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:7, -+ flags:57; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 flags:57, -+ type:7; -+#endif -+}; -+ - /* bch_extent_rebalance: */ - #include "rebalance_format.h" - -diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h -index 43d6c341ecca..b23ce4a373c0 100644 ---- a/fs/bcachefs/extents_types.h -+++ b/fs/bcachefs/extents_types.h -@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked { - }; - - struct extent_ptr_decoded { -- unsigned idx; - bool has_ec; -+ bool do_ec_reconstruct; -+ u8 crc_retry_nr; - struct bch_extent_crc_unpacked crc; - struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec; -@@ -31,10 +32,11 @@ struct bch_io_failures { - u8 nr; - struct bch_dev_io_failures { - u8 dev; -- u8 idx; -- u8 nr_failed; -- u8 nr_retries; -- } devs[BCH_REPLICAS_MAX]; -+ unsigned failed_csum_nr:6, -+ failed_io:1, -+ failed_btree_validate:1, -+ failed_ec:1; -+ } devs[BCH_REPLICAS_MAX + 1]; - }; - - #endif /* _BCACHEFS_EXTENTS_TYPES_H */ -diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c -index 2eaffe37b5e7..0e742555cb0a 100644 ---- a/fs/bcachefs/eytzinger.c -+++ b/fs/bcachefs/eytzinger.c -@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr - return cmp(a, b, priv); - } - --static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, -+static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, const void *priv, - size_t l, size_t r) - { -- return do_cmp(base + inorder_to_eytzinger0(l, n) * size, -- base + inorder_to_eytzinger0(r, n) * size, -+ return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, -+ base1 + inorder_to_eytzinger1(r, n) * size, - cmp_func, priv); - } - --static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, -+static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, - swap_r_func_t swap_func, const void *priv, - size_t l, size_t r) - { -- do_swap(base + inorder_to_eytzinger0(l, n) * size, -- base + inorder_to_eytzinger0(r, n) * size, -+ do_swap(base1 + inorder_to_eytzinger1(l, n) * size, -+ base1 + inorder_to_eytzinger1(r, n) * size, - size, swap_func, priv); - } - --void eytzinger0_sort_r(void *base, size_t n, size_t size, -- cmp_r_func_t cmp_func, -- swap_r_func_t swap_func, -- const void *priv) -+static void eytzinger1_sort_r(void *base1, size_t n, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv) - { -- int i, j, k; -+ unsigned i, j, k; - - /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) - swap_func = NULL; - - if (!swap_func) { -- if (is_aligned(base, size, 8)) -+ if (is_aligned(base1, size, 8)) - swap_func = SWAP_WORDS_64; -- else if (is_aligned(base, size, 4)) -+ else if (is_aligned(base1, size, 4)) - swap_func = SWAP_WORDS_32; - else - swap_func = SWAP_BYTES; - } - - /* heapify */ -- for (i = n / 2 - 1; i >= 0; --i) { -+ for (i = n / 2; i >= 1; --i) { - /* Find the sift-down path all the way to the leaves. */ -- for (j = i; k = j * 2 + 1, k + 1 < n;) -- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; -+ for (j = i; k = j * 2, k < n;) -+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ -- if (j * 2 + 2 == n) -- j = j * 2 + 1; -+ if (j * 2 == n) -+ j *= 2; - - /* Backtrack to the correct location. */ -- while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) -- j = (j - 1) / 2; -+ while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) -+ j /= 2; - - /* Shift the element into its correct place. */ - for (k = j; j != i;) { -- j = (j - 1) / 2; -- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); -+ j /= 2; -+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - - /* sort */ -- for (i = n - 1; i > 0; --i) { -- eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); -+ for (i = n; i > 1; --i) { -+ eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); - - /* Find the sift-down path all the way to the leaves. */ -- for (j = 0; k = j * 2 + 1, k + 1 < i;) -- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; -+ for (j = 1; k = j * 2, k + 1 < i;) -+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ -- if (j * 2 + 2 == i) -- j = j * 2 + 1; -+ if (j * 2 + 1 == i) -+ j *= 2; - - /* Backtrack to the correct location. */ -- while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) -- j = (j - 1) / 2; -+ while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) -+ j /= 2; - - /* Shift the element into its correct place. */ -- for (k = j; j;) { -- j = (j - 1) / 2; -- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); -+ for (k = j; j > 1;) { -+ j /= 2; -+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - } - -+void eytzinger0_sort_r(void *base, size_t n, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv) -+{ -+ void *base1 = base - size; -+ -+ return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); -+} -+ - void eytzinger0_sort(void *base, size_t n, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h -index 0541192d7bc0..643c1f716061 100644 ---- a/fs/bcachefs/eytzinger.h -+++ b/fs/bcachefs/eytzinger.h -@@ -6,6 +6,7 @@ - #include - - #ifdef EYTZINGER_DEBUG -+#include - #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) - #else - #define EYTZINGER_BUG_ON(cond) -@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) - return rounddown_pow_of_two(size + 1) - 1; - } - --/* -- * eytzinger1_next() and eytzinger1_prev() have the nice properties that -- * -- * eytzinger1_next(0) == eytzinger1_first()) -- * eytzinger1_prev(0) == eytzinger1_last()) -- * -- * eytzinger1_prev(eytzinger1_first()) == 0 -- * eytzinger1_next(eytzinger1_last()) == 0 -- */ -- - static inline unsigned eytzinger1_next(unsigned i, unsigned size) - { -- EYTZINGER_BUG_ON(i > size); -+ EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_right_child(i) <= size) { - i = eytzinger1_right_child(i); - -- i <<= __fls(size + 1) - __fls(i); -+ i <<= __fls(size) - __fls(i); - i >>= i > size; - } else { - i >>= ffz(i) + 1; -@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) - - static inline unsigned eytzinger1_prev(unsigned i, unsigned size) - { -- EYTZINGER_BUG_ON(i > size); -+ EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_left_child(i) <= size) { - i = eytzinger1_left_child(i) + 1; - -- i <<= __fls(size + 1) - __fls(i); -+ i <<= __fls(size) - __fls(i); - i -= 1; - i >>= i > size; - } else { -@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) - (_i) != -1; \ - (_i) = eytzinger0_next((_i), (_size))) - -+#define eytzinger0_for_each_prev(_i, _size) \ -+ for (unsigned (_i) = eytzinger0_last((_size)); \ -+ (_i) != -1; \ -+ (_i) = eytzinger0_prev((_i), (_size))) -+ - /* return greatest node <= @search, or -1 if not found */ - static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) - { -- unsigned i, n = 0; -- -- if (!nr) -- return -1; -- -- do { -- i = n; -- n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); -- } while (n < nr); -- -- if (n & 1) { -- /* -- * @i was greater than @search, return previous node: -- * -- * if @i was leftmost/smallest element, -- * eytzinger0_prev(eytzinger0_first())) returns -1, as expected -- */ -- return eytzinger0_prev(i, nr); -- } else { -- return i; -- } -+ void *base1 = base - size; -+ unsigned n = 1; -+ -+ while (n <= nr) -+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); -+ n >>= __ffs(n) + 1; -+ return n - 1; - } - -+/* return smallest node > @search, or -1 if not found */ - static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) - { -- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); -+ void *base1 = base - size; -+ unsigned n = 1; - -- /* -- * if eytitzinger0_find_le() returned -1 - no element was <= search - we -- * want to return the first element; next/prev identities mean this work -- * as expected -- * -- * similarly if find_le() returns last element, we should return -1; -- * identities mean this all works out: -- */ -- return eytzinger0_next(idx, nr); -+ while (n <= nr) -+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); -+ n >>= __ffs(n + 1) + 1; -+ return n - 1; - } - -+/* return smallest node >= @search, or -1 if not found */ - static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) - { -- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); -- -- if (idx < nr && !cmp(base + idx * size, search)) -- return idx; -+ void *base1 = base - size; -+ unsigned n = 1; - -- return eytzinger0_next(idx, nr); -+ while (n <= nr) -+ n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); -+ n >>= __ffs(n + 1) + 1; -+ return n - 1; - } - - #define eytzinger0_find(base, nr, size, _cmp, search) \ - ({ \ -- void *_base = (base); \ -+ size_t _size = (size); \ -+ void *_base1 = (void *)(base) - _size; \ - const void *_search = (search); \ - size_t _nr = (nr); \ -- size_t _size = (size); \ -- size_t _i = 0; \ -+ size_t _i = 1; \ - int _res; \ - \ -- while (_i < _nr && \ -- (_res = _cmp(_search, _base + _i * _size))) \ -- _i = eytzinger0_child(_i, _res > 0); \ -- _i; \ -+ while (_i <= _nr && \ -+ (_res = _cmp(_search, _base1 + _i * _size))) \ -+ _i = eytzinger1_child(_i, _res > 0); \ -+ _i - 1; \ - }) - - void eytzinger0_sort_r(void *, size_t, size_t, -diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c -new file mode 100644 -index 000000000000..2faec143eb31 ---- /dev/null -+++ b/fs/bcachefs/fast_list.c -@@ -0,0 +1,156 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+/* -+ * Fast, unordered lists -+ * -+ * Supports add, remove, and iterate -+ * -+ * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot -+ * allocation and freeing. -+ * -+ * This means that adding, removing, and iterating over items is lockless, -+ * except when refilling/emptying the percpu slot buffers. -+ */ -+ -+#include "fast_list.h" -+ -+struct fast_list_pcpu { -+ u32 nr; -+ u32 entries[31]; -+}; -+ -+static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) -+{ -+ int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); -+ if (unlikely(idx < 0)) -+ return 0; -+ -+ if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { -+ ida_free(&l->slots_allocated, idx); -+ return 0; -+ } -+ -+ return idx; -+} -+ -+/** -+ * fast_list_get_idx - get a slot in a fast_list -+ * @l: list to get slot in -+ * -+ * This allocates a slot in the radix tree without storing to it, so that we can -+ * take the potential memory allocation failure early and do the list add later -+ * when we can't take an allocation failure. -+ * -+ * Returns: positive integer on success, -ENOMEM on failure -+ */ -+int fast_list_get_idx(struct fast_list *l) -+{ -+ unsigned long flags; -+ int idx; -+retry: -+ local_irq_save(flags); -+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); -+ -+ if (unlikely(!lp->nr)) { -+ u32 entries[16], nr = 0; -+ -+ local_irq_restore(flags); -+ while (nr < ARRAY_SIZE(entries) && -+ (idx = fast_list_alloc_idx(l, GFP_KERNEL))) -+ entries[nr++] = idx; -+ local_irq_save(flags); -+ -+ lp = this_cpu_ptr(l->buffer); -+ -+ while (nr && lp->nr < ARRAY_SIZE(lp->entries)) -+ lp->entries[lp->nr++] = entries[--nr]; -+ -+ if (unlikely(nr)) { -+ local_irq_restore(flags); -+ while (nr) -+ ida_free(&l->slots_allocated, entries[--nr]); -+ goto retry; -+ } -+ -+ if (unlikely(!lp->nr)) { -+ local_irq_restore(flags); -+ return -ENOMEM; -+ } -+ } -+ -+ idx = lp->entries[--lp->nr]; -+ local_irq_restore(flags); -+ -+ return idx; -+} -+ -+/** -+ * fast_list_add - add an item to a fast_list -+ * @l: list -+ * @item: item to add -+ * -+ * Allocates a slot in the radix tree and stores to it and then returns the -+ * slot index, which must be passed to fast_list_remove(). -+ * -+ * Returns: positive integer on success, -ENOMEM on failure -+ */ -+int fast_list_add(struct fast_list *l, void *item) -+{ -+ int idx = fast_list_get_idx(l); -+ if (idx < 0) -+ return idx; -+ -+ *genradix_ptr_inlined(&l->items, idx) = item; -+ return idx; -+} -+ -+/** -+ * fast_list_remove - remove an item from a fast_list -+ * @l: list -+ * @idx: item's slot index -+ * -+ * Zeroes out the slot in the radix tree and frees the slot for future -+ * fast_list_add() operations. -+ */ -+void fast_list_remove(struct fast_list *l, unsigned idx) -+{ -+ u32 entries[16], nr = 0; -+ unsigned long flags; -+ -+ if (!idx) -+ return; -+ -+ *genradix_ptr_inlined(&l->items, idx) = NULL; -+ -+ local_irq_save(flags); -+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); -+ -+ if (unlikely(lp->nr == ARRAY_SIZE(lp->entries))) -+ while (nr < ARRAY_SIZE(entries)) -+ entries[nr++] = lp->entries[--lp->nr]; -+ -+ lp->entries[lp->nr++] = idx; -+ local_irq_restore(flags); -+ -+ if (unlikely(nr)) -+ while (nr) -+ ida_free(&l->slots_allocated, entries[--nr]); -+} -+ -+void fast_list_exit(struct fast_list *l) -+{ -+ /* XXX: warn if list isn't empty */ -+ free_percpu(l->buffer); -+ ida_destroy(&l->slots_allocated); -+ genradix_free(&l->items); -+} -+ -+int fast_list_init(struct fast_list *l) -+{ -+ genradix_init(&l->items); -+ ida_init(&l->slots_allocated); -+ l->buffer = alloc_percpu(*l->buffer); -+ if (!l->buffer) -+ return -ENOMEM; -+ return 0; -+} -diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h -new file mode 100644 -index 000000000000..73c9bf591fd6 ---- /dev/null -+++ b/fs/bcachefs/fast_list.h -@@ -0,0 +1,41 @@ -+#ifndef _LINUX_FAST_LIST_H -+#define _LINUX_FAST_LIST_H -+ -+#include -+#include -+#include -+ -+struct fast_list_pcpu; -+ -+struct fast_list { -+ GENRADIX(void *) items; -+ struct ida slots_allocated;; -+ struct fast_list_pcpu __percpu -+ *buffer; -+}; -+ -+static inline void *fast_list_iter_peek(struct genradix_iter *iter, -+ struct fast_list *list) -+{ -+ void **p; -+ while ((p = genradix_iter_peek(iter, &list->items)) && !*p) -+ genradix_iter_advance(iter, &list->items); -+ -+ return p ? *p : NULL; -+} -+ -+#define fast_list_for_each_from(_list, _iter, _i, _start) \ -+ for (_iter = genradix_iter_init(&(_list)->items, _start); \ -+ (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ -+ genradix_iter_advance(&(_iter), &(_list)->items)) -+ -+#define fast_list_for_each(_list, _iter, _i) \ -+ fast_list_for_each_from(_list, _iter, _i, 0) -+ -+int fast_list_get_idx(struct fast_list *l); -+int fast_list_add(struct fast_list *l, void *item); -+void fast_list_remove(struct fast_list *l, unsigned idx); -+void fast_list_exit(struct fast_list *l); -+int fast_list_init(struct fast_list *l); -+ -+#endif /* _LINUX_FAST_LIST_H */ -diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c -index ab1d5db2fa56..e3a75dcca60c 100644 ---- a/fs/bcachefs/fs-io-buffered.c -+++ b/fs/bcachefs/fs-io-buffered.c -@@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, - if (!get_more) - break; - -+ unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); -+ -+ if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) -+ break; -+ -+ unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); -+ -+ /* ensure proper alignment */ -+ order = min(order, __ffs(folio_offset|BIT(31))); -+ - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - -- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); -+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); - if (!folio) - break; - -@@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans, - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; -- int flags = BCH_READ_RETRY_IF_STALE| -- BCH_READ_MAY_PROMOTE; -+ int flags = BCH_READ_retry_if_stale| -+ BCH_READ_may_promote; - int ret = 0; - -- rbio->c = c; -- rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); -@@ -175,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans, - if (ret) - goto err; - -- bch2_btree_iter_set_snapshot(&iter, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - -- bch2_btree_iter_set_pos(&iter, -+ bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -211,14 +219,29 @@ static void bchfs_read(struct btree_trans *trans, - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) -- flags |= BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_last_fragment; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); -+ /* -+ * Careful there's a landmine here if bch2_read_extent() ever -+ * starts returning transaction restarts here. -+ * -+ * We've changed rbio->bi_iter.bi_size to be "bytes we can read -+ * from this extent" with the swap call, and we restore it -+ * below. That restore needs to come before checking for -+ * errors. -+ * -+ * But unlike __bch2_read(), we use the rbio bvec iter, not one -+ * on the stack, so we can't do the restore right after the -+ * bch2_read_extent() call: we don't own that iterator anymore -+ * if BCH_READ_last_fragment is set, since we may have submitted -+ * that rbio instead of cloning it. -+ */ - -- if (flags & BCH_READ_LAST_FRAGMENT) -+ if (flags & BCH_READ_last_fragment) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); -@@ -232,7 +255,8 @@ static void bchfs_read(struct btree_trans *trans, - - if (ret) { - struct printbuf buf = PRINTBUF; -- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); -@@ -280,12 +304,13 @@ void bch2_readahead(struct readahead_control *ractl) - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), -- opts); -+ c, -+ opts, -+ bch2_readpages_end_io); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); -- rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(trans, rbio, inode_inum(inode), -@@ -323,10 +348,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), -- opts); -+ c, -+ opts, -+ bch2_read_single_folio_end_io); - rbio->bio.bi_private = &done; -- rbio->bio.bi_end_io = bch2_read_single_folio_end_io; -- - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); -@@ -420,7 +445,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) - } - } - -- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { -+ if (io->op.flags & BCH_WRITE_wrote_data_inline) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - -diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c -index 2089c36b5866..1f5154d9676b 100644 ---- a/fs/bcachefs/fs-io-direct.c -+++ b/fs/bcachefs/fs-io-direct.c -@@ -3,6 +3,7 @@ - - #include "bcachefs.h" - #include "alloc_foreground.h" -+#include "enumerated_ref.h" - #include "fs.h" - #include "fs-io.h" - #include "fs-io-direct.h" -@@ -73,6 +74,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - struct blk_plug plug; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); -+ bool split = false; - size_t shorten; - ssize_t ret; - -@@ -99,8 +101,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - GFP_KERNEL, - &c->dio_read_bioset); - -- bio->bi_end_io = bch2_direct_IO_read_endio; -- - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - -@@ -133,12 +133,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - - goto start; - while (iter->count) { -+ split = true; -+ - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); -- bio->bi_end_io = bch2_direct_IO_read_split_endio; - start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; -@@ -160,7 +161,15 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - if (iter->count) - closure_get(&dio->cl); - -- bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); -+ struct bch_read_bio *rbio = -+ rbio_init(bio, -+ c, -+ opts, -+ split -+ ? bch2_direct_IO_read_split_endio -+ : bch2_direct_IO_read_endio); -+ -+ bch2_read(c, rbio, inode_inum(inode)); - } - - blk_finish_plug(&plug); -@@ -393,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); - -- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); -@@ -511,8 +520,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) -- dio->op.flags |= BCH_WRITE_SYNC; -- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; -+ dio->op.flags |= BCH_WRITE_sync; -+ dio->op.flags |= BCH_WRITE_check_enospc; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); -@@ -598,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) - return -EROFS; - - inode_lock(&inode->v); -@@ -667,7 +676,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) - bio_put(bio); - inode_dio_end(&inode->v); - err_put_write_ref: -- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - goto out; - } - -diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c -index e072900e6a5b..fbae9c1de746 100644 ---- a/fs/bcachefs/fs-io-pagecache.c -+++ b/fs/bcachefs/fs-io-pagecache.c -@@ -605,10 +605,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; -- unsigned len; -- loff_t isize; - vm_fault_t ret; - -+ loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c)); -+ unsigned offset = file_offset - folio_pos(folio); -+ unsigned len = max(PAGE_SIZE, block_bytes(c)); -+ -+ BUG_ON(offset + len > folio_size(folio)); -+ - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); -@@ -623,24 +627,24 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) - bch2_pagecache_add_get(inode); - - folio_lock(folio); -- isize = i_size_read(&inode->v); -+ u64 isize = i_size_read(&inode->v); - -- if (folio->mapping != mapping || folio_pos(folio) >= isize) { -+ if (folio->mapping != mapping || file_offset >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - -- len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); -+ len = min_t(unsigned, len, isize - file_offset); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: -- bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { -+ bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - -- bch2_set_folio_dirty(c, inode, folio, &res, 0, len); -+ bch2_set_folio_dirty(c, inode, folio, &res, offset, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c -index 717e7b94c66f..b1e9ee28fc0f 100644 ---- a/fs/bcachefs/fs-io.c -+++ b/fs/bcachefs/fs-io.c -@@ -7,6 +7,7 @@ - #include "btree_update.h" - #include "buckets.h" - #include "clock.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extents.h" - #include "extent_update.h" -@@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio) - struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - - closure_put(bio->cl); -- percpu_ref_put(&bio->ca->io_ref); -+ enumerated_ref_put(&bio->ca->io_ref[WRITE], -+ BCH_DEV_WRITE_REF_nocow_flush); - bio_put(&bio->bio); - } - -@@ -71,7 +73,8 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); -- if (ca && !percpu_ref_tryget(&ca->io_ref)) -+ if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], -+ BCH_DEV_WRITE_REF_nocow_flush)) - ca = NULL; - rcu_read_unlock(); - -@@ -144,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c, - void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) - { -- bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, -- "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", -- inode->v.i_ino, (u64) inode->v.i_blocks, sectors, -- inode->ei_inode.bi_sectors); -+ if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", -+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, -+ inode->ei_inode.bi_sectors); -+ -+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); -+ if (print) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ -+ if (sectors < 0) -+ sectors = -inode->v.i_blocks; -+ else -+ sectors = 0; -+ } -+ - inode->v.i_blocks += sectors; - - #ifdef CONFIG_BCACHEFS_QUOTA -@@ -205,7 +222,7 @@ static int bch2_flush_inode(struct bch_fs *c, - if (c->opts.journal_flush_disabled) - return 0; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) - return -EROFS; - - u64 seq; -@@ -213,7 +230,7 @@ static int bch2_flush_inode(struct bch_fs *c, - bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: - bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: - bch2_inode_flush_nocow_writes(c, inode); -- bch2_write_ref_put(c, BCH_WRITE_REF_fsync); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); - return ret; - } - -@@ -502,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap, - goto err; - } - -- bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && -- !bch2_journal_error(&c->journal), c, -- "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", -- inode->v.i_ino, (u64) inode->v.i_blocks, -- inode->ei_inode.bi_sectors); -+ if (unlikely(!inode->v.i_size && inode->v.i_blocks && -+ !bch2_journal_error(&c->journal))) { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, -+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", -+ inode->v.i_ino, (u64) inode->v.i_blocks, -+ inode->ei_inode.bi_sectors); -+ -+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); -+ if (print) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ } - - ret = bch2_setattr_nonsize(idmap, inode, iattr); - err: -@@ -636,9 +662,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, - if (ret) - goto bkey_err; - -- bch2_btree_iter_set_snapshot(&iter, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - if ((ret = bkey_err(k))) - goto bkey_err; - -@@ -649,13 +675,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, - /* already reserved */ - if (bkey_extent_is_reservation(k) && - bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (bkey_extent_is_data(k.k) && - !(mode & FALLOC_FL_ZERO_RANGE)) { -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - continue; - } - -@@ -676,7 +702,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, - if (ret) - goto bkey_err; - } -- bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); - - if (ret) - goto bkey_err; -@@ -795,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) - return -EROFS; - - inode_lock(&inode->v); -@@ -819,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, - err: - bch2_pagecache_block_put(inode); - inode_unlock(&inode->v); -- bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); - - return bch2_err_class(ret); - } -@@ -999,17 +1025,28 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ -- if (k.k->p.inode != inode->v.i_ino) { -- next_hole = bch2_seek_pagecache_hole(&inode->v, -- offset, MAX_LFS_FILESIZE, 0, false); -- break; -- } else if (!bkey_extent_is_data(k.k)) { -- next_hole = bch2_seek_pagecache_hole(&inode->v, -- max(offset, bkey_start_offset(k.k) << 9), -- k.k->p.offset << 9, 0, false); -- -- if (next_hole < k.k->p.offset << 9) -+ if (k.k->p.inode != inode->v.i_ino || -+ !bkey_extent_is_data(k.k)) { -+ loff_t start_offset = k.k->p.inode == inode->v.i_ino -+ ? max(offset, bkey_start_offset(k.k) << 9) -+ : offset; -+ loff_t end_offset = k.k->p.inode == inode->v.i_ino -+ ? MAX_LFS_FILESIZE -+ : k.k->p.offset << 9; -+ -+ /* -+ * Found a hole in the btree, now make sure it's -+ * a hole in the pagecache. We might have to -+ * keep searching if this hole is entirely dirty -+ * in the page cache: -+ */ -+ bch2_trans_unlock(trans); -+ loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, -+ start_offset, end_offset, 0, false); -+ if (pagecache_hole < end_offset) { -+ next_hole = pagecache_hole; - break; -+ } - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -index 15725b4ce393..0e99d940a320 100644 ---- a/fs/bcachefs/fs-ioctl.c -+++ b/fs/bcachefs/fs-ioctl.c -@@ -5,8 +5,8 @@ - #include "chardev.h" - #include "dirent.h" - #include "fs.h" --#include "fs-common.h" - #include "fs-ioctl.h" -+#include "namei.h" - #include "quota.h" - - #include -@@ -21,180 +21,6 @@ - #define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ - #define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - --struct flags_set { -- unsigned mask; -- unsigned flags; -- -- unsigned projid; -- -- bool set_projinherit; -- bool projinherit; --}; -- --static int bch2_inode_flags_set(struct btree_trans *trans, -- struct bch_inode_info *inode, -- struct bch_inode_unpacked *bi, -- void *p) --{ -- struct bch_fs *c = inode->v.i_sb->s_fs_info; -- /* -- * We're relying on btree locking here for exclusion with other ioctl -- * calls - use the flags in the btree (@bi), not inode->i_flags: -- */ -- struct flags_set *s = p; -- unsigned newflags = s->flags; -- unsigned oldflags = bi->bi_flags & s->mask; -- -- if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && -- !capable(CAP_LINUX_IMMUTABLE)) -- return -EPERM; -- -- if (!S_ISREG(bi->bi_mode) && -- !S_ISDIR(bi->bi_mode) && -- (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) -- return -EINVAL; -- -- if (s->set_projinherit) { -- bi->bi_fields_set &= ~(1 << Inode_opt_project); -- bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); -- } -- -- bi->bi_flags &= ~s->mask; -- bi->bi_flags |= newflags; -- -- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -- return 0; --} -- --static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) --{ -- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); -- -- return put_user(flags, arg); --} -- --static int bch2_ioc_setflags(struct bch_fs *c, -- struct file *file, -- struct bch_inode_info *inode, -- void __user *arg) --{ -- struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; -- unsigned uflags; -- int ret; -- -- if (get_user(uflags, (int __user *) arg)) -- return -EFAULT; -- -- s.flags = map_flags_rev(bch_flags_to_uflags, uflags); -- if (uflags) -- return -EOPNOTSUPP; -- -- ret = mnt_want_write_file(file); -- if (ret) -- return ret; -- -- inode_lock(&inode->v); -- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { -- ret = -EACCES; -- goto setflags_out; -- } -- -- mutex_lock(&inode->ei_update_lock); -- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: -- bch2_write_inode(c, inode, bch2_inode_flags_set, &s, -- ATTR_CTIME); -- mutex_unlock(&inode->ei_update_lock); -- --setflags_out: -- inode_unlock(&inode->v); -- mnt_drop_write_file(file); -- return ret; --} -- --static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, -- struct fsxattr __user *arg) --{ -- struct fsxattr fa = { 0 }; -- -- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); -- -- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) -- fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; -- -- fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; -- -- if (copy_to_user(arg, &fa, sizeof(fa))) -- return -EFAULT; -- -- return 0; --} -- --static int fssetxattr_inode_update_fn(struct btree_trans *trans, -- struct bch_inode_info *inode, -- struct bch_inode_unpacked *bi, -- void *p) --{ -- struct flags_set *s = p; -- -- if (s->projid != bi->bi_project) { -- bi->bi_fields_set |= 1U << Inode_opt_project; -- bi->bi_project = s->projid; -- } -- -- return bch2_inode_flags_set(trans, inode, bi, p); --} -- --static int bch2_ioc_fssetxattr(struct bch_fs *c, -- struct file *file, -- struct bch_inode_info *inode, -- struct fsxattr __user *arg) --{ -- struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; -- struct fsxattr fa; -- int ret; -- -- if (copy_from_user(&fa, arg, sizeof(fa))) -- return -EFAULT; -- -- s.set_projinherit = true; -- s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; -- fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; -- -- s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); -- if (fa.fsx_xflags) -- return -EOPNOTSUPP; -- -- if (fa.fsx_projid >= U32_MAX) -- return -EINVAL; -- -- /* -- * inode fields accessible via the xattr interface are stored with a +1 -- * bias, so that 0 means unset: -- */ -- s.projid = fa.fsx_projid + 1; -- -- ret = mnt_want_write_file(file); -- if (ret) -- return ret; -- -- inode_lock(&inode->v); -- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { -- ret = -EACCES; -- goto err; -- } -- -- mutex_lock(&inode->ei_update_lock); -- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: -- bch2_set_projid(c, inode, fa.fsx_projid) ?: -- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -- ATTR_CTIME); -- mutex_unlock(&inode->ei_update_lock); --err: -- inode_unlock(&inode->v); -- mnt_drop_write_file(file); -- return ret; --} -- - static int bch2_reinherit_attrs_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, -@@ -218,7 +44,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, - int ret = 0; - subvol_inum inum; - -- kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); -+ kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); - if (!kname) - return -ENOMEM; - -@@ -346,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) - if (get_user(flags, arg)) - return -EFAULT; - -- bch_notice(c, "shutdown by ioctl type %u", flags); -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "shutdown by ioctl type %u", flags); - - switch (flags) { - case FSOP_GOING_FLAGS_DEFAULT: -@@ -354,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) - if (ret) - break; - bch2_journal_flush(&c->journal); -- bch2_fs_emergency_read_only(c); -+ bch2_fs_emergency_read_only2(c, &buf); - bdev_thaw(c->vfs_sb->s_bdev); - break; - case FSOP_GOING_FLAGS_LOGFLUSH: - bch2_journal_flush(&c->journal); - fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: -- bch2_fs_emergency_read_only(c); -+ bch2_fs_emergency_read_only2(c, &buf); - break; - default: - ret = -EINVAL; -- break; -+ goto noprint; - } - -+ bch2_print_str(c, KERN_ERR, buf.buf); -+noprint: -+ printbuf_exit(&buf); - return ret; - } - -@@ -515,10 +347,12 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, - ret = -ENOENT; - goto err; - } -- ret = __bch2_unlink(dir, victim, true); -+ -+ ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: -+ __bch2_unlink(dir, victim, true); - if (!ret) { - fsnotify_rmdir(dir, victim); -- d_delete(victim); -+ d_invalidate(victim); - } - err: - inode_unlock(dir); -@@ -534,23 +368,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) - long ret; - - switch (cmd) { -- case FS_IOC_GETFLAGS: -- ret = bch2_ioc_getflags(inode, (int __user *) arg); -- break; -- -- case FS_IOC_SETFLAGS: -- ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); -- break; -- -- case FS_IOC_FSGETXATTR: -- ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); -- break; -- -- case FS_IOC_FSSETXATTR: -- ret = bch2_ioc_fssetxattr(c, file, inode, -- (void __user *) arg); -- break; -- - case BCHFS_IOC_REINHERIT_ATTRS: - ret = bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); -diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h -index d30f9bb056fd..a657e4994b71 100644 ---- a/fs/bcachefs/fs-ioctl.h -+++ b/fs/bcachefs/fs-ioctl.h -@@ -2,79 +2,6 @@ - #ifndef _BCACHEFS_FS_IOCTL_H - #define _BCACHEFS_FS_IOCTL_H - --/* Inode flags: */ -- --/* bcachefs inode flags -> vfs inode flags: */ --static const __maybe_unused unsigned bch_flags_to_vfs[] = { -- [__BCH_INODE_sync] = S_SYNC, -- [__BCH_INODE_immutable] = S_IMMUTABLE, -- [__BCH_INODE_append] = S_APPEND, -- [__BCH_INODE_noatime] = S_NOATIME, --}; -- --/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ --static const __maybe_unused unsigned bch_flags_to_uflags[] = { -- [__BCH_INODE_sync] = FS_SYNC_FL, -- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, -- [__BCH_INODE_append] = FS_APPEND_FL, -- [__BCH_INODE_nodump] = FS_NODUMP_FL, -- [__BCH_INODE_noatime] = FS_NOATIME_FL, --}; -- --/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ --static const __maybe_unused unsigned bch_flags_to_xflags[] = { -- [__BCH_INODE_sync] = FS_XFLAG_SYNC, -- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, -- [__BCH_INODE_append] = FS_XFLAG_APPEND, -- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, -- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, -- //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; --}; -- --#define set_flags(_map, _in, _out) \ --do { \ -- unsigned _i; \ -- \ -- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -- if ((_in) & (1 << _i)) \ -- (_out) |= _map[_i]; \ -- else \ -- (_out) &= ~_map[_i]; \ --} while (0) -- --#define map_flags(_map, _in) \ --({ \ -- unsigned _out = 0; \ -- \ -- set_flags(_map, _in, _out); \ -- _out; \ --}) -- --#define map_flags_rev(_map, _in) \ --({ \ -- unsigned _i, _out = 0; \ -- \ -- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -- if ((_in) & _map[_i]) { \ -- (_out) |= 1 << _i; \ -- (_in) &= ~_map[_i]; \ -- } \ -- (_out); \ --}) -- --#define map_defined(_map) \ --({ \ -- unsigned _in = ~0; \ -- \ -- map_flags_rev(_map, _in); \ --}) -- --/* Set VFS inode flags from bcachefs inode: */ --static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) --{ -- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); --} -- - long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); - long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); - -diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -index 90ade8f648d9..3813658e72ad 100644 ---- a/fs/bcachefs/fs.c -+++ b/fs/bcachefs/fs.c -@@ -11,7 +11,6 @@ - #include "errcode.h" - #include "extents.h" - #include "fs.h" --#include "fs-common.h" - #include "fs-io.h" - #include "fs-ioctl.h" - #include "fs-io-buffered.h" -@@ -22,6 +21,7 @@ - #include "io_read.h" - #include "journal.h" - #include "keylist.h" -+#include "namei.h" - #include "quota.h" - #include "rebalance.h" - #include "snapshot.h" -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -51,6 +52,29 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_subvolume *); - -+/* Set VFS inode flags from bcachefs inode: */ -+static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ static const __maybe_unused unsigned bch_flags_to_vfs[] = { -+ [__BCH_INODE_sync] = S_SYNC, -+ [__BCH_INODE_immutable] = S_IMMUTABLE, -+ [__BCH_INODE_append] = S_APPEND, -+ [__BCH_INODE_noatime] = S_NOATIME, -+ }; -+ -+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -+ -+ if (bch2_inode_casefold(c, &inode->ei_inode)) -+ inode->v.i_flags |= S_CASEFOLD; -+ else -+ inode->v.i_flags &= ~S_CASEFOLD; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_has_case_insensitive) -+ inode->v.i_flags &= ~S_NO_CASEFOLD; -+ else -+ inode->v.i_flags |= S_NO_CASEFOLD; -+} -+ - void bch2_inode_update_after_write(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, -@@ -79,7 +103,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, - - inode->ei_inode = *bi; - -- bch2_inode_flags_to_vfs(inode); -+ bch2_inode_flags_to_vfs(c, inode); - } - - int __must_check bch2_write_inode(struct bch_fs *c, -@@ -88,7 +112,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, - void *p, unsigned fields) - { - struct btree_trans *trans = bch2_trans_get(c); -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; - retry: -@@ -172,11 +196,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, - return ret; - } - --static bool subvol_inum_eq(subvol_inum a, subvol_inum b) --{ -- return a.subvol == b.subvol && a.inum == b.inum; --} -- - static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) - { - const subvol_inum *inum = data; -@@ -333,9 +352,8 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btre - if (!trans) { - __wait_on_freeing_inode(c, inode, inum); - } else { -- bch2_trans_unlock(trans); -- __wait_on_freeing_inode(c, inode, inum); -- int ret = bch2_trans_relock(trans); -+ int ret = drop_locks_do(trans, -+ (__wait_on_freeing_inode(c, inode, inum), 0)); - if (ret) - return ERR_PTR(ret); - } -@@ -631,17 +649,24 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - const struct qstr *name) - { - struct bch_fs *c = trans->c; -- struct btree_iter dirent_iter = {}; - subvol_inum inum = {}; - struct printbuf buf = PRINTBUF; - -+ struct qstr lookup_name; -+ int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ struct btree_iter dirent_iter = {}; - struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, -- dir_hash_info, dir, name, 0); -- int ret = bkey_err(k); -+ dir_hash_info, dir, &lookup_name, 0); -+ ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - -- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ -+ ret = bch2_dirent_read_target(trans, dir, d, &inum); - if (ret > 0) - ret = -ENOENT; - if (ret) -@@ -651,30 +676,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - if (inode) - goto out; - -+ /* -+ * Note: if check/repair needs it, we commit before -+ * bch2_inode_hash_init_insert(), as after that point we can't take a -+ * restart - not in the top level loop with a commit_do(), like we -+ * usually do: -+ */ -+ - struct bch_subvolume subvol; - struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: -+ bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: -+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - -+ /* -+ * don't remove it: check_inodes might find another inode that points -+ * back to this dirent -+ */ - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), -- c, "dirent to missing inode:\n %s", -- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ c, "dirent to missing inode:\n%s", -+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); - if (ret) - goto err; -- -- /* regular files may have hardlinks: */ -- if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && -- !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), -- c, -- "dirent points to inode that does not point back:\n %s", -- (bch2_bkey_val_to_text(&buf, c, k), -- prt_printf(&buf, "\n "), -- bch2_inode_unpacked_to_text(&buf, &inode_u), -- buf.buf))) { -- ret = -ENOENT; -- goto err; -- } - out: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); -@@ -698,6 +723,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, - if (IS_ERR(inode)) - inode = NULL; - -+#ifdef CONFIG_UNICODE -+ if (!inode && IS_CASEFOLDED(vdir)) { -+ /* -+ * Do not cache a negative dentry in casefolded directories -+ * as it would need to be invalidated in the following situation: -+ * - Lookup file "blAH" in a casefolded directory -+ * - Creation of file "BLAH" in a casefolded directory -+ * - Lookup file "blAH" in a casefolded directory -+ * which would fail if we had a negative dentry. -+ * -+ * We should come back to this when VFS has a method to handle -+ * this edgecase. -+ */ -+ return NULL; -+ } -+#endif -+ - return d_splice_alias(&inode->v, dentry); - } - -@@ -806,6 +848,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - */ - set_nlink(&inode->v, 0); - } -+ -+ if (IS_CASEFOLDED(vdir)) -+ d_invalidate(dentry); - err: - bch2_trans_put(trans); - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -@@ -876,6 +921,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, - struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); - struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; -+ struct d_casefold_enable casefold_enable_src = {}; -+ struct d_casefold_enable casefold_enable_dst = {}; - struct btree_trans *trans; - enum bch_rename_mode mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE -@@ -900,6 +947,21 @@ static int bch2_rename2(struct mnt_idmap *idmap, - src_inode, - dst_inode); - -+ if (src_dir != dst_dir) { -+ if (bch2_inode_casefold(c, &src_inode->ei_inode)) { -+ ret = d_casefold_enable(dst_dentry, &casefold_enable_dst, true); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ bch2_inode_casefold(c, &dst_inode->ei_inode)) { -+ ret = d_casefold_enable(src_dentry, &casefold_enable_src, true); -+ if (ret) -+ goto err; -+ } -+ } -+ - trans = bch2_trans_get(c); - - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: -@@ -1004,6 +1066,9 @@ static int bch2_rename2(struct mnt_idmap *idmap, - src_inode, - dst_inode); - -+ d_casefold_enable_commit(&casefold_enable_dst, ret); -+ d_casefold_enable_commit(&casefold_enable_src, ret); -+ - return bch2_err_class(ret); - } - -@@ -1056,7 +1121,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid; - struct btree_trans *trans; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl = NULL; - kuid_t kuid; -@@ -1216,10 +1281,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, - return finish_open_simple(file, 0); - } - -+struct bch_fiemap_extent { -+ struct bkey_buf kbuf; -+ unsigned flags; -+}; -+ - static int bch2_fill_extent(struct bch_fs *c, - struct fiemap_extent_info *info, -- struct bkey_s_c k, unsigned flags) -+ struct bch_fiemap_extent *fe) - { -+ struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); -+ unsigned flags = fe->flags; -+ -+ BUG_ON(!k.k->size); -+ - if (bkey_extent_is_direct_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; -@@ -1272,110 +1347,225 @@ static int bch2_fill_extent(struct bch_fs *c, - } - } - --static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -- u64 start, u64 len) -+/* -+ * Scan a range of an inode for data in pagecache. -+ * -+ * Intended to be retryable, so don't modify the output params until success is -+ * imminent. -+ */ -+static int -+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, -+ bool nonblock) - { -- struct bch_fs *c = vinode->i_sb->s_fs_info; -- struct bch_inode_info *ei = to_bch_ei(vinode); -- struct btree_trans *trans; -- struct btree_iter iter; -- struct bkey_s_c k; -- struct bkey_buf cur, prev; -- bool have_extent = false; -- int ret = 0; -+ loff_t dstart, dend; - -- ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); -- if (ret) -+ dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); -+ if (dstart < 0) -+ return dstart; -+ -+ if (dstart == *end) { -+ *start = dstart; -+ return 0; -+ } -+ -+ dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); -+ if (dend < 0) -+ return dend; -+ -+ /* race */ -+ BUG_ON(dstart == dend); -+ -+ *start = dstart; -+ *end = dend; -+ return 0; -+} -+ -+/* -+ * Scan a range of pagecache that corresponds to a file mapping hole in the -+ * extent btree. If data is found, fake up an extent key so it looks like a -+ * delalloc extent to the rest of the fiemap processing code. -+ */ -+static int -+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, -+ u64 start, u64 end, struct bch_fiemap_extent *cur) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_extent *delextent; -+ struct bch_extent_ptr ptr = {}; -+ loff_t dstart = start << 9, dend = end << 9; -+ int ret; -+ -+ /* -+ * We hold btree locks here so we cannot block on folio locks without -+ * dropping trans locks first. Run a nonblocking scan for the common -+ * case of no folios over holes and fall back on failure. -+ * -+ * Note that dropping locks like this is technically racy against -+ * writeback inserting to the extent tree, but a non-sync fiemap scan is -+ * fundamentally racy with writeback anyways. Therefore, just report the -+ * range as delalloc regardless of whether we have to cycle trans locks. -+ */ -+ ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); -+ if (ret == -EAGAIN) -+ ret = drop_locks_do(trans, -+ bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); -+ if (ret < 0) - return ret; - -- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); -- if (start + len < start) -- return -EINVAL; -+ /* -+ * Create a fake extent key in the buffer. We have to add a dummy extent -+ * pointer for the fill code to add an extent entry. It's explicitly -+ * zeroed to reflect delayed allocation (i.e. phys offset 0). -+ */ -+ bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); -+ delextent = bkey_extent_init(cur->kbuf.k); -+ delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); -+ delextent->k.size = (dend - dstart) >> 9; -+ bch2_bkey_append_ptr(&delextent->k_i, ptr); - -- start >>= 9; -+ cur->flags = FIEMAP_EXTENT_DELALLOC; - -- bch2_bkey_buf_init(&cur); -- bch2_bkey_buf_init(&prev); -- trans = bch2_trans_get(c); -+ return 0; -+} -+ -+static int bch2_next_fiemap_extent(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ u64 start, u64 end, -+ struct bch_fiemap_extent *cur) -+{ -+ u32 snapshot; -+ int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); -+ if (ret) -+ return ret; - -+ struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, -- POS(ei->v.i_ino, start), 0); -+ SPOS(inode->ei_inum.inum, start, snapshot), 0); - -- while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -- enum btree_id data_btree = BTREE_ID_extents; -+ struct bkey_s_c k = -+ bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; - -- bch2_trans_begin(trans); -+ u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; - -- u32 snapshot; -- ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); -- if (ret) -- continue; -+ ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); -+ if (ret) -+ goto err; - -- bch2_btree_iter_set_snapshot(&iter, snapshot); -+ struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); - -- k = bch2_btree_iter_peek_max(&iter, end); -- ret = bkey_err(k); -+ /* -+ * Does the pagecache or the btree take precedence? -+ * -+ * It _should_ be the pagecache, so that we correctly report delalloc -+ * extents when dirty in the pagecache (we're COW, after all). -+ * -+ * But we'd have to add per-sector writeback tracking to -+ * bch_folio_state, otherwise we report delalloc extents for clean -+ * cached data in the pagecache. -+ * -+ * We should do this, but even then fiemap won't report stable mappings: -+ * on bcachefs data moves around in the background (copygc, rebalance) -+ * and we don't provide a way for userspace to lock that out. -+ */ -+ if (k.k && -+ bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), -+ pagecache_start)) { -+ bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); -+ bch2_cut_front(iter.pos, cur->kbuf.k); -+ bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); -+ cur->flags = 0; -+ } else if (k.k) { -+ bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); -+ } -+ -+ if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { -+ unsigned sectors = cur->kbuf.k->k.size; -+ s64 offset_into_extent = 0; -+ enum btree_id data_btree = BTREE_ID_extents; -+ ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, -+ &cur->kbuf); - if (ret) -- continue; -+ goto err; - -- if (!k.k) -- break; -+ struct bkey_i *k = cur->kbuf.k; -+ sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); - -- if (!bkey_extent_is_data(k.k) && -- k.k->type != KEY_TYPE_reservation) { -- bch2_btree_iter_advance(&iter); -- continue; -- } -+ bch2_cut_front(POS(k->k.p.inode, -+ bkey_start_offset(&k->k) + offset_into_extent), -+ k); -+ bch2_key_resize(&k->k, sectors); -+ k->k.p = iter.pos; -+ k->k.p.offset += k->k.size; -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -+ u64 start, u64 len) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *ei = to_bch_ei(vinode); -+ struct btree_trans *trans; -+ struct bch_fiemap_extent cur, prev; -+ int ret = 0; - -- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); -- unsigned sectors = k.k->size - offset_into_extent; -+ ret = fiemap_prep(&ei->v, info, start, &len, 0); -+ if (ret) -+ return ret; - -- bch2_bkey_buf_reassemble(&cur, c, k); -+ if (start + len < start) -+ return -EINVAL; - -- ret = bch2_read_indirect_extent(trans, &data_btree, -- &offset_into_extent, &cur); -+ start >>= 9; -+ u64 end = (start + len) >> 9; -+ -+ bch2_bkey_buf_init(&cur.kbuf); -+ bch2_bkey_buf_init(&prev.kbuf); -+ bkey_init(&prev.kbuf.k->k); -+ -+ trans = bch2_trans_get(c); -+ -+ while (start < end) { -+ ret = lockrestart_do(trans, -+ bch2_next_fiemap_extent(trans, ei, start, end, &cur)); - if (ret) -- continue; -+ goto err; - -- k = bkey_i_to_s_c(cur.k); -- bch2_bkey_buf_realloc(&prev, c, k.k->u64s); -+ BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); -+ BUG_ON(cur.kbuf.k->k.p.offset > end); - -- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); -+ if (bkey_start_offset(&cur.kbuf.k->k) == end) -+ break; - -- bch2_cut_front(POS(k.k->p.inode, -- bkey_start_offset(k.k) + -- offset_into_extent), -- cur.k); -- bch2_key_resize(&cur.k->k, sectors); -- cur.k->k.p = iter.pos; -- cur.k->k.p.offset += cur.k->k.size; -+ start = cur.kbuf.k->k.p.offset; - -- if (have_extent) { -+ if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); -- ret = bch2_fill_extent(c, info, -- bkey_i_to_s_c(prev.k), 0); -+ ret = bch2_fill_extent(c, info, &prev); - if (ret) -- break; -+ goto err; - } - -- bkey_copy(prev.k, cur.k); -- have_extent = true; -- -- bch2_btree_iter_set_pos(&iter, -- POS(iter.pos.inode, iter.pos.offset + sectors)); -+ bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); -+ prev.flags = cur.flags; - } -- bch2_trans_iter_exit(trans, &iter); - -- if (!ret && have_extent) { -+ if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); -- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), -- FIEMAP_EXTENT_LAST); -+ prev.flags |= FIEMAP_EXTENT_LAST; -+ ret = bch2_fill_extent(c, info, &prev); - } -- -+err: - bch2_trans_put(trans); -- bch2_bkey_buf_exit(&cur, c); -- bch2_bkey_buf_exit(&prev, c); -- return ret < 0 ? ret : 0; -+ bch2_bkey_buf_exit(&cur.kbuf, c); -+ bch2_bkey_buf_exit(&prev.kbuf, c); -+ -+ return bch2_err_class(ret < 0 ? ret : 0); - } - - static const struct vm_operations_struct bch_vm_ops = { -@@ -1430,6 +1620,153 @@ static int bch2_open(struct inode *vinode, struct file *file) - return generic_file_open(vinode, file); - } - -+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -+static const __maybe_unused unsigned bch_flags_to_uflags[] = { -+ [__BCH_INODE_sync] = FS_SYNC_FL, -+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, -+ [__BCH_INODE_append] = FS_APPEND_FL, -+ [__BCH_INODE_nodump] = FS_NODUMP_FL, -+ [__BCH_INODE_noatime] = FS_NOATIME_FL, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -+static const __maybe_unused unsigned bch_flags_to_xflags[] = { -+ [__BCH_INODE_sync] = FS_XFLAG_SYNC, -+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, -+ [__BCH_INODE_append] = FS_XFLAG_APPEND, -+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, -+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, -+}; -+ -+static int bch2_fileattr_get(struct dentry *dentry, -+ struct fileattr *fa) -+{ -+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); -+ -+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) -+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; -+ -+ if (bch2_inode_casefold(c, &inode->ei_inode)) -+ fa->flags |= FS_CASEFOLD_FL; -+ -+ fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; -+ return 0; -+} -+ -+struct flags_set { -+ unsigned mask; -+ unsigned flags; -+ unsigned projid; -+ bool set_project; -+ bool set_casefold; -+ bool casefold; -+}; -+ -+static int fssetxattr_inode_update_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = trans->c; -+ struct flags_set *s = p; -+ -+ /* -+ * We're relying on btree locking here for exclusion with other ioctl -+ * calls - use the flags in the btree (@bi), not inode->i_flags: -+ */ -+ if (!S_ISREG(bi->bi_mode) && -+ !S_ISDIR(bi->bi_mode) && -+ (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) -+ return -EINVAL; -+ -+ if (s->casefold != bch2_inode_casefold(c, bi)) { -+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold); -+ if (ret) -+ return ret; -+ } -+ -+ if (s->set_project) { -+ bi->bi_project = s->projid; -+ bi->bi_fields_set |= BIT(Inode_opt_project); -+ } -+ -+ bi->bi_flags &= ~s->mask; -+ bi->bi_flags |= s->flags; -+ -+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -+ return 0; -+} -+ -+static int bch2_fileattr_set(struct mnt_idmap *idmap, -+ struct dentry *dentry, -+ struct fileattr *fa) -+{ -+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct flags_set s = {}; -+ struct d_casefold_enable casefold_enable = {}; -+ int ret; -+ -+ guard(mutex)(&inode->ei_update_lock); -+ -+ if (fa->fsx_valid) { -+ fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; -+ -+ s.mask = map_defined(bch_flags_to_xflags); -+ s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); -+ if (fa->fsx_xflags) -+ return -EOPNOTSUPP; -+ -+ if (fa->fsx_projid >= U32_MAX) -+ return -EINVAL; -+ -+ /* -+ * inode fields accessible via the xattr interface are stored with a +1 -+ * bias, so that 0 means unset: -+ */ -+ if ((inode->ei_inode.bi_project || -+ fa->fsx_projid) && -+ inode->ei_inode.bi_project != fa->fsx_projid + 1) { -+ s.projid = fa->fsx_projid + 1; -+ s.set_project = true; -+ } -+ } -+ -+ if (fa->flags_valid) { -+ s.mask = map_defined(bch_flags_to_uflags); -+ -+ s.set_casefold = true; -+ s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; -+ fa->flags &= ~FS_CASEFOLD_FL; -+ -+ if (s.casefold && s.casefold != bch2_inode_casefold(c, &inode->ei_inode)) { -+ ret = d_casefold_enable(dentry, &casefold_enable, false); -+ if (ret) -+ goto err; -+ } -+ -+ s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); -+ if (fa->flags) { -+ ret = -EOPNOTSUPP; -+ goto err; -+ } -+ } -+ -+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: -+ (s.set_project -+ ? bch2_set_projid(c, inode, fa->fsx_projid) -+ : 0) ?: -+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -+ ATTR_CTIME); -+err: -+ d_casefold_enable_commit(&casefold_enable, ret); -+ -+ return ret; -+} -+ - static const struct file_operations bch_file_operations = { - .open = bch2_open, - .llseek = bch2_llseek, -@@ -1457,6 +1794,8 @@ static const struct inode_operations bch_file_inode_operations = { - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, - #endif -+ .fileattr_get = bch2_fileattr_get, -+ .fileattr_set = bch2_fileattr_set, - }; - - static const struct inode_operations bch_dir_inode_operations = { -@@ -1477,6 +1816,8 @@ static const struct inode_operations bch_dir_inode_operations = { - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, - #endif -+ .fileattr_get = bch2_fileattr_get, -+ .fileattr_set = bch2_fileattr_set, - }; - - static const struct file_operations bch_dir_file_operations = { -@@ -1499,6 +1840,8 @@ static const struct inode_operations bch_symlink_inode_operations = { - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, - #endif -+ .fileattr_get = bch2_fileattr_get, -+ .fileattr_set = bch2_fileattr_set, - }; - - static const struct inode_operations bch_special_inode_operations = { -@@ -1509,6 +1852,8 @@ static const struct inode_operations bch_special_inode_operations = { - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, - #endif -+ .fileattr_get = bch2_fileattr_get, -+ .fileattr_set = bch2_fileattr_set, - }; - - static const struct address_space_operations bch_address_space_operations = { -@@ -1678,17 +2023,17 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child - if (ret) - goto err; - -- bch2_btree_iter_set_snapshot(&iter1, snapshot); -- bch2_btree_iter_set_snapshot(&iter2, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); - - ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); - if (ret) - goto err; - - if (inode_u.bi_dir == dir->ei_inode.bi_inum) { -- bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); -+ bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - -- k = bch2_btree_iter_peek_slot(&iter1); -+ k = bch2_btree_iter_peek_slot(trans, &iter1); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1712,7 +2057,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child - * File with multiple hardlinks and our backref is to the wrong - * directory - linear search: - */ -- for_each_btree_key_continue_norestart(iter2, 0, k, ret) { -+ for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { - if (k.k->p.inode > dir->ei_inode.bi_inum) - break; - -@@ -1802,7 +2147,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, - break; - } - -- mapping_set_large_folios(inode->v.i_mapping); -+ mapping_set_folio_min_order(inode->v.i_mapping, -+ get_order(trans->c->opts.block_size)); - } - - static void bch2_free_inode(struct inode *vinode) -@@ -2008,55 +2354,19 @@ static struct bch_fs *bch2_path_to_fs(const char *path) - return c ?: ERR_PTR(-ENOENT); - } - --static int bch2_remount(struct super_block *sb, int *flags, -- struct bch_opts opts) --{ -- struct bch_fs *c = sb->s_fs_info; -- int ret = 0; -- -- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); -- -- if (opts.read_only != c->opts.read_only) { -- down_write(&c->state_lock); -- -- if (opts.read_only) { -- bch2_fs_read_only(c); -- -- sb->s_flags |= SB_RDONLY; -- } else { -- ret = bch2_fs_read_write(c); -- if (ret) { -- bch_err(c, "error going rw: %i", ret); -- up_write(&c->state_lock); -- ret = -EINVAL; -- goto err; -- } -- -- sb->s_flags &= ~SB_RDONLY; -- } -- -- c->opts.read_only = opts.read_only; -- -- up_write(&c->state_lock); -- } -- -- if (opt_defined(opts, errors)) -- c->opts.errors = opts.errors; --err: -- return bch2_err_class(ret); --} -- - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) - { - struct bch_fs *c = root->d_sb->s_fs_info; - bool first = true; - -- for_each_online_member(c, ca) { -+ rcu_read_lock(); -+ for_each_online_member_rcu(c, ca) { - if (!first) - seq_putc(seq, ':'); - first = false; - seq_puts(seq, ca->disk_sb.sb_name); - } -+ rcu_read_unlock(); - - return 0; - } -@@ -2163,7 +2473,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) - struct inode *vinode; - struct bch2_opts_parse *opts_parse = fc->fs_private; - struct bch_opts opts = opts_parse->opts; -- darray_str devs; -+ darray_const_str devs; - darray_fs devs_to_fs = {}; - int ret; - -@@ -2187,14 +2497,17 @@ static int bch2_fs_get_tree(struct fs_context *fc) - if (!IS_ERR(sb)) - goto got_sb; - -- c = bch2_fs_open(devs.data, devs.nr, opts); -+ c = bch2_fs_open(&devs, &opts); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - -+ if (opt_defined(opts, discard)) -+ set_bit(BCH_FS_discard_mount_opt_set, &c->flags); -+ - /* Some options can't be parsed until after the fs is started: */ - opts = bch2_opts_empty(); -- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); -+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); - if (ret) - goto err_stop_fs; - -@@ -2234,7 +2547,12 @@ static int bch2_fs_get_tree(struct fs_context *fc) - sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; - sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); -- super_set_sysfs_name_uuid(sb); -+ -+ if (c->sb.multi_device) -+ super_set_sysfs_name_uuid(sb); -+ else -+ strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); -+ - sb->s_shrink->seeks = 0; - c->vfs_sb = sb; - strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -@@ -2245,15 +2563,16 @@ static int bch2_fs_get_tree(struct fs_context *fc) - - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - -- for_each_online_member(c, ca) { -+ rcu_read_lock(); -+ for_each_online_member_rcu(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; - - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; -- percpu_ref_put(&ca->io_ref); - break; - } -+ rcu_read_unlock(); - - c->dev = sb->s_dev; - -@@ -2264,6 +2583,11 @@ static int bch2_fs_get_tree(struct fs_context *fc) - - sb->s_shrink->seeks = 0; - -+#ifdef CONFIG_UNICODE -+ sb->s_encoding = c->cf_encoding; -+#endif -+ generic_set_sb_d_ops(sb); -+ - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); - ret = PTR_ERR_OR_ZERO(vinode); - bch_err_msg(c, ret, "mounting: error getting root inode"); -@@ -2300,7 +2624,8 @@ static int bch2_fs_get_tree(struct fs_context *fc) - goto err; - - err_put_super: -- __bch2_fs_stop(c); -+ if (!sb->s_root) -+ __bch2_fs_stop(c); - deactivate_locked_super(sb); - goto err; - } -@@ -2343,6 +2668,8 @@ static int bch2_fs_parse_param(struct fs_context *fc, - int ret = bch2_parse_one_mount_opt(c, &opts->opts, - &opts->parse_later, param->key, - param->string); -+ if (ret) -+ pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); - - return bch2_err_class(ret); - } -@@ -2351,8 +2678,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) - { - struct super_block *sb = fc->root->d_sb; - struct bch2_opts_parse *opts = fc->fs_private; -+ struct bch_fs *c = sb->s_fs_info; -+ int ret = 0; - -- return bch2_remount(sb, &fc->sb_flags, opts->opts); -+ opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); -+ -+ if (opts->opts.read_only != c->opts.read_only) { -+ down_write(&c->state_lock); -+ -+ if (opts->opts.read_only) { -+ bch2_fs_read_only(c); -+ -+ sb->s_flags |= SB_RDONLY; -+ } else { -+ ret = bch2_fs_read_write(c); -+ if (ret) { -+ bch_err(c, "error going rw: %i", ret); -+ up_write(&c->state_lock); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ sb->s_flags &= ~SB_RDONLY; -+ } -+ -+ c->opts.read_only = opts->opts.read_only; -+ -+ up_write(&c->state_lock); -+ } -+ -+ if (opt_defined(opts->opts, errors)) -+ c->opts.errors = opts->opts.errors; -+err: -+ return bch2_err_class(ret); - } - - static const struct fs_context_operations bch2_context_ops = { -@@ -2396,7 +2754,7 @@ static struct file_system_type bcache_fs_type = { - .name = "bcachefs", - .init_fs_context = bch2_init_fs_context, - .kill_sb = bch2_kill_sb, -- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, -+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, - }; - - MODULE_ALIAS_FS("bcachefs"); -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -index 0e85131d0af8..fd3178189565 100644 ---- a/fs/bcachefs/fsck.c -+++ b/fs/bcachefs/fsck.c -@@ -6,14 +6,13 @@ - #include "btree_cache.h" - #include "btree_update.h" - #include "buckets.h" --#include "darray.h" - #include "dirent.h" - #include "error.h" - #include "fs.h" --#include "fs-common.h" - #include "fsck.h" - #include "inode.h" - #include "keylist.h" -+#include "namei.h" - #include "recovery_passes.h" - #include "snapshot.h" - #include "super.h" -@@ -21,15 +20,9 @@ - #include "xattr.h" - - #include -+#include - #include /* struct qstr */ - --static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, -- struct bkey_s_c_dirent d) --{ -- return inode->bi_dir == d.k->p.inode && -- inode->bi_dir_offset == d.k->p.offset; --} -- - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) - { -@@ -116,50 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, - return ret; - } - --static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, -- struct bch_inode_unpacked *inode) --{ -- struct btree_iter iter; -- struct bkey_s_c k; -- int ret; -- -- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), -- BTREE_ITER_all_snapshots, k, ret) { -- if (k.k->p.offset != inode_nr) -- break; -- if (!bkey_is_inode(k.k)) -- continue; -- ret = bch2_inode_unpack(k, inode); -- goto found; -- } -- ret = -BCH_ERR_ENOENT_inode; --found: -- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); -- bch2_trans_iter_exit(trans, &iter); -- return ret; --} -- --static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, -- struct bch_inode_unpacked *inode) --{ -- struct btree_iter iter; -- struct bkey_s_c k; -- int ret; -- -- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -- SPOS(0, inode_nr, snapshot), 0); -- ret = bkey_err(k); -- if (ret) -- goto err; -- -- ret = bkey_is_inode(k.k) -- ? bch2_inode_unpack(k, inode) -- : -BCH_ERR_ENOENT_inode; --err: -- bch2_trans_iter_exit(trans, &iter); -- return ret; --} -- - static int lookup_dirent_in_snapshot(struct btree_trans *trans, - struct bch_hash_info hash_info, - subvol_inum dir, struct qstr *name, -@@ -179,32 +128,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, - return 0; - } - --static int __remove_dirent(struct btree_trans *trans, struct bpos pos) --{ -- struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bch_inode_unpacked dir_inode; -- struct bch_hash_info dir_hash_info; -- int ret; -- -- ret = lookup_first_inode(trans, pos.inode, &dir_inode); -- if (ret) -- goto err; -- -- dir_hash_info = bch2_hash_info_init(c, &dir_inode); -- -- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); -- -- ret = bch2_btree_iter_traverse(&iter) ?: -- bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -- &dir_hash_info, &iter, -- BTREE_UPDATE_internal_snapshot_node); -- bch2_trans_iter_exit(trans, &iter); --err: -- bch_err_fn(c, ret); -- return ret; --} -- - /* - * Find any subvolume associated with a tree of snapshots - * We can't rely on master_subvol - it might have been deleted. -@@ -242,7 +165,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - { - struct bch_fs *c = trans->c; - struct qstr lostfound_str = QSTR("lost+found"); -- struct btree_iter lostfound_iter = { NULL }; -+ struct btree_iter lostfound_iter = {}; - u64 inum = 0; - unsigned d_type = 0; - int ret; -@@ -287,7 +210,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - - struct bch_inode_unpacked root_inode; - struct bch_hash_info root_hash_info; -- ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); -+ ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); - bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", - root_inum.inum, subvolid); - if (ret) -@@ -313,7 +236,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - * The bch2_check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ -- ret = lookup_inode(trans, inum, snapshot, lostfound); -+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); - bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", - inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); - return ret; -@@ -341,7 +264,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, lostfound); -- bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); -+ bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); - lostfound->bi_dir = root_inode.bi_inum; - lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); - -@@ -351,8 +274,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - if (ret) - goto err; - -- bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); -- ret = bch2_btree_iter_traverse(&lostfound_iter); -+ bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); -+ ret = bch2_btree_iter_traverse(trans, &lostfound_iter); - if (ret) - goto err; - -@@ -362,6 +285,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - &lostfound_str, - lostfound->bi_inum, - &lostfound->bi_dir_offset, -+ BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_internal_snapshot_node); -@@ -377,6 +301,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) - return false; - -+ /* -+ * Subvolume roots are special: older versions of subvolume roots may be -+ * disconnected, it's only the newest version that matters. -+ * -+ * We only keep a single dirent pointing to a subvolume root, i.e. -+ * older versions of snapshots will not have a different dirent pointing -+ * to the same subvolume root. -+ * -+ * This is because dirents that point to subvolumes are only visible in -+ * the parent subvolume - versioning is not needed - and keeping them -+ * around would break fsck, because when we're crossing subvolumes we -+ * don't have a consistent snapshot ID to do check the inode <-> dirent -+ * relationships. -+ * -+ * Thus, a subvolume root that's been renamed after a snapshot will have -+ * a disconnected older version - that's expected. -+ * -+ * Note that taking a snapshot always updates the root inode (to update -+ * the dirent backpointer), so a subvolume root inode with -+ * BCH_INODE_has_child_snapshot is never visible. -+ */ -+ if (inode->bi_subvol && -+ (inode->bi_flags & BCH_INODE_has_child_snapshot)) -+ return false; -+ - return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); - } - -@@ -462,6 +411,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * - &name, - inode->bi_subvol ?: inode->bi_inum, - &inode->bi_dir_offset, -+ BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret) { - bch_err_msg(c, ret, "error creating dirent"); -@@ -548,7 +498,7 @@ static int remove_backpointer(struct btree_trans *trans, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: -- __remove_dirent(trans, d.k->p); -+ bch2_fsck_remove_dirent(trans, d.k->p); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -595,12 +545,12 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, &new_inode); -- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); -+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); - - new_inode.bi_subvol = subvolid; - - int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: -- bch2_btree_iter_traverse(&inode_iter) ?: -+ bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, &new_inode); - bch2_trans_iter_exit(trans, &inode_iter); - if (ret) -@@ -665,7 +615,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); -- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); -+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) -@@ -685,7 +635,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 - - struct bch_inode_unpacked new_inode; - bch2_inode_init_early(c, &new_inode); -- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); -+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); - new_inode.bi_size = i_size; - new_inode.bi_inum = inum; - new_inode.bi_snapshot = snapshot; -@@ -816,12 +766,12 @@ static int ref_visible2(struct bch_fs *c, - - #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ -- (_i)->snapshot <= (_snapshot); _i++) \ -- if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) -+ (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ -+ if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) - - struct inode_walker_entry { - struct bch_inode_unpacked inode; -- u32 snapshot; -+ bool whiteout; - u64 count; - u64 i_size; - }; -@@ -850,13 +800,20 @@ static struct inode_walker inode_walker_init(void) - static int add_inode(struct bch_fs *c, struct inode_walker *w, - struct bkey_s_c inode) - { -- struct bch_inode_unpacked u; -- -- return bch2_inode_unpack(inode, &u) ?: -- darray_push(&w->inodes, ((struct inode_walker_entry) { -- .inode = u, -- .snapshot = inode.k->p.snapshot, -+ int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { -+ .whiteout = !bkey_is_inode(inode.k), - })); -+ if (ret) -+ return ret; -+ -+ struct inode_walker_entry *n = &darray_last(w->inodes); -+ if (!n->whiteout) { -+ return bch2_inode_unpack(inode, &n->inode); -+ } else { -+ n->inode.bi_inum = inode.k->p.inode; -+ n->inode.bi_snapshot = inode.k->p.snapshot; -+ return 0; -+ } - } - - static int get_inodes_all_snapshots(struct btree_trans *trans, -@@ -876,13 +833,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, - w->recalculate_sums = false; - w->inodes.nr = 0; - -- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), -- BTREE_ITER_all_snapshots, k, ret) { -- if (k.k->p.offset != inum) -+ for_each_btree_key_max_norestart(trans, iter, -+ BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), -+ BTREE_ITER_all_snapshots, k, ret) { -+ ret = add_inode(c, w, k); -+ if (ret) - break; -- -- if (bkey_is_inode(k.k)) -- add_inode(c, w, k); - } - bch2_trans_iter_exit(trans, &iter); - -@@ -894,48 +850,112 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, - return 0; - } - -+static int get_visible_inodes(struct btree_trans *trans, -+ struct inode_walker *w, -+ struct snapshots_seen *s, -+ u64 inum) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ w->inodes.nr = 0; -+ w->deletes.nr = 0; -+ -+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), -+ BTREE_ITER_all_snapshots, k, ret) { -+ if (k.k->p.offset != inum) -+ break; -+ -+ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) -+ continue; -+ -+ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) -+ continue; -+ -+ ret = bkey_is_inode(k.k) -+ ? add_inode(c, w, k) -+ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ - static struct inode_walker_entry * --lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) -+lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) - { -- bool is_whiteout = k.k->type == KEY_TYPE_whiteout; -+ struct bch_fs *c = trans->c; - - struct inode_walker_entry *i; - __darray_for_each(w->inodes, i) -- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)) - goto found; - - return NULL; - found: -- BUG_ON(k.k->p.snapshot > i->snapshot); -+ BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); - -- if (k.k->p.snapshot != i->snapshot && !is_whiteout) { -- struct inode_walker_entry new = *i; -- -- new.snapshot = k.k->p.snapshot; -- new.count = 0; -- new.i_size = 0; -- -- struct printbuf buf = PRINTBUF; -- bch2_bkey_val_to_text(&buf, c, k); -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; - -- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" -+ if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, -+ trans, snapshot_key_missing_inode_snapshot, -+ "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" - "unexpected because we should always update the inode when we update a key in that inode\n" - "%s", -- w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); -- printbuf_exit(&buf); -+ w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, -+ (bch2_bkey_val_to_text(&buf, c, k), -+ buf.buf))) { -+ struct bch_inode_unpacked new = i->inode; -+ struct bkey_i whiteout; -+ -+ new.bi_snapshot = k.k->p.snapshot; -+ -+ if (!i->whiteout) { -+ ret = __bch2_fsck_write_inode(trans, &new); -+ } else { -+ bkey_init(&whiteout.k); -+ whiteout.k.type = KEY_TYPE_whiteout; -+ whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); -+ ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, -+ &whiteout, -+ BTREE_UPDATE_internal_snapshot_node); -+ } -+ -+ if (ret) -+ goto fsck_err; - -- while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) -+ ret = bch2_trans_commit(trans, NULL, NULL, 0); -+ if (ret) -+ goto fsck_err; -+ -+ struct inode_walker_entry new_entry = *i; -+ -+ new_entry.inode.bi_snapshot = k.k->p.snapshot; -+ new_entry.count = 0; -+ new_entry.i_size = 0; -+ -+ while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) - --i; - - size_t pos = i - w->inodes.data; -- int ret = darray_insert_item(&w->inodes, pos, new); -+ ret = darray_insert_item(&w->inodes, pos, new_entry); - if (ret) -- return ERR_PTR(ret); -+ goto fsck_err; - -- i = w->inodes.data + pos; -+ ret = -BCH_ERR_transaction_restart_nested; -+ goto fsck_err; - } - -+ printbuf_exit(&buf); - return i; -+fsck_err: -+ printbuf_exit(&buf); -+ return ERR_PTR(ret); - } - - static struct inode_walker_entry *walk_inode(struct btree_trans *trans, -@@ -950,42 +970,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - - w->last_pos = k.k->p; - -- return lookup_inode_for_snapshot(trans->c, w, k); --} -- --static int get_visible_inodes(struct btree_trans *trans, -- struct inode_walker *w, -- struct snapshots_seen *s, -- u64 inum) --{ -- struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bkey_s_c k; -- int ret; -- -- w->inodes.nr = 0; -- w->deletes.nr = 0; -- -- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), -- BTREE_ITER_all_snapshots, k, ret) { -- if (k.k->p.offset != inum) -- break; -- -- if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) -- continue; -- -- if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) -- continue; -- -- ret = bkey_is_inode(k.k) -- ? add_inode(c, w, k) -- : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); -- if (ret) -- break; -- } -- bch2_trans_iter_exit(trans, &iter); -- -- return ret; -+ return lookup_inode_for_snapshot(trans, w, k); - } - - /* -@@ -1063,6 +1048,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans, - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - -+ if ((ret || dirent_points_to_inode_nowarn(d, inode)) && -+ inode->bi_subvol && -+ (inode->bi_flags & BCH_INODE_has_child_snapshot)) { -+ /* Older version of a renamed subvolume root: we won't have a -+ * correct dirent for it. That's expected, see -+ * inode_should_reattach(). -+ * -+ * We don't clear the backpointer field when doing the rename -+ * because there might be arbitrarily many versions in older -+ * snapshots. -+ */ -+ inode->bi_dir = 0; -+ inode->bi_dir_offset = 0; -+ *write_inode = true; -+ goto out; -+ } -+ - if (fsck_err_on(ret, - trans, inode_points_to_missing_dirent, - "inode points to missing dirent\n%s", -@@ -1083,7 +1085,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, - inode->bi_dir_offset = 0; - *write_inode = true; - } -- -+out: - ret = 0; - fsck_err: - bch2_trans_iter_exit(trans, &dirent_iter); -@@ -1092,32 +1094,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, - return ret; - } - --static int get_snapshot_root_inode(struct btree_trans *trans, -- struct bch_inode_unpacked *root, -- u64 inum) --{ -- struct btree_iter iter; -- struct bkey_s_c k; -- int ret = 0; -- -- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, -- SPOS(0, inum, U32_MAX), -- BTREE_ITER_all_snapshots, k, ret) { -- if (k.k->p.offset != inum) -- break; -- if (bkey_is_inode(k.k)) -- goto found_root; -- } -- if (ret) -- goto err; -- BUG(); --found_root: -- ret = bch2_inode_unpack(k, root); --err: -- bch2_trans_iter_exit(trans, &iter); -- return ret; --} -- - static int check_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, -@@ -1148,20 +1124,23 @@ static int check_inode(struct btree_trans *trans, - goto err; - - if (snapshot_root->bi_inum != u.bi_inum) { -- ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); -+ ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); - if (ret) - goto err; - } - -- if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || -- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), -- trans, inode_snapshot_mismatch, -- "inode hash info in different snapshots don't match")) { -- u.bi_hash_seed = snapshot_root->bi_hash_seed; -- SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); -- do_update = true; -+ if (u.bi_hash_seed != snapshot_root->bi_hash_seed || -+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { -+ ret = bch2_repair_inode_hash_info(trans, snapshot_root); -+ BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); -+ if (ret) -+ goto err; - } - -+ ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); -+ if (ret) -+ goto err; -+ - if (u.bi_dir || u.bi_dir_offset) { - ret = check_inode_dirent_inode(trans, &u, &do_update); - if (ret) -@@ -1464,7 +1443,9 @@ static int check_key_has_inode(struct btree_trans *trans, - if (k.k->type == KEY_TYPE_whiteout) - goto out; - -- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { -+ bool have_inode = i && !i->whiteout; -+ -+ if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) -@@ -1475,16 +1456,16 @@ static int check_key_has_inode(struct btree_trans *trans, - goto err; - } - -- if (fsck_err_on(!i, -+ if (fsck_err_on(!have_inode, - trans, key_in_missing_inode, -- "key in missing inode:\n %s", -+ "key in missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; - -- if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), -+ if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, -- "key for wrong inode mode %o:\n %s", -+ "key for wrong inode mode %o:\n%s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -@@ -1510,21 +1491,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal - if (i->inode.bi_sectors == i->count) - continue; - -- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); -+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); - - if (w->recalculate_sums) - i->count = count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", -- w->last_pos.inode, i->snapshot, i->count, count2); -+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - } - - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - trans, inode_i_sectors_wrong, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", -- w->last_pos.inode, i->snapshot, -+ w->last_pos.inode, i->inode.bi_snapshot, - i->inode.bi_sectors, i->count)) { - i->inode.bi_sectors = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); -@@ -1613,7 +1594,7 @@ static int overlapping_extents_found(struct btree_trans *trans, - { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; -- struct btree_iter iter1, iter2 = { NULL }; -+ struct btree_iter iter1, iter2 = {}; - struct bkey_s_c k1, k2; - int ret; - -@@ -1622,18 +1603,18 @@ static int overlapping_extents_found(struct btree_trans *trans, - bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents); -- k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); -+ k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k1); - if (ret) - goto err; - -- prt_str(&buf, "\n "); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k1); - - if (!bpos_eq(pos1, k1.k->p)) { -- prt_str(&buf, "\n wanted\n "); -+ prt_str(&buf, "\nwanted\n "); - bch2_bpos_to_text(&buf, pos1); -- prt_str(&buf, "\n "); -+ prt_str(&buf, "\n"); - bch2_bkey_to_text(&buf, &pos2); - - bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", -@@ -1642,12 +1623,12 @@ static int overlapping_extents_found(struct btree_trans *trans, - goto err; - } - -- bch2_trans_copy_iter(&iter2, &iter1); -+ bch2_trans_copy_iter(trans, &iter2, &iter1); - - while (1) { -- bch2_btree_iter_advance(&iter2); -+ bch2_btree_iter_advance(trans, &iter2); - -- k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); -+ k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k2); - if (ret) - goto err; -@@ -1656,7 +1637,7 @@ static int overlapping_extents_found(struct btree_trans *trans, - break; - } - -- prt_str(&buf, "\n "); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k2); - - if (bpos_gt(k2.k->p, pos2.p) || -@@ -1667,7 +1648,7 @@ static int overlapping_extents_found(struct btree_trans *trans, - goto err; - } - -- prt_printf(&buf, "\n overwriting %s extent", -+ prt_printf(&buf, "\noverwriting %s extent", - pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); - - if (fsck_err(trans, extent_overlapping, -@@ -1688,6 +1669,8 @@ static int overlapping_extents_found(struct btree_trans *trans, - bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(c, &res); - -+ bch_info(c, "repair ret %s", bch2_err_str(ret)); -+ - if (ret) - goto err; - -@@ -1833,21 +1816,21 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { -- if (i->snapshot > k.k->p.snapshot || -- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) -+ if (i->inode.bi_snapshot > k.k->p.snapshot || -+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && - !bkey_extent_is_reservation(k), - trans, extent_past_end_of_inode, -- "extent type past end of inode %llu:%u, i_size %llu\n %s", -- i->inode.bi_inum, i->snapshot, i->inode.bi_size, -+ "extent type past end of inode %llu:%u, i_size %llu\n%s", -+ i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter iter2; - -- bch2_trans_copy_iter(&iter2, iter); -- bch2_btree_iter_set_snapshot(&iter2, i->snapshot); -- ret = bch2_btree_iter_traverse(&iter2) ?: -+ bch2_trans_copy_iter(trans, &iter2, iter); -+ bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); -+ ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter2); -@@ -1868,8 +1851,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { -- if (i->snapshot > k.k->p.snapshot || -- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) -+ if (i->whiteout || -+ i->inode.bi_snapshot > k.k->p.snapshot || -+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - i->count += k.k->size; -@@ -1951,13 +1935,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ - if (i->inode.bi_nlink == i->count) - continue; - -- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); -+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); - if (count2 < 0) - return count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", -- w->last_pos.inode, i->snapshot, i->count, count2); -+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - if (i->inode.bi_nlink == i->count) - continue; -@@ -1966,7 +1950,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_nlink: got %u, should be %llu", -- w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { -+ w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) -@@ -1985,169 +1969,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa - trans_was_restarted(trans, restart_count); - } - --noinline_for_stack --static int check_dirent_inode_dirent(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bkey_s_c_dirent d, -- struct bch_inode_unpacked *target) --{ -- struct bch_fs *c = trans->c; -- struct printbuf buf = PRINTBUF; -- struct btree_iter bp_iter = { NULL }; -- int ret = 0; -- -- if (inode_points_to_dirent(target, d)) -- return 0; -- -- if (!target->bi_dir && -- !target->bi_dir_offset) { -- fsck_err_on(S_ISDIR(target->bi_mode), -- trans, inode_dir_missing_backpointer, -- "directory with missing backpointer\n%s", -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, d.s_c), -- prt_printf(&buf, "\n"), -- bch2_inode_unpacked_to_text(&buf, target), -- buf.buf)); -- -- fsck_err_on(target->bi_flags & BCH_INODE_unlinked, -- trans, inode_unlinked_but_has_dirent, -- "inode unlinked but has dirent\n%s", -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, d.s_c), -- prt_printf(&buf, "\n"), -- bch2_inode_unpacked_to_text(&buf, target), -- buf.buf)); -- -- target->bi_flags &= ~BCH_INODE_unlinked; -- target->bi_dir = d.k->p.inode; -- target->bi_dir_offset = d.k->p.offset; -- return __bch2_fsck_write_inode(trans, target); -- } -- -- if (bch2_inode_should_have_single_bp(target) && -- !fsck_err(trans, inode_wrong_backpointer, -- "dirent points to inode that does not point back:\n %s", -- (bch2_bkey_val_to_text(&buf, c, d.s_c), -- prt_printf(&buf, "\n "), -- bch2_inode_unpacked_to_text(&buf, target), -- buf.buf))) -- goto err; -- -- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, -- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); -- ret = bkey_err(bp_dirent); -- if (ret && !bch2_err_matches(ret, ENOENT)) -- goto err; -- -- bool backpointer_exists = !ret; -- ret = 0; -- -- if (fsck_err_on(!backpointer_exists, -- trans, inode_wrong_backpointer, -- "inode %llu:%u has wrong backpointer:\n" -- "got %llu:%llu\n" -- "should be %llu:%llu", -- target->bi_inum, target->bi_snapshot, -- target->bi_dir, -- target->bi_dir_offset, -- d.k->p.inode, -- d.k->p.offset)) { -- target->bi_dir = d.k->p.inode; -- target->bi_dir_offset = d.k->p.offset; -- ret = __bch2_fsck_write_inode(trans, target); -- goto out; -- } -- -- bch2_bkey_val_to_text(&buf, c, d.s_c); -- prt_newline(&buf); -- if (backpointer_exists) -- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); -- -- if (fsck_err_on(backpointer_exists && -- (S_ISDIR(target->bi_mode) || -- target->bi_subvol), -- trans, inode_dir_multiple_links, -- "%s %llu:%u with multiple links\n%s", -- S_ISDIR(target->bi_mode) ? "directory" : "subvolume", -- target->bi_inum, target->bi_snapshot, buf.buf)) { -- ret = __remove_dirent(trans, d.k->p); -- goto out; -- } -- -- /* -- * hardlinked file with nlink 0: -- * We're just adjusting nlink here so check_nlinks() will pick -- * it up, it ignores inodes with nlink 0 -- */ -- if (fsck_err_on(backpointer_exists && !target->bi_nlink, -- trans, inode_multiple_links_but_nlink_0, -- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", -- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { -- target->bi_nlink++; -- target->bi_flags &= ~BCH_INODE_unlinked; -- ret = __bch2_fsck_write_inode(trans, target); -- if (ret) -- goto err; -- } --out: --err: --fsck_err: -- bch2_trans_iter_exit(trans, &bp_iter); -- printbuf_exit(&buf); -- bch_err_fn(c, ret); -- return ret; --} -- --noinline_for_stack --static int check_dirent_target(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bkey_s_c_dirent d, -- struct bch_inode_unpacked *target) --{ -- struct bch_fs *c = trans->c; -- struct bkey_i_dirent *n; -- struct printbuf buf = PRINTBUF; -- int ret = 0; -- -- ret = check_dirent_inode_dirent(trans, iter, d, target); -- if (ret) -- goto err; -- -- if (fsck_err_on(d.v->d_type != inode_d_type(target), -- trans, dirent_d_type_wrong, -- "incorrect d_type: got %s, should be %s:\n%s", -- bch2_d_type_str(d.v->d_type), -- bch2_d_type_str(inode_d_type(target)), -- (printbuf_reset(&buf), -- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { -- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); -- ret = PTR_ERR_OR_ZERO(n); -- if (ret) -- goto err; -- -- bkey_reassemble(&n->k_i, d.s_c); -- n->v.d_type = inode_d_type(target); -- if (n->v.d_type == DT_SUBVOL) { -- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); -- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); -- } else { -- n->v.d_inum = cpu_to_le64(target->bi_inum); -- } -- -- ret = bch2_trans_update(trans, iter, &n->k_i, 0); -- if (ret) -- goto err; -- -- d = dirent_i_to_s_c(n); -- } --err: --fsck_err: -- printbuf_exit(&buf); -- bch_err_fn(c, ret); -- return ret; --} -- - /* find a subvolume that's a descendent of @snapshot: */ - static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) - { -@@ -2241,35 +2062,46 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * - 0, subvolume); - ret = bkey_err(s.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) -- return ret; -+ goto err; - - if (ret) { - if (fsck_err(trans, dirent_to_missing_subvol, - "dirent points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) -- return __remove_dirent(trans, d.k->p); -+ return bch2_fsck_remove_dirent(trans, d.k->p); - ret = 0; - goto out; - } - -- if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, -- trans, subvol_fs_path_parent_wrong, -- "subvol with wrong fs_path_parent, should be be %u\n%s", -- parent_subvol, -- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { -- struct bkey_i_subvolume *n = -- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); -- ret = PTR_ERR_OR_ZERO(n); -+ if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { -+ printbuf_reset(&buf); -+ -+ prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", -+ parent_subvol); -+ -+ ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, -+ le64_to_cpu(s.v->inode) }, &buf); - if (ret) - goto err; -+ prt_newline(&buf); -+ bch2_bkey_val_to_text(&buf, c, s.s_c); - -- n->v.fs_path_parent = cpu_to_le32(parent_subvol); -+ if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { -+ struct bkey_i_subvolume *n = -+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ n->v.fs_path_parent = cpu_to_le32(parent_subvol); -+ } - } - - u64 target_inum = le64_to_cpu(s.v->inode); - u32 target_snapshot = le32_to_cpu(s.v->snapshot); - -- ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); -+ ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, -+ &subvol_root, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - -@@ -2291,7 +2123,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * - goto err; - } - -- ret = check_dirent_target(trans, iter, d, &subvol_root); -+ ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); - if (ret) - goto err; - out: -@@ -2342,7 +2174,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - if (ret) - goto err; - -- if (!i) -+ if (!i || i->whiteout) - goto out; - - if (dir->first_this_inode) -@@ -2363,6 +2195,41 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - -+ /* check casefold */ -+ if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, -+ trans, dirent_casefold_mismatch, -+ "dirent casefold does not match dir casefold\n%s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), -+ buf.buf))) { -+ struct qstr name = bch2_dirent_get_name(d); -+ u32 subvol = d.v->d_type == DT_SUBVOL -+ ? d.v->d_parent_subvol -+ : 0; -+ u64 target = d.v->d_type == DT_SUBVOL -+ ? d.v->d_child_subvol -+ : d.v->d_inum; -+ u64 dir_offset; -+ -+ ret = bch2_hash_delete_at(trans, -+ bch2_dirent_hash_desc, hash_info, iter, -+ BTREE_UPDATE_internal_snapshot_node) ?: -+ bch2_dirent_create_snapshot(trans, subvol, -+ d.k->p.inode, d.k->p.snapshot, -+ hash_info, -+ d.v->d_type, -+ &name, -+ target, -+ &dir_offset, -+ BTREE_ITER_with_updates| -+ BTREE_UPDATE_internal_snapshot_node| -+ STR_HASH_must_create) ?: -+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -+ -+ /* might need another check_dirents pass */ -+ goto out; -+ } -+ - if (d.v->d_type == DT_SUBVOL) { - ret = check_dirent_to_subvol(trans, iter, d); - if (ret) -@@ -2378,13 +2245,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { -- ret = __remove_dirent(trans, d.k->p); -+ ret = bch2_fsck_remove_dirent(trans, d.k->p); - if (ret) - goto err; - } - - darray_for_each(target->inodes, i) { -- ret = check_dirent_target(trans, iter, d, &i->inode); -+ ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); - if (ret) - goto err; - } -@@ -2402,7 +2269,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - BTREE_ID_dirents, - SPOS(k.k->p.inode, k.k->p.offset, *i), - BTREE_ITER_intent); -- ret = bch2_btree_iter_traverse(&delete_iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, - &delete_iter, -@@ -2482,7 +2349,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, - if (ret) - return ret; - -- if (!i) -+ if (!i || i->whiteout) - return 0; - - if (inode->first_this_inode) -@@ -2551,7 +2418,8 @@ static int check_root_trans(struct btree_trans *trans) - goto err; - } - -- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); -+ ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, -+ &root_inode, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - -@@ -2583,8 +2451,6 @@ int bch2_check_root(struct bch_fs *c) - return ret; - } - --typedef DARRAY(u32) darray_u32; -- - static bool darray_u32_has(darray_u32 *d, u32 v) - { - darray_for_each(*d, i) -@@ -2621,7 +2487,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, - u32 parent = le32_to_cpu(s.v->fs_path_parent); - - if (darray_u32_has(&subvol_path, parent)) { -- if (fsck_err(c, subvol_loop, "subvolume loop")) -+ printbuf_reset(&buf); -+ prt_printf(&buf, "subvolume loop:\n"); -+ -+ darray_for_each_reverse(subvol_path, i) -+ prt_printf(&buf, "%u ", *i); -+ prt_printf(&buf, "%u", parent); -+ -+ if (fsck_err(trans, subvol_loop, "%s", buf.buf)) - ret = reattach_subvol(trans, s); - break; - } -@@ -2629,7 +2502,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, - bch2_trans_iter_exit(trans, &parent_iter); - bch2_trans_iter_init(trans, &parent_iter, - BTREE_ID_subvolumes, POS(0, parent), 0); -- k = bch2_btree_iter_peek_slot(&parent_iter); -+ k = bch2_btree_iter_peek_slot(trans, &parent_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -2637,7 +2510,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, - if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, - trans, subvol_unreachable, - "unreachable subvolume %s", -- (bch2_bkey_val_to_text(&buf, c, s.s_c), -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, s.s_c), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; -@@ -2793,14 +2667,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) - redo_bi_depth = true; - - if (path_is_dup(&path, inode.bi_inum, snapshot)) { -- /* XXX print path */ -- bch_err(c, "directory structure loop"); -- -- darray_for_each(path, i) -- pr_err("%llu:%u", i->inum, i->snapshot); -- pr_err("%llu:%u", inode.bi_inum, snapshot); -+ printbuf_reset(&buf); -+ prt_printf(&buf, "directory structure loop:\n"); -+ darray_for_each_reverse(path, i) -+ prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); -+ prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); - -- if (fsck_err(trans, dir_loop, "directory structure loop")) { -+ if (fsck_err(trans, dir_loop, "%s", buf.buf)) { - ret = remove_backpointer(trans, &inode); - bch_err_msg(c, ret, "removing dirent"); - if (ret) -@@ -3199,7 +3072,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) - { - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; -- darray_str(devs) = {}; -+ darray_const_str devs = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) -@@ -3240,7 +3113,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: -- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); -+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - -@@ -3257,7 +3130,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - -- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); -+ thr->c = bch2_fs_open(&devs, &thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) -@@ -3294,19 +3167,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; -- set_bit(BCH_FS_fsck_running, &c->flags); -+ set_bit(BCH_FS_in_fsck, &c->flags); - -- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; -- int ret = bch2_run_online_recovery_passes(c); -+ int ret = bch2_run_online_recovery_passes(c, ~0ULL); - -- clear_bit(BCH_FS_fsck_running, &c->flags); -+ clear_bit(BCH_FS_in_fsck, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - -- up(&c->online_fsck_mutex); -+ up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - return ret; - } -@@ -3330,7 +3202,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - -- if (down_trylock(&c->online_fsck_mutex)) { -+ if (down_trylock(&c->recovery.run_lock)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } -@@ -3348,7 +3220,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: -- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); -+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - -@@ -3362,7 +3234,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); -- up(&c->online_fsck_mutex); -+ up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - } - return ret; -diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c -index 339b80770f1d..5cf70108ae2f 100644 ---- a/fs/bcachefs/inode.c -+++ b/fs/bcachefs/inode.c -@@ -14,6 +14,7 @@ - #include "extent_update.h" - #include "fs.h" - #include "inode.h" -+#include "namei.h" - #include "opts.h" - #include "str_hash.h" - #include "snapshot.h" -@@ -240,6 +241,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, - u64 v[2]; - - unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); -@@ -284,13 +286,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, - { - memset(unpacked, 0, sizeof(*unpacked)); - -- unpacked->bi_snapshot = k.k->p.snapshot; -- - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - - unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= 0; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); -@@ -309,6 +310,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - - unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); -@@ -326,8 +328,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, - int bch2_inode_unpack(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) - { -- unpacked->bi_snapshot = k.k->p.snapshot; -- - return likely(k.k->type == KEY_TYPE_inode_v3) - ? bch2_inode_unpack_v3(k, unpacked) - : bch2_inode_unpack_slowpath(k, unpacked); -@@ -367,6 +367,82 @@ int __bch2_inode_peek(struct btree_trans *trans, - return ret; - } - -+int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, -+ u64 inode_nr, u32 snapshot, -+ struct bch_inode_unpacked *inode, -+ unsigned flags) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inode_nr, snapshot), flags); -+ int ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = bkey_is_inode(k.k) -+ ? bch2_inode_unpack(k, inode) -+ : -BCH_ERR_ENOENT_inode; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, -+ subvol_inum inum, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); -+ if (!ret) -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, -+ subvol_inum inum, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0); -+ if (!ret) -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, -+ struct bch_inode_unpacked *inode) -+{ -+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -+} -+ -+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, -+ struct bch_inode_unpacked *root) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, -+ SPOS(0, inum, U32_MAX), -+ BTREE_ITER_all_snapshots, k, ret) { -+ if (k.k->p.offset != inum) -+ break; -+ if (bkey_is_inode(k.k)) { -+ ret = bch2_inode_unpack(k, root); -+ goto out; -+ } -+ } -+ /* We're only called when we know we have an inode for @inum */ -+ BUG_ON(!ret); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ - int bch2_inode_write_flags(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, -@@ -731,10 +807,9 @@ int bch2_trigger_inode(struct btree_trans *trans, - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); - } - -- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); -- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { -- struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; -- int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); -+ s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; -+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { -+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); - if (ret) - return ret; - } -@@ -833,7 +908,8 @@ void bch2_inode_init_early(struct bch_fs *c, - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); - } - --void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, -+void bch2_inode_init_late(struct bch_fs *c, -+ struct bch_inode_unpacked *inode_u, u64 now, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) - { -@@ -857,6 +933,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, - BCH_INODE_OPTS() - #undef x - } -+ -+ if (!S_ISDIR(mode)) -+ inode_u->bi_casefold = 0; -+ -+ if (bch2_inode_casefold(c, inode_u)) -+ inode_u->bi_flags |= BCH_INODE_has_case_insensitive; - } - - void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, -@@ -864,23 +946,10 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - struct bch_inode_unpacked *parent) - { - bch2_inode_init_early(c, inode_u); -- bch2_inode_init_late(inode_u, bch2_current_time(c), -+ bch2_inode_init_late(c, inode_u, bch2_current_time(c), - uid, gid, mode, rdev, parent); - } - --static inline u32 bkey_generation(struct bkey_s_c k) --{ -- switch (k.k->type) { -- case KEY_TYPE_inode: -- case KEY_TYPE_inode_v2: -- BUG(); -- case KEY_TYPE_inode_generation: -- return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); -- default: -- return 0; -- } --} -- - static struct bkey_i_inode_alloc_cursor * - bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) - { -@@ -954,7 +1023,7 @@ int bch2_inode_create(struct btree_trans *trans, - BTREE_ITER_intent); - struct bkey_s_c k; - again: -- while ((k = bch2_btree_iter_peek(iter)).k && -+ while ((k = bch2_btree_iter_peek(trans, iter)).k && - !(ret = bkey_err(k)) && - bkey_lt(k.k->p, POS(0, max))) { - if (pos < iter->pos.offset) -@@ -965,7 +1034,7 @@ int bch2_inode_create(struct btree_trans *trans, - * we've found just one: - */ - pos = iter->pos.offset + 1; -- bch2_btree_iter_set_pos(iter, POS(0, pos)); -+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - } - - if (!ret && pos < max) -@@ -981,12 +1050,12 @@ int bch2_inode_create(struct btree_trans *trans, - - /* Retry from start */ - pos = start = min; -- bch2_btree_iter_set_pos(iter, POS(0, pos)); -+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - le32_add_cpu(&cursor->v.gen, 1); - goto again; - found_slot: -- bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); -- k = bch2_btree_iter_peek_slot(iter); -+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); -+ k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); -@@ -1023,9 +1092,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, - if (ret) - goto err; - -- bch2_btree_iter_set_snapshot(&iter, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - -- k = bch2_btree_iter_peek_max(&iter, end); -+ k = bch2_btree_iter_peek_max(trans, &iter, end); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1056,7 +1125,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, - int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) - { - struct btree_trans *trans = bch2_trans_get(c); -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct bkey_s_c k; - u32 snapshot; - int ret; -@@ -1092,7 +1161,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum.inum, snapshot); -- ret = -EIO; -+ ret = -BCH_ERR_ENOENT_inode; - goto err; - } - -@@ -1113,38 +1182,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) - return ret; - } - --int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, -- subvol_inum inum, -- struct bch_inode_unpacked *inode) --{ -- struct btree_iter iter; -- int ret; -- -- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); -- if (!ret) -- bch2_trans_iter_exit(trans, &iter); -- return ret; --} -- --int bch2_inode_find_by_inum_trans(struct btree_trans *trans, -- subvol_inum inum, -- struct bch_inode_unpacked *inode) --{ -- struct btree_iter iter; -- int ret; -- -- ret = bch2_inode_peek(trans, &iter, inode, inum, 0); -- if (!ret) -- bch2_trans_iter_exit(trans, &iter); -- return ret; --} -- --int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, -- struct bch_inode_unpacked *inode) --{ -- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); --} -- - int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) - { - if (bi->bi_flags & BCH_INODE_unlinked) -@@ -1218,10 +1255,45 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i - return 0; - } - -+int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *bi, unsigned v) -+{ -+ struct bch_fs *c = trans->c; -+ -+#ifdef CONFIG_UNICODE -+ int ret = 0; -+ /* Not supported on individual files. */ -+ if (!S_ISDIR(bi->bi_mode)) -+ return -EOPNOTSUPP; -+ -+ /* -+ * Make sure the dir is empty, as otherwise we'd need to -+ * rehash everything and update the dirent keys. -+ */ -+ ret = bch2_empty_dir_trans(trans, inum); -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); -+ if (ret) -+ return ret; -+ -+ bch2_check_set_feature(c, BCH_FEATURE_casefolding); -+ -+ bi->bi_casefold = v + 1; -+ bi->bi_fields_set |= BIT(Inode_opt_casefold); -+ -+ return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -+#else -+ bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); -+ return -EOPNOTSUPP; -+#endif -+} -+ - static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; -@@ -1256,7 +1328,7 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum, snapshot); -- ret = -EIO; -+ ret = -BCH_ERR_ENOENT_inode; - goto err; - } - -diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h -index 428b9be6af34..77ad2d549541 100644 ---- a/fs/bcachefs/inode.h -+++ b/fs/bcachefs/inode.h -@@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans, - subvol_inum inum, unsigned flags) - { - return __bch2_inode_peek(trans, iter, inode, inum, flags, true); -- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); -- return ret; - } - -+int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, -+ struct bch_inode_unpacked *, unsigned); -+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, -+ subvol_inum, -+ struct bch_inode_unpacked *); -+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *); -+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, -+ struct bch_inode_unpacked *); -+ -+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, -+ struct bch_inode_unpacked *root); -+ - int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); - -@@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); - - void bch2_inode_init_early(struct bch_fs *, - struct bch_inode_unpacked *); --void bch2_inode_init_late(struct bch_inode_unpacked *, u64, -+void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); - void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, -@@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, - - int bch2_inode_rm(struct bch_fs *, subvol_inum); - --int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, -- subvol_inum, -- struct bch_inode_unpacked *); --int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, -- struct bch_inode_unpacked *); --int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, -- struct bch_inode_unpacked *); -- - #define inode_opt_get(_c, _inode, _name) \ - ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) - -@@ -243,6 +246,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) - } - } - -+static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) -+{ -+ /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ -+ return bi->bi_casefold -+ ? bi->bi_casefold - 1 -+ : c->opts.casefold; -+} -+ - /* i_nlink: */ - - static inline unsigned nlink_bias(umode_t mode) -@@ -277,13 +288,16 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i - bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; - - return S_ISDIR(inode->bi_mode) || -+ inode->bi_subvol || - (!inode->bi_nlink && inode_has_bp); - } - - struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); - void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); --int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); -+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); -+int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *, unsigned); - - #include "rebalance.h" - -@@ -295,6 +309,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode - return io_opts_to_rebalance_opts(c, &io_opts); - } - -+#define BCACHEFS_ROOT_SUBVOL_INUM \ -+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) -+ -+static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) -+{ -+ return a.subvol == b.subvol && a.inum == b.inum; -+} -+ - int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); - int bch2_delete_dead_inodes(struct bch_fs *); - -diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h -index b99a5bf1a75e..1f00938b1bdc 100644 ---- a/fs/bcachefs/inode_format.h -+++ b/fs/bcachefs/inode_format.h -@@ -103,7 +103,8 @@ struct bch_inode_generation { - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) \ - x(bi_depth, 32) \ -- x(bi_inodes_32bit, 8) -+ x(bi_inodes_32bit, 8) \ -+ x(bi_casefold, 8) - - /* subset of BCH_INODE_FIELDS */ - #define BCH_INODE_OPTS() \ -@@ -117,7 +118,8 @@ struct bch_inode_generation { - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) \ -- x(inodes_32bit, 8) -+ x(inodes_32bit, 8) \ -+ x(casefold, 8) - - enum inode_opt_id { - #define x(name, ...) \ -@@ -127,6 +129,10 @@ enum inode_opt_id { - Inode_opt_nr, - }; - -+/* -+ * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - -+ * for overlayfs -+ */ - #define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ -@@ -137,7 +143,8 @@ enum inode_opt_id { - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) \ -- x(has_child_snapshot, 9) -+ x(has_child_snapshot, 9) \ -+ x(has_case_insensitive, 10) - - /* bits 20+ reserved for packed fields below: */ - -diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c -index 5353979117b0..cc07729a4b62 100644 ---- a/fs/bcachefs/io_misc.c -+++ b/fs/bcachefs/io_misc.c -@@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - -- k = bch2_btree_iter_peek_slot(iter); -+ k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - return ret; -@@ -115,7 +115,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, - bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) { - struct printbuf buf = PRINTBUF; -- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); - prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); -@@ -163,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - if (ret) - continue; - -- bch2_btree_iter_set_snapshot(iter, snapshot); -+ bch2_btree_iter_set_snapshot(trans, iter, snapshot); - - /* - * peek_max() doesn't have ideal semantics for extents: - */ -- k = bch2_btree_iter_peek_max(iter, end_pos); -+ k = bch2_btree_iter_peek_max(trans, iter, end_pos); - if (!k.k) - break; - -@@ -229,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, - u64 new_i_size, - bool warn) - { -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; - -@@ -398,7 +399,7 @@ case LOGGED_OP_FINSERT_start: - if (ret) - goto err; - } else { -- bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); -+ bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); - - ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -424,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents: - if (ret) - goto btree_err; - -- bch2_btree_iter_set_snapshot(&iter, snapshot); -- bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); -+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot); -+ bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); - - k = insert -- ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) -- : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); -+ ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) -+ : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); - if ((ret = bkey_err(k))) - goto btree_err; - -diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c -index aa91fcf51eec..885c5f71a341 100644 ---- a/fs/bcachefs/io_read.c -+++ b/fs/bcachefs/io_read.c -@@ -9,6 +9,7 @@ - #include "bcachefs.h" - #include "alloc_background.h" - #include "alloc_foreground.h" -+#include "async_objs.h" - #include "btree_update.h" - #include "buckets.h" - #include "checksum.h" -@@ -17,6 +18,7 @@ - #include "data_update.h" - #include "disk_groups.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "io_read.h" - #include "io_misc.h" -@@ -25,8 +27,16 @@ - #include "subvolume.h" - #include "trace.h" - -+#include -+#include - #include - -+#ifdef CONFIG_BCACHEFS_DEBUG -+static unsigned bch2_read_corrupt_ratio; -+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); -+MODULE_PARM_DESC(read_corrupt_ratio, ""); -+#endif -+ - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - - static bool bch2_target_congested(struct bch_fs *c, u16 target) -@@ -73,17 +83,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) - - /* Cache promotion on read */ - --struct promote_op { -- struct rcu_head rcu; -- u64 start_time; -- -- struct rhash_head hash; -- struct bpos pos; -- -- struct data_update write; -- struct bio_vec bi_inline_vecs[]; /* must be last */ --}; -- - static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), -@@ -96,6 +95,33 @@ static inline bool have_io_error(struct bch_io_failures *failed) - return failed && failed->nr; - } - -+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) -+{ -+ EBUG_ON(rbio->split); -+ -+ return rbio->data_update -+ ? container_of(rbio, struct data_update, rbio) -+ : NULL; -+} -+ -+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) -+{ -+ struct data_update *u = rbio_data_update(orig); -+ if (!u) -+ return false; -+ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); -+ unsigned i = 0; -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (ptr->dev == dev && -+ u->data_opts.rewrite_ptrs & BIT(i)) -+ return true; -+ i++; -+ } -+ -+ return false; -+} -+ - static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, -@@ -105,7 +131,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - if (!have_io_error(failed)) { - BUG_ON(!opts.promote_target); - -- if (!(flags & BCH_READ_MAY_PROMOTE)) -+ if (!(flags & BCH_READ_may_promote)) - return -BCH_ERR_nopromote_may_not; - - if (bch2_bkey_has_target(c, k, opts.promote_target)) -@@ -125,98 +151,95 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - return 0; - } - --static void promote_free(struct bch_fs *c, struct promote_op *op) -+static noinline void promote_free(struct bch_read_bio *rbio) - { -- int ret; -+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); -+ struct bch_fs *c = rbio->c; -+ -+ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ BUG_ON(ret); -+ -+ async_object_list_del(c, promote, op->list_idx); - - bch2_data_update_exit(&op->write); - -- ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -- bch_promote_params); -- BUG_ON(ret); -- bch2_write_ref_put(c, BCH_WRITE_REF_promote); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); - } - - static void promote_done(struct bch_write_op *wop) - { -- struct promote_op *op = -- container_of(wop, struct promote_op, write.op); -- struct bch_fs *c = op->write.op.c; -+ struct promote_op *op = container_of(wop, struct promote_op, write.op); -+ struct bch_fs *c = op->write.rbio.c; - -- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], -- op->start_time); -- promote_free(c, op); -+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); -+ promote_free(&op->write.rbio); - } - --static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -+static void promote_start_work(struct work_struct *work) - { -- struct bio *bio = &op->write.op.wbio.bio; -+ struct promote_op *op = container_of(work, struct promote_op, work); - -- trace_and_count(op->write.op.c, read_promote, &rbio->bio); -+ bch2_data_update_read_done(&op->write); -+} - -- /* we now own pages: */ -- BUG_ON(!rbio->bounce); -- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); -+static noinline void promote_start(struct bch_read_bio *rbio) -+{ -+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - -- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, -- sizeof(struct bio_vec) * rbio->bio.bi_vcnt); -- swap(bio->bi_vcnt, rbio->bio.bi_vcnt); -+ trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - -- bch2_data_update_read_done(&op->write, rbio->pick.crc); -+ INIT_WORK(&op->work, promote_start_work); -+ queue_work(rbio->c->write_ref_wq, &op->work); - } - --static struct promote_op *__promote_alloc(struct btree_trans *trans, -- enum btree_id btree_id, -- struct bkey_s_c k, -- struct bpos pos, -- struct extent_ptr_decoded *pick, -- struct bch_io_opts opts, -- unsigned sectors, -- struct bch_read_bio **rbio, -- struct bch_io_failures *failed) -+static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct bpos pos, -+ struct extent_ptr_decoded *pick, -+ unsigned sectors, -+ struct bch_read_bio *orig, -+ struct bch_io_failures *failed) - { - struct bch_fs *c = trans->c; -- struct promote_op *op = NULL; -- struct bio *bio; -- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - int ret; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) -- return ERR_PTR(-BCH_ERR_nopromote_no_writes); -+ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - -- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); -- if (!op) { -- ret = -BCH_ERR_nopromote_enomem; -- goto err; -- } -+ if (!have_io_error(failed)) { -+ update_opts.target = orig->opts.promote_target; -+ update_opts.extra_replicas = 1; -+ update_opts.write_flags |= BCH_WRITE_cached; -+ update_opts.write_flags |= BCH_WRITE_only_specified_devs; -+ } else { -+ update_opts.target = orig->opts.foreground_target; - -- op->start_time = local_clock(); -- op->pos = pos; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ unsigned ptr_bit = 1; -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (bch2_dev_io_failures(failed, ptr->dev) && -+ !ptr_being_rewritten(orig, ptr->dev)) -+ update_opts.rewrite_ptrs |= ptr_bit; -+ ptr_bit <<= 1; -+ } - -- /* -- * We don't use the mempool here because extents that aren't -- * checksummed or compressed can be too big for the mempool: -- */ -- *rbio = kzalloc(sizeof(struct bch_read_bio) + -- sizeof(struct bio_vec) * pages, -- GFP_KERNEL); -- if (!*rbio) { -- ret = -BCH_ERR_nopromote_enomem; -- goto err; -+ if (!update_opts.rewrite_ptrs) -+ return NULL; - } - -- rbio_init(&(*rbio)->bio, opts); -- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) -+ return ERR_PTR(-BCH_ERR_nopromote_no_writes); - -- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { -+ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); -+ if (!op) { - ret = -BCH_ERR_nopromote_enomem; -- goto err; -+ goto err_put; - } - -- (*rbio)->bounce = true; -- (*rbio)->split = true; -- (*rbio)->kmalloc = true; -+ op->start_time = local_clock(); -+ op->pos = pos; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) { -@@ -224,68 +247,61 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, - goto err; - } - -- bio = &op->write.op.wbio.bio; -- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); -- -- struct data_update_opts update_opts = {}; -- -- if (!have_io_error(failed)) { -- update_opts.target = opts.promote_target; -- update_opts.extra_replicas = 1; -- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; -- } else { -- update_opts.target = opts.foreground_target; -- -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- unsigned ptr_bit = 1; -- bkey_for_each_ptr(ptrs, ptr) { -- if (bch2_dev_io_failures(failed, ptr->dev)) -- update_opts.rewrite_ptrs |= ptr_bit; -- ptr_bit <<= 1; -- } -- } -+ ret = async_object_list_add(c, promote, op, &op->list_idx); -+ if (ret < 0) -+ goto err_remove_hash; - - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, - writepoint_hashed((unsigned long) current), -- opts, -+ &orig->opts, - update_opts, - btree_id, k); -+ op->write.type = BCH_DATA_UPDATE_promote; - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ -- if (ret) { -- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, -- bch_promote_params)); -- goto err; -- } -+ if (ret) -+ goto err_remove_list; - -+ rbio_init_fragment(&op->write.rbio.bio, orig); -+ op->write.rbio.bounce = true; -+ op->write.rbio.promote = true; - op->write.op.end_io = promote_done; - -- return op; -+ return &op->write.rbio; -+err_remove_list: -+ async_object_list_del(c, promote, op->list_idx); -+err_remove_hash: -+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params)); - err: -- if (*rbio) -- bio_free_pages(&(*rbio)->bio); -- kfree(*rbio); -- *rbio = NULL; -+ bio_free_pages(&op->write.op.wbio.bio); - /* We may have added to the rhashtable and thus need rcu freeing: */ - kfree_rcu(op, rcu); -- bch2_write_ref_put(c, BCH_WRITE_REF_promote); -+err_put: -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - return ERR_PTR(ret); - } - - noinline --static struct promote_op *promote_alloc(struct btree_trans *trans, -+static struct bch_read_bio *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, -- struct bch_io_opts opts, - unsigned flags, -- struct bch_read_bio **rbio, -+ struct bch_read_bio *orig, - bool *bounce, - bool *read_full, - struct bch_io_failures *failed) - { -+ /* -+ * We're in the retry path, but we don't know what to repair yet, and we -+ * don't want to do a promote here: -+ */ -+ if (failed && !failed->nr) -+ return NULL; -+ - struct bch_fs *c = trans->c; - /* - * if failed != NULL we're not actually doing a promote, we're -@@ -301,18 +317,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); -- struct promote_op *promote; - int ret; - -- ret = should_promote(c, k, pos, opts, flags, failed); -+ ret = should_promote(c, k, pos, orig->opts, flags, failed); - if (ret) - goto nopromote; - -- promote = __promote_alloc(trans, -- k.k->type == KEY_TYPE_reflink_v -- ? BTREE_ID_reflink -- : BTREE_ID_extents, -- k, pos, pick, opts, sectors, rbio, failed); -+ struct bch_read_bio *promote = -+ __promote_alloc(trans, -+ k.k->type == KEY_TYPE_reflink_v -+ ? BTREE_ID_reflink -+ : BTREE_ID_extents, -+ k, pos, pick, sectors, orig, failed); -+ if (!promote) -+ return NULL; -+ - ret = PTR_ERR_OR_ZERO(promote); - if (ret) - goto nopromote; -@@ -321,18 +340,38 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - *read_full = promote_full; - return promote; - nopromote: -- trace_read_nopromote(c, ret); -+ trace_io_read_nopromote(c, ret); - return NULL; - } - -+void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) -+{ -+ if (!op->write.read_done) { -+ prt_printf(out, "parent read: %px\n", op->write.rbio.parent); -+ printbuf_indent_add(out, 2); -+ bch2_read_bio_to_text(out, op->write.rbio.parent); -+ printbuf_indent_sub(out, 2); -+ } -+ -+ bch2_data_update_to_text(out, &op->write); -+} -+ - /* Read */ - - static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) - { -- return bch2_inum_offset_err_msg_trans(trans, out, -- (subvol_inum) { rbio->subvol, read_pos.inode }, -- read_pos.offset << 9); -+ int ret = lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, out, -+ (subvol_inum) { rbio->subvol, read_pos.inode }, -+ read_pos.offset << 9)); -+ if (ret) -+ return ret; -+ -+ if (rbio->data_update) -+ prt_str(out, "(internal move) "); -+ -+ return 0; - } - - static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, -@@ -341,10 +380,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, - bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); - } - --#define READ_RETRY_AVOID 1 --#define READ_RETRY 2 --#define READ_ERR 3 -- - enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, -@@ -375,20 +410,27 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) - { - BUG_ON(rbio->bounce && !rbio->split); - -- if (rbio->promote) -- promote_free(rbio->c, rbio->promote); -- rbio->promote = NULL; -- -- if (rbio->bounce) -- bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ if (rbio->have_ioref) { -+ struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); -+ } - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - -- if (rbio->kmalloc) -- kfree(rbio); -- else -+ if (unlikely(rbio->promote)) { -+ if (!rbio->bio.bi_status) -+ promote_start(rbio); -+ else -+ promote_free(rbio); -+ } else { -+ async_object_list_del(rbio->c, rbio, rbio->list_idx); -+ -+ if (rbio->bounce) -+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ - bio_put(&rbio->bio); -+ } - - rbio = parent; - } -@@ -408,61 +450,115 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) - bio_endio(&rbio->bio); - } - --static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, -- struct bvec_iter bvec_iter, -- struct bch_io_failures *failed, -- unsigned flags) -+static void get_rbio_extent(struct btree_trans *trans, -+ struct bch_read_bio *rbio, -+ struct bkey_buf *sk) - { -- struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; -- struct bkey_buf sk; - struct bkey_s_c k; -- int ret; -+ int ret = lockrestart_do(trans, -+ bkey_err(k = bch2_bkey_get_iter(trans, &iter, -+ rbio->data_btree, rbio->data_pos, 0))); -+ if (ret) -+ return; - -- flags &= ~BCH_READ_LAST_FRAGMENT; -- flags |= BCH_READ_MUST_CLONE; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr(ptrs, ptr) -+ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { -+ bch2_bkey_buf_reassemble(sk, trans->c, k); -+ break; -+ } - -- bch2_bkey_buf_init(&sk); -+ bch2_trans_iter_exit(trans, &iter); -+} -+ -+static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, -+ enum btree_id btree, struct bkey_s_c read_k) -+{ -+ struct bch_fs *c = trans->c; -+ -+ struct data_update *u = rbio_data_update(rbio); -+ if (u) -+ read_k = bkey_i_to_s_c(u->k.k); -+ -+ u64 flags = bch2_bkey_extent_flags(read_k); -+ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) -+ return 0; -+ -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), -+ BTREE_ITER_intent); -+ int ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (!bkey_and_val_eq(k, read_k)) -+ goto out; - -- bch2_trans_iter_init(trans, &iter, rbio->data_btree, -- rbio->read_pos, BTREE_ITER_slots); -+ struct bkey_i *new = bch2_trans_kmalloc(trans, -+ bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); -+ ret = PTR_ERR_OR_ZERO(new) ?: -+ (bkey_reassemble(new, k), 0) ?: -+ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: -+ bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: -+ bch2_trans_commit(trans, NULL, NULL, 0); -+ -+ /* -+ * Propagate key change back to data update path, in particular so it -+ * knows the extent has been poisoned and it's safe to change the -+ * checksum -+ */ -+ if (u && !ret) -+ bch2_bkey_buf_copy(&u->k, c, new); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, -+ struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, -+ struct bch_io_failures *failed, -+ unsigned flags) -+{ -+ struct data_update *u = container_of(rbio, struct data_update, rbio); - retry: - bch2_trans_begin(trans); -- rbio->bio.bi_status = 0; - -- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = lockrestart_do(trans, -+ bkey_err(k = bch2_bkey_get_iter(trans, &iter, -+ u->btree_id, bkey_start_pos(&u->k.k->k), -+ 0))); - if (ret) - goto err; - -- bch2_bkey_buf_reassemble(&sk, c, k); -- k = bkey_i_to_s_c(sk.k); -- -- if (!bch2_bkey_matches_ptr(c, k, -- rbio->pick.ptr, -- rbio->data_pos.offset - -- rbio->pick.crc.offset)) { -+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { - /* extent we wanted to read no longer exists: */ -- rbio->hole = true; -- goto out; -+ rbio->ret = -BCH_ERR_data_read_key_overwritten; -+ goto err; - } - - ret = __bch2_read_extent(trans, rbio, bvec_iter, -- rbio->read_pos, -- rbio->data_btree, -- k, 0, failed, flags); -- if (ret == READ_RETRY) -- goto retry; -- if (ret) -- goto err; --out: -- bch2_rbio_done(rbio); -- bch2_trans_iter_exit(trans, &iter); -- bch2_trans_put(trans); -- bch2_bkey_buf_exit(&sk, c); -- return; -+ bkey_start_pos(&u->k.k->k), -+ u->btree_id, -+ bkey_i_to_s_c(u->k.k), -+ 0, failed, flags, -1); - err: -- rbio->bio.bi_status = BLK_STS_IOERR; -- goto out; -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -+ bch2_err_matches(ret, BCH_ERR_data_read_retry)) -+ goto retry; -+ -+ if (ret) { -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ rbio->ret = ret; -+ } -+ -+ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); -+ return ret; - } - - static void bch2_rbio_retry(struct work_struct *work) -@@ -478,68 +574,105 @@ static void bch2_rbio_retry(struct work_struct *work) - }; - struct bch_io_failures failed = { .nr = 0 }; - -- trace_and_count(c, read_retry, &rbio->bio); -+ struct btree_trans *trans = bch2_trans_get(c); -+ -+ struct bkey_buf sk; -+ bch2_bkey_buf_init(&sk); -+ bkey_init(&sk.k->k); -+ -+ trace_io_read_retry(&rbio->bio); -+ this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], -+ bvec_iter_sectors(rbio->bvec_iter)); - -- if (rbio->retry == READ_RETRY_AVOID) -- bch2_mark_io_failure(&failed, &rbio->pick); -+ get_rbio_extent(trans, rbio, &sk); - -- rbio->bio.bi_status = 0; -+ if (!bkey_deleted(&sk.k->k) && -+ bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) -+ bch2_mark_io_failure(&failed, &rbio->pick, -+ rbio->ret == -BCH_ERR_data_read_retry_csum_err); -+ -+ if (!rbio->split) { -+ rbio->bio.bi_status = 0; -+ rbio->ret = 0; -+ } -+ -+ unsigned subvol = rbio->subvol; -+ struct bpos read_pos = rbio->read_pos; - - rbio = bch2_rbio_free(rbio); - -- flags |= BCH_READ_IN_RETRY; -- flags &= ~BCH_READ_MAY_PROMOTE; -+ flags |= BCH_READ_in_retry; -+ flags &= ~BCH_READ_may_promote; -+ flags &= ~BCH_READ_last_fragment; -+ flags |= BCH_READ_must_clone; - -- if (flags & BCH_READ_NODECODE) { -- bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); -- } else { -- flags &= ~BCH_READ_LAST_FRAGMENT; -- flags |= BCH_READ_MUST_CLONE; -+ int ret = rbio->data_update -+ ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) -+ : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); - -- __bch2_read(c, rbio, iter, inum, &failed, flags); -+ if (ret) { -+ rbio->ret = ret; -+ rbio->bio.bi_status = BLK_STS_IOERR; - } --} - --static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, -- blk_status_t error) --{ -- rbio->retry = retry; -+ if (failed.nr || ret) { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); - -- if (rbio->flags & BCH_READ_IN_RETRY) -- return; -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, -+ (subvol_inum) { subvol, read_pos.inode }, -+ read_pos.offset << 9)); -+ if (rbio->data_update) -+ prt_str(&buf, "(internal move) "); - -- if (retry == READ_ERR) { -- rbio = bch2_rbio_free(rbio); -+ prt_str(&buf, "data read error, "); -+ if (!ret) -+ prt_str(&buf, "successful retry"); -+ else -+ prt_str(&buf, bch2_err_str(ret)); -+ prt_newline(&buf); - -- rbio->bio.bi_status = error; -- bch2_rbio_done(rbio); -- } else { -- bch2_rbio_punt(rbio, bch2_rbio_retry, -- RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ if (!bkey_deleted(&sk.k->k)) { -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); -+ prt_newline(&buf); -+ } -+ -+ bch2_io_failures_to_text(&buf, c, &failed); -+ -+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); - } -+ -+ bch2_rbio_done(rbio); -+ bch2_bkey_buf_exit(&sk, c); -+ bch2_trans_put(trans); - } - --static void bch2_read_io_err(struct work_struct *work) -+static void bch2_rbio_error(struct bch_read_bio *rbio, -+ int ret, blk_status_t blk_error) - { -- struct bch_read_bio *rbio = -- container_of(work, struct bch_read_bio, work); -- struct bio *bio = &rbio->bio; -- struct bch_fs *c = rbio->c; -- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -- struct printbuf buf = PRINTBUF; -+ BUG_ON(ret >= 0); - -- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); -- prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); -+ rbio->ret = ret; -+ rbio->bio.bi_status = blk_error; - -- if (ca) { -- bch2_io_error(ca, BCH_MEMBER_ERROR_read); -- bch_err_ratelimited(ca, "%s", buf.buf); -+ bch2_rbio_parent(rbio)->saw_error = true; -+ -+ if (rbio->flags & BCH_READ_in_retry) -+ return; -+ -+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { -+ bch2_rbio_punt(rbio, bch2_rbio_retry, -+ RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } else { -- bch_err_ratelimited(c, "%s", buf.buf); -- } -+ rbio = bch2_rbio_free(rbio); - -- printbuf_exit(&buf); -- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+ rbio->ret = ret; -+ rbio->bio.bi_status = blk_error; -+ -+ bch2_rbio_done(rbio); -+ } - } - - static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, -@@ -605,33 +738,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) - __bch2_rbio_narrow_crcs(trans, rbio)); - } - --static void bch2_read_csum_err(struct work_struct *work) --{ -- struct bch_read_bio *rbio = -- container_of(work, struct bch_read_bio, work); -- struct bch_fs *c = rbio->c; -- struct bio *src = &rbio->bio; -- struct bch_extent_crc_unpacked crc = rbio->pick.crc; -- struct nonce nonce = extent_nonce(rbio->version, crc); -- struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -- struct printbuf buf = PRINTBUF; -- -- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); -- prt_str(&buf, "data "); -- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); -- -- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -- if (ca) { -- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); -- bch_err_ratelimited(ca, "%s", buf.buf); -- } else { -- bch_err_ratelimited(c, "%s", buf.buf); -- } -- -- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -- printbuf_exit(&buf); --} -- - static void bch2_read_decompress_err(struct work_struct *work) - { - struct bch_read_bio *rbio = -@@ -648,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work) - else - bch_err_ratelimited(c, "%s", buf.buf); - -- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); - printbuf_exit(&buf); - } - -@@ -668,7 +774,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) - else - bch_err_ratelimited(c, "%s", buf.buf); - -- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); - printbuf_exit(&buf); - } - -@@ -678,9 +784,11 @@ static void __bch2_read_endio(struct work_struct *work) - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; -- struct bio *src = &rbio->bio; -- struct bio *dst = &bch2_rbio_parent(rbio)->bio; -- struct bvec_iter dst_iter = rbio->bvec_iter; -+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; -+ struct bch_read_bio *parent = bch2_rbio_parent(rbio); -+ struct bio *src = &rbio->bio; -+ struct bio *dst = &parent->bio; -+ struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; -@@ -698,8 +806,26 @@ static void __bch2_read_endio(struct work_struct *work) - src->bi_iter = rbio->bvec_iter; - } - -+ bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); -+ - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -- if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) -+ bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; -+ -+ /* -+ * Checksum error: if the bio wasn't bounced, we may have been -+ * reading into buffers owned by userspace (that userspace can -+ * scribble over) - retry the read, bouncing it this time: -+ */ -+ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { -+ rbio->flags |= BCH_READ_must_bounce; -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, -+ BLK_STS_IOERR); -+ goto out; -+ } -+ -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); -+ -+ if (!csum_good) - goto csum_err; - - /* -@@ -712,32 +838,40 @@ static void __bch2_read_endio(struct work_struct *work) - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - -- if (rbio->flags & BCH_READ_NODECODE) -- goto nodecode; -+ if (likely(!parent->data_update)) { -+ /* Adjust crc to point to subset of data we want: */ -+ crc.offset += rbio->offset_into_extent; -+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - -- /* Adjust crc to point to subset of data we want: */ -- crc.offset += rbio->offset_into_extent; -- crc.live_size = bvec_iter_sectors(rbio->bvec_iter); -+ if (crc_is_compressed(crc)) { -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; - -- if (crc_is_compressed(crc)) { -- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -- if (ret) -- goto decrypt_err; -+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && -+ !c->opts.no_data_io) -+ goto decompression_err; -+ } else { -+ /* don't need to decrypt the entire bio: */ -+ nonce = nonce_add(nonce, crc.offset << 9); -+ bio_advance(src, crc.offset << 9); - -- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && -- !c->opts.no_data_io) -- goto decompression_err; -- } else { -- /* don't need to decrypt the entire bio: */ -- nonce = nonce_add(nonce, crc.offset << 9); -- bio_advance(src, crc.offset << 9); -+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -+ src->bi_iter.bi_size = dst_iter.bi_size; - -- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -- src->bi_iter.bi_size = dst_iter.bi_size; -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; - -- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -- if (ret) -- goto decrypt_err; -+ if (rbio->bounce) { -+ struct bvec_iter src_iter = src->bi_iter; -+ -+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); -+ } -+ } -+ } else { -+ if (rbio->split) -+ rbio->parent->pick = rbio->pick; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; -@@ -754,12 +888,9 @@ static void __bch2_read_endio(struct work_struct *work) - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; -- -- promote_start(rbio->promote, rbio); -- rbio->promote = NULL; - } --nodecode: -- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { -+ -+ if (likely(!(rbio->flags & BCH_READ_in_retry))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -@@ -767,18 +898,7 @@ static void __bch2_read_endio(struct work_struct *work) - memalloc_nofs_restore(nofs_flags); - return; - csum_err: -- /* -- * Checksum error: if the bio wasn't bounced, we may have been -- * reading into buffers owned by userspace (that userspace can -- * scribble over) - retry the read, bouncing it this time: -- */ -- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { -- rbio->flags |= BCH_READ_MUST_BOUNCE; -- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); -- goto out; -- } -- -- bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); - goto out; - decompression_err: - bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); -@@ -797,27 +917,25 @@ static void bch2_read_endio(struct bio *bio) - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - -- if (rbio->have_ioref) { -- bch2_latency_acct(ca, rbio->submit_time, READ); -- percpu_ref_put(&ca->io_ref); -- } -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, -+ rbio->submit_time, !bio->bi_status); - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (unlikely(bio->bi_status)) { -- bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); - return; - } - -- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || - (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { -- trace_and_count(c, read_reuse_race, &rbio->bio); -+ trace_and_count(c, io_read_reuse_race, &rbio->bio); - -- if (rbio->flags & BCH_READ_RETRY_IF_STALE) -- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); -+ if (rbio->flags & BCH_READ_retry_if_stale) -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); - else -- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); - return; - } - -@@ -856,7 +974,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - - prt_printf(&buf, "memory gen: %u", gen); - -- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); -+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); -@@ -883,15 +1001,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, -- struct bch_io_failures *failed, unsigned flags) -+ struct bch_io_failures *failed, unsigned flags, int dev) - { - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; -- struct promote_op *promote = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); -- int pick_ret; -+ struct data_update *u = rbio_data_update(orig); -+ int ret = 0; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, -@@ -902,19 +1020,35 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); -+ this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], -+ bvec_iter_sectors(iter)); - goto out_read_done; - } -+ -+ if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && -+ !orig->data_update) -+ return -BCH_ERR_extent_poisoned; - retry_pick: -- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); -+ ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); - - /* hole or reservation - just zero fill: */ -- if (!pick_ret) -+ if (!ret) - goto hole; - -- if (unlikely(pick_ret < 0)) { -+ if (unlikely(ret < 0)) { -+ if (ret == -BCH_ERR_data_read_csum_err) { -+ int ret2 = maybe_poison_extent(trans, orig, data_btree, k); -+ if (ret2) { -+ ret = ret2; -+ goto err; -+ } -+ -+ trace_and_count(c, io_read_fail_and_poison, &orig->bio); -+ } -+ - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); -- prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); -+ prt_printf(&buf, "%s\n ", bch2_err_str(ret)); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); -@@ -922,7 +1056,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - goto err; - } - -- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { -+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && -+ !c->chacha20_key_set) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); -@@ -930,10 +1065,12 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); -+ ret = -BCH_ERR_data_read_no_encryption_key; - goto err; - } - -- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, -+ BCH_DEV_READ_REF_io_read); - - /* - * Stale dirty pointers are treated as IO errors, but @failed isn't -@@ -941,56 +1078,58 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ -- if ((flags & BCH_READ_IN_RETRY) && -+ if ((flags & BCH_READ_in_retry) && - !pick.ptr.cached && - ca && - unlikely(dev_ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); -- bch2_mark_io_failure(failed, &pick); -- percpu_ref_put(&ca->io_ref); -+ bch2_mark_io_failure(failed, &pick, false); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - goto retry_pick; - } - -- if (flags & BCH_READ_NODECODE) { -+ if (likely(!u)) { -+ if (!(flags & BCH_READ_last_fragment) || -+ bio_flagged(&orig->bio, BIO_CHAIN)) -+ flags |= BCH_READ_must_clone; -+ -+ narrow_crcs = !(flags & BCH_READ_in_retry) && -+ bch2_can_narrow_extent_crcs(k, pick.crc); -+ -+ if (narrow_crcs && (flags & BCH_READ_user_mapped)) -+ flags |= BCH_READ_must_bounce; -+ -+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -+ -+ if (crc_is_compressed(pick.crc) || -+ (pick.crc.csum_type != BCH_CSUM_none && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ (bch2_csum_type_is_encryption(pick.crc.csum_type) && -+ (flags & BCH_READ_user_mapped)) || -+ (flags & BCH_READ_must_bounce)))) { -+ read_full = true; -+ bounce = true; -+ } -+ } else { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ -- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { -+ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - if (ca) -- percpu_ref_put(&ca->io_ref); -- goto hole; -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_io_read); -+ rbio->ret = -BCH_ERR_data_read_buffer_too_small; -+ goto out_read_done; - } - - iter.bi_size = pick.crc.compressed_size << 9; -- goto get_bio; -- } -- -- if (!(flags & BCH_READ_LAST_FRAGMENT) || -- bio_flagged(&orig->bio, BIO_CHAIN)) -- flags |= BCH_READ_MUST_CLONE; -- -- narrow_crcs = !(flags & BCH_READ_IN_RETRY) && -- bch2_can_narrow_extent_crcs(k, pick.crc); -- -- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) -- flags |= BCH_READ_MUST_BOUNCE; -- -- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -- -- if (crc_is_compressed(pick.crc) || -- (pick.crc.csum_type != BCH_CSUM_none && -- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -- (bch2_csum_type_is_encryption(pick.crc.csum_type) && -- (flags & BCH_READ_USER_MAPPED)) || -- (flags & BCH_READ_MUST_BOUNCE)))) { - read_full = true; -- bounce = true; - } - - if (orig->opts.promote_target || have_io_error(failed)) -- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, -- &rbio, &bounce, &read_full, failed); -+ rbio = promote_alloc(trans, iter, k, &pick, flags, orig, -+ &bounce, &read_full, failed); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); -@@ -1009,7 +1148,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - } --get_bio: -+ - if (rbio) { - /* - * promote already allocated bounce rbio: -@@ -1024,17 +1163,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - -- rbio = rbio_init(bio_alloc_bioset(NULL, -+ rbio = rbio_init_fragment(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), -- orig->opts); -+ orig); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; -- rbio->split = true; -- } else if (flags & BCH_READ_MUST_CLONE) { -+ } else if (flags & BCH_READ_must_clone) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't -@@ -1043,11 +1181,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ -- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, -+ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), -- orig->opts); -+ orig); - rbio->bio.bi_iter = iter; -- rbio->split = true; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; -@@ -1056,77 +1193,70 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - -- rbio->c = c; - rbio->submit_time = local_clock(); -- if (rbio->split) -- rbio->parent = orig; -- else -+ if (!rbio->split) - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = ca != NULL; - rbio->narrow_crcs = narrow_crcs; -- rbio->hole = 0; -- rbio->retry = 0; -+ rbio->ret = 0; - rbio->context = 0; -- /* XXX: only initialize this if needed */ -- rbio->devs_have = bch2_bkey_devs(k); - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->bversion; -- rbio->promote = promote; - INIT_WORK(&rbio->work, NULL); - -- if (flags & BCH_READ_NODECODE) -- orig->pick = pick; -- - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - -+ async_object_list_add(c, rbio, rbio, &rbio->list_idx); -+ -+ /* XXX: also nvme read recovery level */ -+ if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) -+ rbio->bio.bi_opf |= REQ_FUA; -+ - if (rbio->bounce) -- trace_and_count(c, read_bounce, &rbio->bio); -+ trace_and_count(c, io_read_bounce, &rbio->bio); - -- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); -+ if (!u) -+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); -+ else -+ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ -- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) -+ if (ca && pick.ptr.cached && !u) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - -- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { -+ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { - bio_inc_remaining(&orig->bio); -- trace_and_count(c, read_split, &orig->bio); -+ trace_and_count(c, io_read_split, &orig->bio); - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ -- if (!(flags & BCH_READ_IN_RETRY)) -+ if (!(flags & BCH_READ_in_retry)) - bch2_trans_unlock(trans); - else - bch2_trans_unlock_long(trans); - -- if (!rbio->pick.idx) { -+ if (likely(!rbio->pick.do_ec_reconstruct)) { - if (unlikely(!rbio->have_ioref)) { -- struct printbuf buf = PRINTBUF; -- bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); -- prt_printf(&buf, "no device to read from:\n "); -- bch2_bkey_val_to_text(&buf, c, k); -- -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -- -- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ bch2_rbio_error(rbio, -+ -BCH_ERR_data_read_retry_device_offline, -+ BLK_STS_IOERR); - goto out; - } - -@@ -1135,10 +1265,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { -- if (likely(!(flags & BCH_READ_IN_RETRY))) -+ if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } else { -- if (likely(!(flags & BCH_READ_IN_RETRY))) -+ if (likely(!(flags & BCH_READ_in_retry))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); -@@ -1152,15 +1282,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio, k)) { -- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, -+ BLK_STS_IOERR); - goto out; - } - -- if (likely(!(flags & BCH_READ_IN_RETRY))) -+ if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } - out: -- if (likely(!(flags & BCH_READ_IN_RETRY))) { -+ if (likely(!(flags & BCH_READ_in_retry))) { - return 0; - } else { - bch2_trans_unlock(trans); -@@ -1170,54 +1301,57 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - -- ret = rbio->retry; -+ ret = rbio->ret; - rbio = bch2_rbio_free(rbio); - -- if (ret == READ_RETRY_AVOID) { -- bch2_mark_io_failure(failed, &pick); -- ret = READ_RETRY; -- } -- -- if (!ret) -- goto out_read_done; -+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) -+ bch2_mark_io_failure(failed, &pick, -+ ret == -BCH_ERR_data_read_retry_csum_err); - - return ret; - } - - err: -- if (flags & BCH_READ_IN_RETRY) -- return READ_ERR; -+ if (flags & BCH_READ_in_retry) -+ return ret; - -- orig->bio.bi_status = BLK_STS_IOERR; -+ orig->bio.bi_status = BLK_STS_IOERR; -+ orig->ret = ret; - goto out_read_done; - - hole: -+ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], -+ bvec_iter_sectors(iter)); - /* -- * won't normally happen in the BCH_READ_NODECODE -- * (bch2_move_extent()) path, but if we retry and the extent we wanted -- * to read no longer exists we have to signal that: -+ * won't normally happen in the data update (bch2_move_extent()) path, -+ * but if we retry and the extent we wanted to read no longer exists we -+ * have to signal that: - */ -- if (flags & BCH_READ_NODECODE) -- orig->hole = true; -+ if (u) -+ orig->ret = -BCH_ERR_data_read_key_overwritten; - - zero_fill_bio_iter(&orig->bio, iter); - out_read_done: -- if (flags & BCH_READ_LAST_FRAGMENT) -+ if ((flags & BCH_READ_last_fragment) && -+ !(flags & BCH_READ_in_retry)) - bch2_rbio_done(orig); - return 0; - } - --void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, -- struct bvec_iter bvec_iter, subvol_inum inum, -- struct bch_io_failures *failed, unsigned flags) -+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, subvol_inum inum, -+ struct bch_io_failures *failed, -+ struct bkey_buf *prev_read, -+ unsigned flags) - { -- struct btree_trans *trans = bch2_trans_get(c); -+ struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; -+ enum btree_id data_btree; - int ret; - -- BUG_ON(flags & BCH_READ_NODECODE); -+ EBUG_ON(rbio->data_update); - - bch2_bkey_buf_init(&sk); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, -@@ -1225,7 +1359,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - BTREE_ITER_slots); - - while (1) { -- enum btree_id data_btree = BTREE_ID_extents; -+ data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - -@@ -1234,12 +1368,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - if (ret) - goto err; - -- bch2_btree_iter_set_snapshot(&iter, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - -- bch2_btree_iter_set_pos(&iter, -+ bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, bvec_iter.bi_sector)); - -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1257,6 +1391,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - - k = bkey_i_to_s_c(sk.k); - -+ if (unlikely(flags & BCH_READ_in_retry)) { -+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) -+ failed->nr = 0; -+ bch2_bkey_buf_copy(prev_read, c, sk.k); -+ } -+ - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: -@@ -1267,42 +1407,86 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) -- flags |= BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_last_fragment; - - ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, - data_btree, k, -- offset_into_extent, failed, flags); -+ offset_into_extent, failed, flags, -1); -+ swap(bvec_iter.bi_size, bytes); -+ - if (ret) - goto err; - -- if (flags & BCH_READ_LAST_FRAGMENT) -+ if (flags & BCH_READ_last_fragment) - break; - -- swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - err: -+ if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) -+ flags |= BCH_READ_must_bounce; -+ - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && -- ret != READ_RETRY && -- ret != READ_RETRY_AVOID) -+ !bch2_err_matches(ret, BCH_ERR_data_read_retry)) - break; - } - -- bch2_trans_iter_exit(trans, &iter); -+ if (unlikely(ret)) { -+ if (ret != -BCH_ERR_extent_poisoned) { -+ struct printbuf buf = PRINTBUF; -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, -+ bvec_iter.bi_sector << 9)); -+ prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } - -- if (ret) { -- struct printbuf buf = PRINTBUF; -- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); -- prt_printf(&buf, "read error %i from btree lookup", ret); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ rbio->ret = ret; - -- rbio->bio.bi_status = BLK_STS_IOERR; -- bch2_rbio_done(rbio); -+ if (!(flags & BCH_READ_in_retry)) -+ bch2_rbio_done(rbio); - } - -- bch2_trans_put(trans); -+ bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); -+ return ret; -+} -+ -+static const char * const bch2_read_bio_flags[] = { -+#define x(n) #n, -+ BCH_READ_FLAGS() -+#undef x -+ NULL -+}; -+ -+void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) -+{ -+ u64 now = local_clock(); -+ prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0); -+ prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0); -+ -+ if (!rbio->split) -+ prt_printf(out, "end_io:\t%ps\n", rbio->end_io); -+ else -+ prt_printf(out, "parent:\t%px\n", rbio->parent); -+ -+ prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); -+ -+ prt_printf(out, "promote:\t%u\n", rbio->promote); -+ prt_printf(out, "bounce:\t%u\n", rbio->bounce); -+ prt_printf(out, "split:\t%u\n", rbio->split); -+ prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); -+ prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); -+ prt_printf(out, "context:\t%u\n", rbio->context); -+ prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); -+ -+ prt_printf(out, "flags:\t"); -+ bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); -+ prt_newline(out); -+ -+ bch2_bio_to_text(out, &rbio->bio); - } - - void bch2_fs_io_read_exit(struct bch_fs *c) -@@ -1311,10 +1495,18 @@ void bch2_fs_io_read_exit(struct bch_fs *c) - rhashtable_destroy(&c->promote_table); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); -+ mempool_exit(&c->bio_bounce_pages); - } - - int bch2_fs_io_read_init(struct bch_fs *c) - { -+ if (mempool_init_page_pool(&c->bio_bounce_pages, -+ max_t(unsigned, -+ c->opts.btree_node_size, -+ c->opts.encoded_extent_max) / -+ PAGE_SIZE, 0)) -+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init; -+ - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; -diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h -index a82e8a94ccb6..c08b9c047b3e 100644 ---- a/fs/bcachefs/io_read.h -+++ b/fs/bcachefs/io_read.h -@@ -3,6 +3,8 @@ - #define _BCACHEFS_IO_READ_H - - #include "bkey_buf.h" -+#include "btree_iter.h" -+#include "extents_types.h" - #include "reflink.h" - - struct bch_read_bio { -@@ -35,19 +37,21 @@ struct bch_read_bio { - u16 flags; - union { - struct { -- u16 bounce:1, -+ u16 data_update:1, -+ promote:1, -+ bounce:1, - split:1, -- kmalloc:1, - have_ioref:1, - narrow_crcs:1, -- hole:1, -- retry:2, -+ saw_error:1, - context:2; - }; - u16 _state; - }; -- -- struct bch_devs_list devs_have; -+ s16 ret; -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ unsigned list_idx; -+#endif - - struct extent_ptr_decoded pick; - -@@ -65,8 +69,6 @@ struct bch_read_bio { - struct bpos data_pos; - struct bversion version; - -- struct promote_op *promote; -- - struct bch_io_opts opts; - - struct work_struct work; -@@ -108,64 +110,103 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, - return 0; - } - -+#define BCH_READ_FLAGS() \ -+ x(retry_if_stale) \ -+ x(may_promote) \ -+ x(user_mapped) \ -+ x(last_fragment) \ -+ x(must_bounce) \ -+ x(must_clone) \ -+ x(in_retry) -+ -+enum __bch_read_flags { -+#define x(n) __BCH_READ_##n, -+ BCH_READ_FLAGS() -+#undef x -+}; -+ - enum bch_read_flags { -- BCH_READ_RETRY_IF_STALE = 1 << 0, -- BCH_READ_MAY_PROMOTE = 1 << 1, -- BCH_READ_USER_MAPPED = 1 << 2, -- BCH_READ_NODECODE = 1 << 3, -- BCH_READ_LAST_FRAGMENT = 1 << 4, -- -- /* internal: */ -- BCH_READ_MUST_BOUNCE = 1 << 5, -- BCH_READ_MUST_CLONE = 1 << 6, -- BCH_READ_IN_RETRY = 1 << 7, -+#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), -+ BCH_READ_FLAGS() -+#undef x - }; - - int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, -- struct bch_io_failures *, unsigned); -+ struct bch_io_failures *, unsigned, int); - - static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) - { -- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, -- data_btree, k, offset_into_extent, NULL, flags); -+ int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, -+ data_btree, k, offset_into_extent, NULL, flags, -1); -+ /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ -+ WARN(ret, "unhandled error from __bch2_read_extent()"); - } - --void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, -- subvol_inum, struct bch_io_failures *, unsigned flags); -+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, -+ subvol_inum, -+ struct bch_io_failures *, struct bkey_buf *, unsigned flags); - - static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) - { -- struct bch_io_failures failed = { .nr = 0 }; -- - BUG_ON(rbio->_state); - -- rbio->c = c; -- rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - -- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, -- BCH_READ_RETRY_IF_STALE| -- BCH_READ_MAY_PROMOTE| -- BCH_READ_USER_MAPPED); -+ bch2_trans_run(c, -+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, -+ BCH_READ_retry_if_stale| -+ BCH_READ_may_promote| -+ BCH_READ_user_mapped)); -+} -+ -+static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, -+ struct bch_read_bio *orig) -+{ -+ struct bch_read_bio *rbio = to_rbio(bio); -+ -+ rbio->c = orig->c; -+ rbio->_state = 0; -+ rbio->flags = 0; -+ rbio->ret = 0; -+ rbio->split = true; -+ rbio->parent = orig; -+ rbio->opts = orig->opts; -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ rbio->list_idx = 0; -+#endif -+ return rbio; - } - - static inline struct bch_read_bio *rbio_init(struct bio *bio, -- struct bch_io_opts opts) -+ struct bch_fs *c, -+ struct bch_io_opts opts, -+ bio_end_io_t end_io) - { - struct bch_read_bio *rbio = to_rbio(bio); - -- rbio->_state = 0; -- rbio->promote = NULL; -- rbio->opts = opts; -+ rbio->start_time = local_clock(); -+ rbio->c = c; -+ rbio->_state = 0; -+ rbio->flags = 0; -+ rbio->ret = 0; -+ rbio->opts = opts; -+ rbio->bio.bi_end_io = end_io; -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ rbio->list_idx = 0; -+#endif - return rbio; - } - -+struct promote_op; -+void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); -+void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); -+ - void bch2_fs_io_read_exit(struct bch_fs *); - int bch2_fs_io_read_init(struct bch_fs *); - -diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c -index 03892388832b..52a60982a66b 100644 ---- a/fs/bcachefs/io_write.c -+++ b/fs/bcachefs/io_write.c -@@ -6,6 +6,7 @@ - - #include "bcachefs.h" - #include "alloc_foreground.h" -+#include "async_objs.h" - #include "bkey_buf.h" - #include "bset.h" - #include "btree_update.h" -@@ -15,6 +16,7 @@ - #include "compress.h" - #include "debug.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extent_update.h" - #include "inode.h" -@@ -34,6 +36,12 @@ - #include - #include - -+#ifdef CONFIG_BCACHEFS_DEBUG -+static unsigned bch2_write_corrupt_ratio; -+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); -+MODULE_PARM_DESC(write_corrupt_ratio, ""); -+#endif -+ - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - - static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, -@@ -162,9 +170,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, - *i_sectors_delta = 0; - *disk_sectors_delta = 0; - -- bch2_trans_copy_iter(&iter, extent_iter); -+ bch2_trans_copy_iter(trans, &iter, extent_iter); - -- for_each_btree_key_max_continue_norestart(iter, -+ for_each_btree_key_max_continue_norestart(trans, iter, - new->k.p, BTREE_ITER_slots, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), -@@ -249,10 +257,35 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - } - - if (i_sectors_delta) { -+ s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); -+ if (unlikely(bi_sectors + i_sectors_delta < 0)) { -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", -+ extent_iter->pos.inode, bi_sectors, i_sectors_delta); -+ -+ bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); -+ if (print) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ -+ if (i_sectors_delta < 0) -+ i_sectors_delta = -bi_sectors; -+ else -+ i_sectors_delta = 0; -+ } -+ - le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); - inode_update_flags = 0; - } - -+ /* -+ * extents, dirents and xattrs updates require that an inode update also -+ * happens - to ensure that if a key exists in one of those btrees with -+ * a given snapshot ID an inode is also present - so we may have to skip -+ * the nojournal optimization: -+ */ - if (inode->k.p.snapshot != iter.snapshot) { - inode->k.p.snapshot = iter.snapshot; - inode_update_flags = 0; -@@ -286,7 +319,7 @@ int bch2_extent_update(struct btree_trans *trans, - * path already traversed at iter->pos because - * bch2_trans_extent_update() will use it to attempt extent merging - */ -- ret = __bch2_btree_iter_traverse(iter); -+ ret = __bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - -@@ -331,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans, - - if (i_sectors_delta_total) - *i_sectors_delta_total += i_sectors_delta; -- bch2_btree_iter_set_pos(iter, next_pos); -+ bch2_btree_iter_set_pos(trans, iter, next_pos); - return 0; - } - -@@ -370,11 +403,10 @@ static int bch2_write_index_default(struct bch_write_op *op) - bkey_start_pos(&sk.k->k), - BTREE_ITER_slots|BTREE_ITER_intent); - -- ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: -- bch2_extent_update(trans, inum, &iter, sk.k, -+ ret = bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, -- op->flags & BCH_WRITE_CHECK_ENOSPC); -+ op->flags & BCH_WRITE_check_enospc); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -396,29 +428,36 @@ static int bch2_write_index_default(struct bch_write_op *op) - - /* Writes */ - --static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, -- u64 offset) -+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) - { -- bch2_inum_offset_err_msg(op->c, out, -- (subvol_inum) { op->subvol, op->pos.inode, }, -- offset << 9); -- prt_printf(out, "write error%s: ", -- op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); --} -+ struct printbuf buf = PRINTBUF; - --void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) --{ -- __bch2_write_op_error(out, op, op->pos.offset); --} -+ if (op->subvol) { -+ bch2_inum_offset_err_msg(op->c, &buf, -+ (subvol_inum) { op->subvol, op->pos.inode, }, -+ offset << 9); -+ } else { -+ struct bpos pos = op->pos; -+ pos.offset = offset; -+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos); -+ } - --static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, -- struct bch_write_op *op, u64 offset) --{ -- bch2_inum_offset_err_msg_trans(trans, out, -- (subvol_inum) { op->subvol, op->pos.inode, }, -- offset << 9); -- prt_printf(out, "write error%s: ", -- op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); -+ prt_str(&buf, "write error: "); -+ -+ va_list args; -+ va_start(args, fmt); -+ prt_vprintf(&buf, fmt, args); -+ va_end(args); -+ -+ if (op->flags & BCH_WRITE_move) { -+ struct data_update *u = container_of(op, struct data_update, op); -+ -+ prt_printf(&buf, "\n from internal move "); -+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); -+ } -+ -+ bch_err_ratelimited(op->c, "%s", buf.buf); -+ printbuf_exit(&buf); - } - - void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, -@@ -428,15 +467,28 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - struct bch_write_bio *n; -+ unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; -+ unsigned ref_idx = type == BCH_DATA_btree -+ ? BCH_DEV_READ_REF_btree_node_write -+ : BCH_DEV_WRITE_REF_io_write; - - BUG_ON(c->opts.nochanges); - -+ const struct bch_extent_ptr *last = NULL; -+ bkey_for_each_ptr(ptrs, ptr) -+ last = ptr; -+ - bkey_for_each_ptr(ptrs, ptr) { -+ /* -+ * XXX: btree writes should be using io_ref[WRITE], but we -+ * aren't retrying failed btree writes yet (due to device -+ * removal/ro): -+ */ - struct bch_dev *ca = nocow - ? bch2_dev_have_ref(c, ptr->dev) -- : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); -+ : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - -- if (to_entry(ptr + 1) < ptrs.end) { -+ if (ptr != last) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; -@@ -493,12 +545,13 @@ static void bch2_write_done(struct closure *cl) - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - bch2_disk_reservation_put(c, &op->res); - -- if (!(op->flags & BCH_WRITE_MOVE)) -- bch2_write_ref_put(c, BCH_WRITE_REF_write); -+ if (!(op->flags & BCH_WRITE_move)) -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - - EBUG_ON(cl->parent); - closure_debug_destroy(cl); -+ async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); - } -@@ -516,7 +569,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) - test_bit(ptr->dev, op->failed.d)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) -- return -EIO; -+ return -BCH_ERR_data_write_io; - } - - if (dst != src) -@@ -539,7 +592,7 @@ static void __bch2_write_index(struct bch_write_op *op) - unsigned dev; - int ret = 0; - -- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { -+ if (unlikely(op->flags & BCH_WRITE_io_error)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; -@@ -548,7 +601,7 @@ static void __bch2_write_index(struct bch_write_op *op) - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - -- ret = !(op->flags & BCH_WRITE_MOVE) -+ ret = !(op->flags & BCH_WRITE_move) - ? bch2_write_index_default(op) - : bch2_data_update_index_update(op); - -@@ -560,11 +613,8 @@ static void __bch2_write_index(struct bch_write_op *op) - if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - -- struct printbuf buf = PRINTBUF; -- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); -- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -+ bch2_write_op_error(op, bkey_start_offset(&insert->k), -+ "btree update error: %s", bch2_err_str(ret)); - } - - if (ret) -@@ -573,21 +623,29 @@ static void __bch2_write_index(struct bch_write_op *op) - out: - /* If some a bucket wasn't written, we can't erasure code it: */ - for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) -- bch2_open_bucket_write_error(c, &op->open_buckets, dev); -+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); - - bch2_open_buckets_put(c, &op->open_buckets); - return; - err: - keys->top = keys->keys; - op->error = ret; -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - goto out; - } - - static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) - { - if (state != wp->state) { -+ struct task_struct *p = current; - u64 now = ktime_get_ns(); -+ u64 runtime = p->se.sum_exec_runtime + -+ (now - p->se.exec_start); -+ -+ if (state == WRITE_POINT_runnable) -+ wp->last_runtime = runtime; -+ else if (wp->state == WRITE_POINT_runnable) -+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; - - if (wp->last_state_change && - time_after64(now, wp->last_state_change)) -@@ -601,7 +659,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) - { - enum write_point_state state; - -- state = running ? WRITE_POINT_running : -+ state = running ? WRITE_POINT_runnable: - !list_empty(&wp->writes) ? WRITE_POINT_waiting_io - : WRITE_POINT_stopped; - -@@ -615,8 +673,8 @@ static CLOSURE_CALLBACK(bch2_write_index) - struct workqueue_struct *wq = index_update_wq(op); - unsigned long flags; - -- if ((op->flags & BCH_WRITE_SUBMITTED) && -- (op->flags & BCH_WRITE_MOVE)) -+ if ((op->flags & BCH_WRITE_submitted) && -+ (op->flags & BCH_WRITE_move)) - bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - - spin_lock_irqsave(&wp->writes_lock, flags); -@@ -654,11 +712,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) - if (!op) - break; - -- op->flags |= BCH_WRITE_IN_WORKER; -+ op->flags |= BCH_WRITE_in_worker; - - __bch2_write_index(op); - -- if (!(op->flags & BCH_WRITE_SUBMITTED)) -+ if (!(op->flags & BCH_WRITE_submitted)) - __bch2_write(op); - else - bch2_write_done(&op->cl); -@@ -676,13 +734,24 @@ static void bch2_write_endio(struct bio *bio) - ? bch2_dev_have_ref(c, wbio->dev) - : NULL; - -- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, -- op->pos.inode, -- wbio->inode_offset << 9, -- "data write error: %s", -- bch2_blk_status_to_str(bio->bi_status))) { -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, -+ wbio->submit_time, !bio->bi_status); -+ -+ if (unlikely(bio->bi_status)) { -+ if (ca) -+ bch_err_inum_offset_ratelimited(ca, -+ op->pos.inode, -+ wbio->inode_offset << 9, -+ "data write error: %s", -+ bch2_blk_status_to_str(bio->bi_status)); -+ else -+ bch_err_inum_offset_ratelimited(c, -+ op->pos.inode, -+ wbio->inode_offset << 9, -+ "data write error: %s", -+ bch2_blk_status_to_str(bio->bi_status)); - set_bit(wbio->dev, op->failed.d); -- op->flags |= BCH_WRITE_IO_ERROR; -+ op->flags |= BCH_WRITE_io_error; - } - - if (wbio->nocow) { -@@ -692,10 +761,9 @@ static void bch2_write_endio(struct bio *bio) - set_bit(wbio->dev, op->devs_need_flush->d); - } - -- if (wbio->have_ioref) { -- bch2_latency_acct(ca, wbio->submit_time, WRITE); -- percpu_ref_put(&ca->io_ref); -- } -+ if (wbio->have_ioref) -+ enumerated_ref_put(&ca->io_ref[WRITE], -+ BCH_DEV_WRITE_REF_io_write); - - if (wbio->bounce) - bch2_bio_free_pages_pool(c, bio); -@@ -729,7 +797,10 @@ static void init_append_extent(struct bch_write_op *op, - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, -- op->flags & BCH_WRITE_CACHED); -+ op->flags & BCH_WRITE_cached); -+ -+ if (!(op->flags & BCH_WRITE_move)) -+ bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); - - bch2_keylist_push(&op->insert_keys); - } -@@ -789,7 +860,6 @@ static int bch2_write_rechecksum(struct bch_fs *c, - { - struct bio *bio = &op->wbio.bio; - struct bch_extent_crc_unpacked new_crc; -- int ret; - - /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ - -@@ -797,10 +867,10 @@ static int bch2_write_rechecksum(struct bch_fs *c, - bch2_csum_type_is_encryption(new_csum_type)) - new_csum_type = op->crc.csum_type; - -- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, -- NULL, &new_crc, -- op->crc.offset, op->crc.live_size, -- new_csum_type); -+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, -+ NULL, &new_crc, -+ op->crc.offset, op->crc.live_size, -+ new_csum_type); - if (ret) - return ret; - -@@ -810,44 +880,12 @@ static int bch2_write_rechecksum(struct bch_fs *c, - return 0; - } - --static int bch2_write_decrypt(struct bch_write_op *op) --{ -- struct bch_fs *c = op->c; -- struct nonce nonce = extent_nonce(op->version, op->crc); -- struct bch_csum csum; -- int ret; -- -- if (!bch2_csum_type_is_encryption(op->crc.csum_type)) -- return 0; -- -- /* -- * If we need to decrypt data in the write path, we'll no longer be able -- * to verify the existing checksum (poly1305 mac, in this case) after -- * it's decrypted - this is the last point we'll be able to reverify the -- * checksum: -- */ -- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) -- return -EIO; -- -- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -- op->crc.csum_type = 0; -- op->crc.csum = (struct bch_csum) { 0, 0 }; -- return ret; --} -- --static enum prep_encoded_ret { -- PREP_ENCODED_OK, -- PREP_ENCODED_ERR, -- PREP_ENCODED_CHECKSUM_ERR, -- PREP_ENCODED_DO_WRITE, --} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) - { - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; -- -- if (!(op->flags & BCH_WRITE_DATA_ENCODED)) -- return PREP_ENCODED_OK; -+ struct bch_csum csum; -+ int ret = 0; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - -@@ -858,12 +896,13 @@ static enum prep_encoded_ret { - (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || - op->incompressible)) { - if (!crc_is_compressed(op->crc) && -- op->csum_type != op->crc.csum_type && -- bch2_write_rechecksum(c, op, op->csum_type) && -- !c->opts.no_data_io) -- return PREP_ENCODED_CHECKSUM_ERR; -+ op->csum_type != op->crc.csum_type) { -+ ret = bch2_write_rechecksum(c, op, op->csum_type); -+ if (ret) -+ return ret; -+ } - -- return PREP_ENCODED_DO_WRITE; -+ return 1; - } - - /* -@@ -871,20 +910,24 @@ static enum prep_encoded_ret { - * is, we have to decompress it: - */ - if (crc_is_compressed(op->crc)) { -- struct bch_csum csum; -- -- if (bch2_write_decrypt(op)) -- return PREP_ENCODED_CHECKSUM_ERR; -- - /* Last point we can still verify checksum: */ -- csum = bch2_checksum_bio(c, op->crc.csum_type, -- extent_nonce(op->version, op->crc), -- bio); -+ struct nonce nonce = extent_nonce(op->version, op->crc); -+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) -- return PREP_ENCODED_CHECKSUM_ERR; -+ goto csum_err; -+ -+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) { -+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); -+ if (ret) -+ return ret; -+ -+ op->crc.csum_type = 0; -+ op->crc.csum = (struct bch_csum) { 0, 0 }; -+ } - -- if (bch2_bio_uncompress_inplace(op, bio)) -- return PREP_ENCODED_ERR; -+ ret = bch2_bio_uncompress_inplace(op, bio); -+ if (ret) -+ return ret; - } - - /* -@@ -896,22 +939,44 @@ static enum prep_encoded_ret { - * If the data is checksummed and we're only writing a subset, - * rechecksum and adjust bio to point to currently live data: - */ -- if ((op->crc.live_size != op->crc.uncompressed_size || -- op->crc.csum_type != op->csum_type) && -- bch2_write_rechecksum(c, op, op->csum_type) && -- !c->opts.no_data_io) -- return PREP_ENCODED_CHECKSUM_ERR; -+ if (op->crc.live_size != op->crc.uncompressed_size || -+ op->crc.csum_type != op->csum_type) { -+ ret = bch2_write_rechecksum(c, op, op->csum_type); -+ if (ret) -+ return ret; -+ } - - /* - * If we want to compress the data, it has to be decrypted: - */ -- if ((op->compression_opt || -- bch2_csum_type_is_encryption(op->crc.csum_type) != -- bch2_csum_type_is_encryption(op->csum_type)) && -- bch2_write_decrypt(op)) -- return PREP_ENCODED_CHECKSUM_ERR; -+ if (bch2_csum_type_is_encryption(op->crc.csum_type) && -+ (op->compression_opt || op->crc.csum_type != op->csum_type)) { -+ struct nonce nonce = extent_nonce(op->version, op->crc); -+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); -+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) -+ goto csum_err; - -- return PREP_ENCODED_OK; -+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); -+ if (ret) -+ return ret; -+ -+ op->crc.csum_type = 0; -+ op->crc.csum = (struct bch_csum) { 0, 0 }; -+ } -+ -+ return 0; -+csum_err: -+ bch2_write_op_error(op, op->pos.offset, -+ "error verifying existing checksum while moving existing data (memory corruption?)\n" -+ " expected %0llx:%0llx got %0llx:%0llx type %s", -+ op->crc.csum.hi, -+ op->crc.csum.lo, -+ csum.hi, -+ csum.lo, -+ op->crc.csum_type < BCH_CSUM_NR -+ ? __bch2_csum_types[op->crc.csum_type] -+ : "(unknown)"); -+ return -BCH_ERR_data_write_csum; - } - - static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, -@@ -926,43 +991,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - bool page_alloc_failed = false; - int ret, more = 0; - -+ if (op->incompressible) -+ op->compression_opt = 0; -+ - BUG_ON(!bio_sectors(src)); - - ec_buf = bch2_writepoint_ec_buf(c, wp); - -- switch (bch2_write_prep_encoded_data(op, wp)) { -- case PREP_ENCODED_OK: -- break; -- case PREP_ENCODED_ERR: -- ret = -EIO; -- goto err; -- case PREP_ENCODED_CHECKSUM_ERR: -- goto csum_err; -- case PREP_ENCODED_DO_WRITE: -- /* XXX look for bug here */ -- if (ec_buf) { -- dst = bch2_write_bio_alloc(c, wp, src, -- &page_alloc_failed, -- ec_buf); -- bio_copy_data(dst, src); -- bounce = true; -+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) { -+ ret = bch2_write_prep_encoded_data(op, wp); -+ if (ret < 0) -+ goto err; -+ if (ret) { -+ if (ec_buf) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bio_copy_data(dst, src); -+ bounce = true; -+ } -+ init_append_extent(op, wp, op->version, op->crc); -+ goto do_write; - } -- init_append_extent(op, wp, op->version, op->crc); -- goto do_write; - } - - if (ec_buf || - op->compression_opt || - (op->csum_type && -- !(op->flags & BCH_WRITE_PAGES_STABLE)) || -+ !(op->flags & BCH_WRITE_pages_stable)) || - (bch2_csum_type_is_encryption(op->csum_type) && -- !(op->flags & BCH_WRITE_PAGES_OWNED))) { -+ !(op->flags & BCH_WRITE_pages_owned))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } - -+#ifdef CONFIG_BCACHEFS_DEBUG -+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); -+ if (!bounce && write_corrupt_ratio) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bounce = true; -+ } -+#endif - saved_iter = dst->bi_iter; - - do { -@@ -976,7 +1049,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - break; - - BUG_ON(op->compression_opt && -- (op->flags & BCH_WRITE_DATA_ENCODED) && -+ (op->flags & BCH_WRITE_data_encoded) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - -@@ -1014,7 +1087,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - } - } - -- if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ if ((op->flags & BCH_WRITE_data_encoded) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { -@@ -1032,12 +1105,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - * data can't be modified (by userspace) while it's in - * flight. - */ -- if (bch2_rechecksum_bio(c, src, version, op->crc, -+ ret = bch2_rechecksum_bio(c, src, version, op->crc, - &crc, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), -- op->csum_type)) -- goto csum_err; -+ op->csum_type); -+ if (ret) -+ goto err; - /* - * rchecksum_bio sets compression_type on crc from op->crc, - * this isn't always correct as sometimes we're changing -@@ -1046,13 +1120,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { -- if ((op->flags & BCH_WRITE_DATA_ENCODED) && -- bch2_rechecksum_bio(c, src, version, op->crc, -+ if ((op->flags & BCH_WRITE_data_encoded) && -+ (ret = bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), -- op->crc.csum_type)) -- goto csum_err; -+ op->crc.csum_type))) -+ goto err; - - crc.compressed_size = dst_len >> 9; - crc.uncompressed_size = src_len >> 9; -@@ -1072,6 +1146,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - - init_append_extent(op, wp, version, crc); - -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (write_corrupt_ratio) { -+ swap(dst->bi_iter.bi_size, dst_len); -+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+#endif -+ - if (dst != src) - bio_advance(dst, dst_len); - bio_advance(src, src_len); -@@ -1103,16 +1185,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - do_write: - *_dst = dst; - return more; --csum_err: -- { -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error(&buf, op); -- prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } -- -- ret = -EIO; - err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); -@@ -1190,39 +1262,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) - { - struct bch_fs *c = op->c; - struct btree_trans *trans = bch2_trans_get(c); -+ int ret = 0; - - for_each_keylist_key(&op->insert_keys, orig) { -- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, -+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); - })); -- -- if (ret && !bch2_err_matches(ret, EROFS)) { -- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); -- -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); -- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } -- -- if (ret) { -- op->error = ret; -+ if (ret) - break; -- } - } - - bch2_trans_put(trans); -+ -+ if (ret && !bch2_err_matches(ret, EROFS)) { -+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); -+ bch2_write_op_error(op, bkey_start_offset(&insert->k), -+ "btree update error: %s", bch2_err_str(ret)); -+ } -+ -+ if (ret) -+ op->error = ret; - } - - static void __bch2_nocow_write_done(struct bch_write_op *op) - { -- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { -- op->error = -EIO; -- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) -+ if (unlikely(op->flags & BCH_WRITE_io_error)) { -+ op->error = -BCH_ERR_data_write_io; -+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) - bch2_nocow_write_convert_unwritten(op); - } - -@@ -1251,7 +1320,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - struct bucket_to_lock *stale_at; - int stale, ret; - -- if (op->flags & BCH_WRITE_MOVE) -+ if (op->flags & BCH_WRITE_move) - return; - - darray_init(&buckets); -@@ -1275,7 +1344,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - if (ret) - break; - -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - break; -@@ -1294,7 +1363,8 @@ static void bch2_nocow_write(struct bch_write_op *op) - /* Get iorefs before dropping btree locks: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { -- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, -+ BCH_DEV_WRITE_REF_io_write); - if (unlikely(!ca)) - goto err_get_ioref; - -@@ -1309,7 +1379,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - }), GFP_KERNEL|__GFP_NOFAIL); - - if (ptr->unwritten) -- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; -+ op->flags |= BCH_WRITE_convert_unwritten; - } - - /* Unlock before taking nocow locks, doing IO: */ -@@ -1317,7 +1387,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - bch2_trans_unlock(trans); - - bch2_cut_front(op->pos, op->insert_keys.top); -- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) -+ if (op->flags & BCH_WRITE_convert_unwritten) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - darray_for_each(buckets, i) { -@@ -1342,7 +1412,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - } - - op->pos.offset += bio_sectors(bio); -@@ -1352,13 +1422,14 @@ static void bch2_nocow_write(struct bch_write_op *op) - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - closure_get(&op->cl); -+ - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); -- if (op->flags & BCH_WRITE_SUBMITTED) -+ if (op->flags & BCH_WRITE_submitted) - break; -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - } - out: - bch2_trans_iter_exit(trans, &iter); -@@ -1370,21 +1441,18 @@ static void bch2_nocow_write(struct bch_write_op *op) - darray_exit(&buckets); - - if (ret) { -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error(&buf, op); -- prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -+ bch2_write_op_error(op, op->pos.offset, -+ "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - op->error = ret; -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - } - - /* fallback to cow write path? */ -- if (!(op->flags & BCH_WRITE_SUBMITTED)) { -+ if (!(op->flags & BCH_WRITE_submitted)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; -- } else if (op->flags & BCH_WRITE_SYNC) { -+ } else if (op->flags & BCH_WRITE_sync) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl.work); - } else { -@@ -1398,7 +1466,8 @@ static void bch2_nocow_write(struct bch_write_op *op) - return; - err_get_ioref: - darray_for_each(buckets, i) -- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); -+ enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], -+ BCH_DEV_WRITE_REF_io_write); - - /* Fall back to COW path: */ - goto out; -@@ -1414,7 +1483,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - "pointer to invalid bucket in nocow path on device %llu\n %s", - stale_at->b.inode, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -- ret = -EIO; -+ ret = -BCH_ERR_data_write_invalid_ptr; - } else { - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; -@@ -1436,7 +1505,7 @@ static void __bch2_write(struct bch_write_op *op) - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); -- if (op->flags & BCH_WRITE_SUBMITTED) -+ if (op->flags & BCH_WRITE_submitted) - goto out_nofs_restore; - } - again: -@@ -1466,7 +1535,7 @@ static void __bch2_write(struct bch_write_op *op) - ret = bch2_trans_run(c, lockrestart_do(trans, - bch2_alloc_sectors_start_trans(trans, - op->target, -- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), -+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), - op->write_point, - &op->devs_have, - op->nr_replicas, -@@ -1489,16 +1558,12 @@ static void __bch2_write(struct bch_write_op *op) - bch2_alloc_sectors_done_inlined(c, wp); - err: - if (ret <= 0) { -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - - if (unlikely(ret < 0)) { -- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error(&buf, op); -- prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); -- bch_err_ratelimited(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } -+ if (!(op->flags & BCH_WRITE_alloc_nowait)) -+ bch2_write_op_error(op, op->pos.offset, -+ "%s(): %s", __func__, bch2_err_str(ret)); - op->error = ret; - break; - } -@@ -1524,14 +1589,14 @@ static void __bch2_write(struct bch_write_op *op) - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. - */ -- if ((op->flags & BCH_WRITE_SYNC) || -- (!(op->flags & BCH_WRITE_SUBMITTED) && -- !(op->flags & BCH_WRITE_IN_WORKER))) { -+ if ((op->flags & BCH_WRITE_sync) || -+ (!(op->flags & BCH_WRITE_submitted) && -+ !(op->flags & BCH_WRITE_in_worker))) { - bch2_wait_on_allocator(c, &op->cl); - - __bch2_write_index(op); - -- if (!(op->flags & BCH_WRITE_SUBMITTED)) -+ if (!(op->flags & BCH_WRITE_submitted)) - goto again; - bch2_write_done(&op->cl); - } else { -@@ -1552,8 +1617,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) - - memset(&op->failed, 0, sizeof(op->failed)); - -- op->flags |= BCH_WRITE_WROTE_DATA_INLINE; -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_wrote_data_inline; -+ op->flags |= BCH_WRITE_submitted; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - -@@ -1616,8 +1681,10 @@ CLOSURE_CALLBACK(bch2_write) - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - -- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) -- op->flags |= BCH_WRITE_ALLOC_NOWAIT; -+ async_object_list_add(c, write_op, op, &op->list_idx); -+ -+ if (op->flags & BCH_WRITE_only_specified_devs) -+ op->flags |= BCH_WRITE_alloc_nowait; - - op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); - op->start_time = local_clock(); -@@ -1625,11 +1692,8 @@ CLOSURE_CALLBACK(bch2_write) - wbio_init(bio)->put_bio = false; - - if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { -- struct printbuf buf = PRINTBUF; -- bch2_write_op_error(&buf, op); -- prt_printf(&buf, "misaligned write"); -- printbuf_exit(&buf); -- op->error = -EIO; -+ bch2_write_op_error(op, op->pos.offset, "misaligned write"); -+ op->error = -BCH_ERR_data_write_misaligned; - goto err; - } - -@@ -1638,13 +1702,14 @@ CLOSURE_CALLBACK(bch2_write) - goto err; - } - -- if (!(op->flags & BCH_WRITE_MOVE) && -- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { -+ if (!(op->flags & BCH_WRITE_move) && -+ !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; - goto err; - } - -- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); -+ if (!(op->flags & BCH_WRITE_move)) -+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, -@@ -1662,6 +1727,7 @@ CLOSURE_CALLBACK(bch2_write) - bch2_disk_reservation_put(c, &op->res); - - closure_debug_destroy(&op->cl); -+ async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); - } -@@ -1675,27 +1741,33 @@ static const char * const bch2_write_flags[] = { - - void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) - { -- prt_str(out, "pos: "); -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 32); -+ -+ prt_printf(out, "pos:\t"); - bch2_bpos_to_text(out, op->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - -- prt_str(out, "started: "); -+ prt_printf(out, "started:\t"); - bch2_pr_time_units(out, local_clock() - op->start_time); - prt_newline(out); - -- prt_str(out, "flags: "); -+ prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_write_flags, op->flags); - prt_newline(out); - -- prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); -+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); -+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); -+ -+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); -+ prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); - - printbuf_indent_sub(out, 2); - } - - void bch2_fs_io_write_exit(struct bch_fs *c) - { -- mempool_exit(&c->bio_bounce_pages); - bioset_exit(&c->replica_set); - bioset_exit(&c->bio_write); - } -@@ -1706,12 +1778,5 @@ int bch2_fs_io_write_init(struct bch_fs *c) - bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return -BCH_ERR_ENOMEM_bio_write_init; - -- if (mempool_init_page_pool(&c->bio_bounce_pages, -- max_t(unsigned, -- c->opts.btree_node_size, -- c->opts.encoded_extent_max) / -- PAGE_SIZE, 0)) -- return -BCH_ERR_ENOMEM_bio_bounce_pages_init; -- - return 0; - } -diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h -index b4626013abc8..2c0a8f35ee1f 100644 ---- a/fs/bcachefs/io_write.h -+++ b/fs/bcachefs/io_write.h -@@ -11,45 +11,11 @@ - void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); - void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - --#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT --void bch2_latency_acct(struct bch_dev *, u64, int); --#else --static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} --#endif -- - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - --void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); -- --#define BCH_WRITE_FLAGS() \ -- x(ALLOC_NOWAIT) \ -- x(CACHED) \ -- x(DATA_ENCODED) \ -- x(PAGES_STABLE) \ -- x(PAGES_OWNED) \ -- x(ONLY_SPECIFIED_DEVS) \ -- x(WROTE_DATA_INLINE) \ -- x(FROM_INTERNAL) \ -- x(CHECK_ENOSPC) \ -- x(SYNC) \ -- x(MOVE) \ -- x(IN_WORKER) \ -- x(SUBMITTED) \ -- x(IO_ERROR) \ -- x(CONVERT_UNWRITTEN) -- --enum __bch_write_flags { --#define x(f) __BCH_WRITE_##f, -- BCH_WRITE_FLAGS() --#undef x --}; -- --enum bch_write_flags { --#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), -- BCH_WRITE_FLAGS() --#undef x --}; -+__printf(3, 4) -+void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); - - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) - { -diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h -index 6e878a6f2f0b..5da4eb8bb6f6 100644 ---- a/fs/bcachefs/io_write_types.h -+++ b/fs/bcachefs/io_write_types.h -@@ -13,6 +13,34 @@ - #include - #include - -+#define BCH_WRITE_FLAGS() \ -+ x(alloc_nowait) \ -+ x(cached) \ -+ x(data_encoded) \ -+ x(pages_stable) \ -+ x(pages_owned) \ -+ x(only_specified_devs) \ -+ x(wrote_data_inline) \ -+ x(check_enospc) \ -+ x(sync) \ -+ x(move) \ -+ x(in_worker) \ -+ x(submitted) \ -+ x(io_error) \ -+ x(convert_unwritten) -+ -+enum __bch_write_flags { -+#define x(f) __BCH_WRITE_##f, -+ BCH_WRITE_FLAGS() -+#undef x -+}; -+ -+enum bch_write_flags { -+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), -+ BCH_WRITE_FLAGS() -+#undef x -+}; -+ - struct bch_write_bio { - struct_group(wbio, - struct bch_fs *c; -@@ -43,6 +71,10 @@ struct bch_write_op { - void (*end_io)(struct bch_write_op *); - u64 start_time; - -+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -+ unsigned list_idx; -+#endif -+ - unsigned written; /* sectors */ - u16 flags; - s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ -@@ -64,7 +96,7 @@ struct bch_write_op { - struct bpos pos; - struct bversion version; - -- /* For BCH_WRITE_DATA_ENCODED: */ -+ /* For BCH_WRITE_data_encoded: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -index 05b1250619ec..f2963a6cca88 100644 ---- a/fs/bcachefs/journal.c -+++ b/fs/bcachefs/journal.c -@@ -12,6 +12,7 @@ - #include "btree_update.h" - #include "btree_write_buffer.h" - #include "buckets.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "journal.h" - #include "journal_io.h" -@@ -20,13 +21,6 @@ - #include "journal_seq_blacklist.h" - #include "trace.h" - --static const char * const bch2_journal_errors[] = { --#define x(n) #n, -- JOURNAL_ERRORS() --#undef x -- NULL --}; -- - static inline bool journal_seq_unwritten(struct journal *j, u64 seq) - { - return seq > j->seq_ondisk; -@@ -56,14 +50,20 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 - prt_printf(out, "seq:\t%llu\n", seq); - printbuf_indent_add(out, 2); - -- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); -+ if (!buf->write_started) -+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - -- prt_printf(out, "size:\t"); -- prt_human_readable_u64(out, vstruct_bytes(buf->data)); -- prt_newline(out); -+ struct closure *cl = &buf->io; -+ int r = atomic_read(&cl->remaining); -+ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); -+ -+ if (buf->data) { -+ prt_printf(out, "size:\t"); -+ prt_human_readable_u64(out, vstruct_bytes(buf->data)); -+ prt_newline(out); -+ } - -- prt_printf(out, "expires:\t"); -- prt_printf(out, "%li jiffies\n", buf->expires - jiffies); -+ prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); - - prt_printf(out, "flags:\t"); - if (buf->noflush) -@@ -87,6 +87,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 - - static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) - { -+ lockdep_assert_held(&j->lock); -+ out->atomic++; -+ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); - -@@ -95,6 +98,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) - seq++) - bch2_journal_buf_to_text(out, j, seq); - prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); -+ -+ --out->atomic; - } - - static inline struct journal_buf * -@@ -104,10 +109,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) - - EBUG_ON(seq > journal_cur_seq(j)); - -- if (journal_seq_unwritten(j, seq)) { -+ if (journal_seq_unwritten(j, seq)) - buf = j->buf + (seq & JOURNAL_BUF_MASK); -- EBUG_ON(le64_to_cpu(buf->data->seq) != seq); -- } - return buf; - } - -@@ -139,8 +142,10 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) - bool stuck = false; - struct printbuf buf = PRINTBUF; - -- if (!(error == JOURNAL_ERR_journal_full || -- error == JOURNAL_ERR_journal_pin_full) || -+ buf.atomic++; -+ -+ if (!(error == -BCH_ERR_journal_full || -+ error == -BCH_ERR_journal_pin_full) || - nr_unwritten_journal_entries(j) || - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) - return stuck; -@@ -164,12 +169,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) - return stuck; - } - j->err_seq = journal_cur_seq(j); -- spin_unlock(&j->lock); - -- bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", -- bch2_journal_errors[error]); -- bch2_journal_debug_to_text(&buf, j); -- bch_err(c, "%s", buf.buf); -+ __bch2_journal_debug_to_text(&buf, j); -+ spin_unlock(&j->lock); -+ prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), -+ bch2_err_str(error)); -+ bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); -@@ -195,7 +200,8 @@ void bch2_journal_do_writes(struct journal *j) - if (w->write_started) - continue; - -- if (!journal_state_count(j->reservations, idx)) { -+ if (!journal_state_seq_count(j, j->reservations, seq)) { -+ j->seq_write_started = seq; - w->write_started = true; - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } -@@ -276,7 +282,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t - - sectors = vstruct_blocks_plus(buf->data, c->block_bits, - buf->u64s_reserved) << c->block_bits; -- BUG_ON(sectors > buf->sectors); -+ if (unlikely(sectors > buf->sectors)) { -+ struct printbuf err = PRINTBUF; -+ err.atomic++; -+ -+ prt_printf(&err, "journal entry overran reserved space: %u > %u\n", -+ sectors, buf->sectors); -+ prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", -+ le32_to_cpu(buf->data->u64s), buf->u64s_reserved, -+ j->cur_entry_u64s, -+ c->block_bits); -+ prt_printf(&err, "fatal error - emergency read only"); -+ bch2_journal_halt_locked(j); -+ -+ bch_err(c, "%s", err.buf); -+ printbuf_exit(&err); -+ return; -+ } -+ - buf->sectors = sectors; - - /* -@@ -306,17 +329,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t - - bch2_journal_space_available(j); - -- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); --} -- --void bch2_journal_halt(struct journal *j) --{ -- spin_lock(&j->lock); -- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); -- if (!j->err_seq) -- j->err_seq = journal_cur_seq(j); -- journal_wake(j); -- spin_unlock(&j->lock); -+ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); - } - - void bch2_journal_halt_locked(struct journal *j) -@@ -329,6 +342,13 @@ void bch2_journal_halt_locked(struct journal *j) - journal_wake(j); - } - -+void bch2_journal_halt(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ bch2_journal_halt_locked(j); -+ spin_unlock(&j->lock); -+} -+ - static bool journal_entry_want_write(struct journal *j) - { - bool ret = !journal_entry_is_open(j) || -@@ -377,29 +397,41 @@ static int journal_entry_open(struct journal *j) - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - - if (j->blocked) -- return JOURNAL_ERR_blocked; -+ return -BCH_ERR_journal_blocked; - - if (j->cur_entry_error) - return j->cur_entry_error; - -- if (bch2_journal_error(j)) -- return JOURNAL_ERR_insufficient_devices; /* -EROFS */ -+ int ret = bch2_journal_error(j); -+ if (unlikely(ret)) -+ return ret; - - if (!fifo_free(&j->pin)) -- return JOURNAL_ERR_journal_pin_full; -+ return -BCH_ERR_journal_pin_full; - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) -- return JOURNAL_ERR_max_in_flight; -+ return -BCH_ERR_journal_max_in_flight; -+ -+ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) -+ return -BCH_ERR_journal_max_open; - - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { - bch_err(c, "cannot start: journal seq overflow"); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); -- return JOURNAL_ERR_insufficient_devices; /* -EROFS */ -+ return -BCH_ERR_journal_shutdown; - } - -+ if (!j->free_buf && !buf->data) -+ return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ -+ - BUG_ON(!j->cur_entry_sectors); - -+ if (!buf->data) { -+ swap(buf->data, j->free_buf); -+ swap(buf->buf_size, j->free_buf_size); -+ } -+ - buf->expires = - (journal_cur_seq(j) == j->flushed_seq_ondisk - ? jiffies -@@ -415,7 +447,7 @@ static int journal_entry_open(struct journal *j) - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - - if (u64s <= (ssize_t) j->early_journal_entries.nr) -- return JOURNAL_ERR_journal_full; -+ return -BCH_ERR_journal_full; - - if (fifo_empty(&j->pin) && j->reclaim_thread) - wake_up_process(j->reclaim_thread); -@@ -464,7 +496,7 @@ static int journal_entry_open(struct journal *j) - - new.idx++; - BUG_ON(journal_state_count(new, new.idx)); -- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); -+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); - - journal_state_inc(&new); - -@@ -514,6 +546,33 @@ static void journal_write_work(struct work_struct *work) - spin_unlock(&j->lock); - } - -+static void journal_buf_prealloc(struct journal *j) -+{ -+ if (j->free_buf && -+ j->free_buf_size >= j->buf_size_want) -+ return; -+ -+ unsigned buf_size = j->buf_size_want; -+ -+ spin_unlock(&j->lock); -+ void *buf = kvmalloc(buf_size, GFP_NOFS); -+ spin_lock(&j->lock); -+ -+ if (buf && -+ (!j->free_buf || -+ buf_size > j->free_buf_size)) { -+ swap(buf, j->free_buf); -+ swap(buf_size, j->free_buf_size); -+ } -+ -+ if (unlikely(buf)) { -+ spin_unlock(&j->lock); -+ /* kvfree can sleep */ -+ kvfree(buf); -+ spin_lock(&j->lock); -+ } -+} -+ - static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned flags) - { -@@ -525,25 +584,28 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - if (journal_res_get_fast(j, res, flags)) - return 0; - -- if (bch2_journal_error(j)) -- return -BCH_ERR_erofs_journal_err; -+ ret = bch2_journal_error(j); -+ if (unlikely(ret)) -+ return ret; - - if (j->blocked) -- return -BCH_ERR_journal_res_get_blocked; -+ return -BCH_ERR_journal_blocked; - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { -- ret = JOURNAL_ERR_journal_full; -+ ret = -BCH_ERR_journal_full; - can_discard = j->can_discard; - goto out; - } - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { -- ret = JOURNAL_ERR_max_in_flight; -+ ret = -BCH_ERR_journal_max_in_flight; - goto out; - } - - spin_lock(&j->lock); - -+ journal_buf_prealloc(j); -+ - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call bch2_journal_entry_close() -@@ -566,25 +628,48 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); -- ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; -+ ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; - unlock: - can_discard = j->can_discard; - spin_unlock(&j->lock); - out: -- if (ret == JOURNAL_ERR_retry) -- goto retry; -- if (!ret) -+ if (likely(!ret)) - return 0; -+ if (ret == -BCH_ERR_journal_retry_open) -+ goto retry; - - if (journal_error_check_stuck(j, ret, flags)) -- ret = -BCH_ERR_journal_res_get_blocked; -+ ret = -BCH_ERR_journal_stuck; -+ -+ if (ret == -BCH_ERR_journal_max_in_flight && -+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && -+ trace_journal_entry_full_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_printbuf_make_room(&buf, 4096); - -- if (ret == JOURNAL_ERR_max_in_flight && -- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { -+ spin_lock(&j->lock); -+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); -+ bch2_journal_bufs_to_text(&buf, j); -+ spin_unlock(&j->lock); -+ -+ trace_journal_entry_full(c, buf.buf); -+ printbuf_exit(&buf); -+ count_event(c, journal_entry_full); -+ } - -+ if (ret == -BCH_ERR_journal_max_open && -+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && -+ trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; -+ -+ bch2_printbuf_make_room(&buf, 4096); -+ -+ spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); -+ spin_unlock(&j->lock); -+ - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); -@@ -594,8 +679,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ -- if ((ret == JOURNAL_ERR_journal_full || -- ret == JOURNAL_ERR_journal_pin_full) && -+ if ((ret == -BCH_ERR_journal_full || -+ ret == -BCH_ERR_journal_pin_full) && - !(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); -@@ -608,17 +693,17 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - } - } - -- return ret == JOURNAL_ERR_insufficient_devices -- ? -BCH_ERR_erofs_journal_err -- : -BCH_ERR_journal_res_get_blocked; -+ return ret; - } - - static unsigned max_dev_latency(struct bch_fs *c) - { - u64 nsecs = 0; - -- for_each_rw_member(c, ca) -+ rcu_read_lock(); -+ for_each_rw_member_rcu(c, ca) - nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); -+ rcu_read_unlock(); - - return nsecs_to_jiffies(nsecs); - } -@@ -640,7 +725,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - int ret; - - if (closure_wait_event_timeout(&j->async_wait, -- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || -+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - HZ)) - return ret; -@@ -654,19 +739,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - remaining_wait = max(0, remaining_wait - HZ); - - if (closure_wait_event_timeout(&j->async_wait, -- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || -+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - remaining_wait)) - return ret; - - struct printbuf buf = PRINTBUF; - bch2_journal_debug_to_text(&buf, j); -- bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", -- buf.buf); -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); - printbuf_exit(&buf); - - closure_wait_event(&j->async_wait, -- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || -+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK)); - return ret; - } -@@ -687,7 +772,6 @@ void bch2_journal_entry_res_resize(struct journal *j, - goto out; - - j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); -- smp_mb(); - state = READ_ONCE(j->reservations); - - if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && -@@ -906,11 +990,11 @@ int bch2_journal_meta(struct journal *j) - { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) -- return -EROFS; -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) -+ return -BCH_ERR_erofs_no_writes; - - int ret = __bch2_journal_meta(j); -- bch2_write_ref_put(c, BCH_WRITE_REF_journal); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); - return ret; - } - -@@ -951,7 +1035,8 @@ static void __bch2_journal_block(struct journal *j) - new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - -- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) -+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); - } - } - -@@ -992,7 +1077,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou - *blocked = true; - } - -- ret = journal_state_count(s, idx) > open -+ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open - ? ERR_PTR(-EAGAIN) - : buf; - break; -@@ -1213,6 +1298,16 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - - int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) - { -+ struct bch_fs *c = ca->fs; -+ -+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) -+ return 0; -+ -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { -+ bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); -+ return -BCH_ERR_erofs_filesystem_full; -+ } -+ - unsigned nr; - int ret; - -@@ -1233,7 +1328,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) - min(1 << 13, - (1 << 24) / ca->mi.bucket_size)); - -- ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); -+ ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); - err: - bch_err_fn(ca, ret); - return ret; -@@ -1241,13 +1336,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) - - int bch2_fs_journal_alloc(struct bch_fs *c) - { -- for_each_online_member(c, ca) { -+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { - if (ca->journal.nr) - continue; - - int ret = bch2_dev_journal_alloc(ca, true); - if (ret) { -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_fs_journal_alloc); - return ret; - } - } -@@ -1336,19 +1432,26 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) - - nr = cur_seq - last_seq; - -- if (nr + 1 > j->pin.size) { -- free_fifo(&j->pin); -- init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); -- if (!j->pin.data) { -- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); -- return -BCH_ERR_ENOMEM_journal_pin_fifo; -- } -+ /* -+ * Extra fudge factor, in case we crashed when the journal pin fifo was -+ * nearly or completely full. We'll need to be able to open additional -+ * journal entries (at least a few) in order for journal replay to get -+ * going: -+ */ -+ nr += nr / 4; -+ -+ nr = max(nr, JOURNAL_PIN); -+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); -+ if (!j->pin.data) { -+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); -+ return -BCH_ERR_ENOMEM_journal_pin_fifo; - } - - j->replay_journal_seq = last_seq; - j->replay_journal_seq_end = cur_seq; - j->last_seq_ondisk = last_seq; - j->flushed_seq_ondisk = cur_seq - 1; -+ j->seq_write_started = cur_seq - 1; - j->seq_ondisk = cur_seq - 1; - j->pin.front = last_seq; - j->pin.back = cur_seq; -@@ -1385,19 +1488,29 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) - j->last_empty_seq = cur_seq - 1; /* to match j->seq */ - - spin_lock(&j->lock); -- -- set_bit(JOURNAL_running, &j->flags); - j->last_flush_write = jiffies; - -- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); -- j->reservations.unwritten_idx++; -+ j->reservations.idx = journal_cur_seq(j); - - c->last_bucket_seq_cleanup = journal_cur_seq(j); -+ spin_unlock(&j->lock); - -+ return 0; -+} -+ -+void bch2_journal_set_replay_done(struct journal *j) -+{ -+ /* -+ * journal_space_available must happen before setting JOURNAL_running -+ * JOURNAL_running must happen before JOURNAL_replay_done -+ */ -+ spin_lock(&j->lock); - bch2_journal_space_available(j); -- spin_unlock(&j->lock); - -- return bch2_journal_reclaim_start(j); -+ set_bit(JOURNAL_need_flush_write, &j->flags); -+ set_bit(JOURNAL_running, &j->flags); -+ set_bit(JOURNAL_replay_done, &j->flags); -+ spin_unlock(&j->lock); - } - - /* init/exit: */ -@@ -1443,7 +1556,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) - unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { -- ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, -+ ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); - if (!ja->bio[i]) - return -BCH_ERR_ENOMEM_dev_journal_init; -@@ -1482,10 +1595,11 @@ void bch2_fs_journal_exit(struct journal *j) - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvfree(j->buf[i].data); -+ kvfree(j->free_buf); - free_fifo(&j->pin); - } - --int bch2_fs_journal_init(struct journal *j) -+void bch2_fs_journal_init_early(struct journal *j) - { - static struct lock_class_key res_key; - -@@ -1504,19 +1618,17 @@ int bch2_fs_journal_init(struct journal *j) - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); -+} - -- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) -- return -BCH_ERR_ENOMEM_journal_pin_fifo; -+int bch2_fs_journal_init(struct journal *j) -+{ -+ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; -+ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); -+ if (!j->free_buf) -+ return -BCH_ERR_ENOMEM_journal_buf; - -- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { -- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; -- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); -- if (!j->buf[i].data) -- return -BCH_ERR_ENOMEM_journal_buf; -+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - j->buf[i].idx = i; -- } -- -- j->pin.front = j->pin.back = 1; - - j->wq = alloc_workqueue("bcachefs_journal", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); -@@ -1564,6 +1676,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - prt_printf(out, "average write size:\t"); - prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); - prt_newline(out); -+ prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); - prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); -@@ -1571,7 +1684,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t%u\n", j->blocked); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); -- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); -+ prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); - prt_printf(out, "current entry:\t"); - - switch (s.cur_entry_offset) { -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -index 107f7f901cd9..8ff00a0ec778 100644 ---- a/fs/bcachefs/journal.h -+++ b/fs/bcachefs/journal.h -@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) - closure_wake_up(&j->async_wait); - } - --static inline struct journal_buf *journal_cur_buf(struct journal *j) --{ -- return j->buf + j->reservations.idx; --} -- - /* Sequence number of oldest dirty journal entry */ - - static inline u64 journal_last_seq(struct journal *j) -@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) - return j->seq_ondisk + 1; - } - -+static inline struct journal_buf *journal_cur_buf(struct journal *j) -+{ -+ unsigned idx = (journal_cur_seq(j) & -+ JOURNAL_BUF_MASK & -+ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; -+ -+ return j->buf + idx; -+} -+ - static inline int journal_state_count(union journal_res_state s, int idx) - { - switch (idx) { -@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) - BUG(); - } - -+static inline int journal_state_seq_count(struct journal *j, -+ union journal_res_state s, u64 seq) -+{ -+ if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) -+ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); -+ else -+ return 0; -+} -+ - static inline void journal_state_inc(union journal_res_state *s) - { - s->buf0_count += s->idx == 0; -@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) - static inline struct jset_entry * - journal_res_entry(struct journal *j, struct journal_res *res) - { -- return vstruct_idx(j->buf[res->idx].data, res->offset); -+ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); - } - - static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, -@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *); - void bch2_journal_do_writes(struct journal *); - void bch2_journal_buf_put_final(struct journal *, u64); - --static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) -+static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) - { -+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); -@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s - bch2_journal_buf_put_final(j, seq); - } - --static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) -+static inline void bch2_journal_buf_put(struct journal *j, u64 seq) - { -+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); -@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j, - BCH_JSET_ENTRY_btree_keys, - 0, 0, 0); - -- bch2_journal_buf_put(j, res->idx, res->seq); -+ bch2_journal_buf_put(j, res->seq); - - res->ref = 0; - } -@@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j, - - /* - * Check if there is still room in the current journal -- * entry: -+ * entry, smp_rmb() guarantees that reads from reservations.counter -+ * occur before accessing cur_entry_u64s: - */ -+ smp_rmb(); - if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) - return 0; - -@@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j, - &old.v, new.v)); - - res->ref = true; -- res->idx = old.idx; - res->offset = old.cur_entry_offset; -- res->seq = le64_to_cpu(j->buf[old.idx].data->seq); -+ res->seq = journal_cur_seq(j); -+ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; - return 1; - } - -@@ -390,6 +407,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re - (flags & JOURNAL_RES_GET_NONBLOCK) != 0, - NULL, _THIS_IP_); - EBUG_ON(!res->ref); -+ BUG_ON(!res->seq); - } - return 0; - } -@@ -408,8 +426,8 @@ int bch2_journal_flush(struct journal *); - bool bch2_journal_noflush_seq(struct journal *, u64, u64); - int bch2_journal_meta(struct journal *); - --void bch2_journal_halt(struct journal *); - void bch2_journal_halt_locked(struct journal *); -+void bch2_journal_halt(struct journal *); - - static inline int bch2_journal_error(struct journal *j) - { -@@ -419,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j) - - struct bch_dev; - --static inline void bch2_journal_set_replay_done(struct journal *j) --{ -- BUG_ON(!test_bit(JOURNAL_running, &j->flags)); -- set_bit(JOURNAL_replay_done, &j->flags); --} -- - void bch2_journal_unblock(struct journal *); - void bch2_journal_block(struct journal *); - struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); -@@ -441,10 +453,12 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *); - - void bch2_fs_journal_stop(struct journal *); - int bch2_fs_journal_start(struct journal *, u64); -+void bch2_journal_set_replay_done(struct journal *); - - void bch2_dev_journal_exit(struct bch_dev *); - int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); - void bch2_fs_journal_exit(struct journal *); -+void bch2_fs_journal_init_early(struct journal *); - int bch2_fs_journal_init(struct journal *); - - #endif /* _BCACHEFS_JOURNAL_H */ -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -index 11c39e0c34f4..06f7b018492c 100644 ---- a/fs/bcachefs/journal_io.c -+++ b/fs/bcachefs/journal_io.c -@@ -19,6 +19,7 @@ - - #include - #include -+#include - - void bch2_journal_pos_from_member_info_set(struct bch_fs *c) - { -@@ -214,12 +215,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - - fsck_err_on(same_device, - c, journal_entry_dup_same_device, -- "duplicate journal entry on same device\n %s", -+ "duplicate journal entry on same device\n%s", - buf.buf); - - fsck_err_on(not_identical, - c, journal_entry_replicas_data_mismatch, -- "found duplicate but non identical journal entries\n %s", -+ "found duplicate but non identical journal entries\n%s", - buf.buf); - - if (entry_ptr.csum_good && !identical) -@@ -308,8 +309,8 @@ static void journal_entry_err_msg(struct printbuf *out, - break; \ - case WRITE: \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ -- bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ -- if (bch2_fs_inconsistent(c)) { \ -+ if (bch2_fs_inconsistent(c, \ -+ "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ -@@ -764,6 +765,23 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs - journal_entry_btree_keys_to_text(out, c, entry); - } - -+static int journal_entry_log_bkey_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ struct bkey_validate_context from) -+{ -+ from.flags = 0; -+ return journal_entry_btree_keys_validate(c, jset, entry, -+ version, big_endian, from); -+} -+ -+static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ journal_entry_btree_keys_to_text(out, c, entry); -+} -+ - static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, -@@ -1041,13 +1059,19 @@ static int journal_read_bucket(struct bch_dev *ca, - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, buf->data, sectors_read << 9); - -+ u64 submit_time = local_clock(); - ret = submit_bio_wait(bio); - kfree(bio); - -- if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, -- "journal read error: sector %llu", -- offset) || -- bch2_meta_read_fault("journal")) { -+ if (!ret && bch2_meta_read_fault("journal")) -+ ret = -BCH_ERR_EIO_fault_injected; -+ -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, -+ submit_time, !ret); -+ -+ if (ret) { -+ bch_err_dev_ratelimited(ca, -+ "journal read error: sector %llu", offset); - /* - * We don't error out of the recovery process - * here, since the relevant journal entry may be -@@ -1110,13 +1134,16 @@ static int journal_read_bucket(struct bch_dev *ca, - struct bch_csum csum; - csum_good = jset_csum_good(c, j, &csum); - -- if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, -- "%s", -- (printbuf_reset(&err), -- prt_str(&err, "journal "), -- bch2_csum_err_msg(&err, csum_type, j->csum, csum), -- err.buf))) -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); -+ -+ if (!csum_good) { -+ bch_err_dev_ratelimited(ca, "%s", -+ (printbuf_reset(&err), -+ prt_str(&err, "journal "), -+ bch2_csum_err_msg(&err, csum_type, j->csum, csum), -+ err.buf)); - saw_bad = true; -+ } - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, -@@ -1192,7 +1219,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) - out: - bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvfree(buf.data); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); - closure_return(cl); - return; - err: -@@ -1227,7 +1254,8 @@ int bch2_journal_read(struct bch_fs *c, - - if ((ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro) && -- percpu_ref_tryget(&ca->io_ref)) -+ enumerated_ref_tryget(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_journal_read)) - closure_call(&ca->journal.read, - bch2_journal_read_device, - system_unbound_wq, -@@ -1236,7 +1264,8 @@ int bch2_journal_read(struct bch_fs *c, - degraded = true; - } - -- closure_sync(&jlist.cl); -+ while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) -+ ; - - if (jlist.ret) - return jlist.ret; -@@ -1362,8 +1391,8 @@ int bch2_journal_read(struct bch_fs *c, - missing_end = seq - 1; - fsck_err(c, journal_entries_missing, - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" -- " prev at %s\n" -- " next at %s, continue?", -+ "prev at %s\n" -+ "next at %s, continue?", - missing_start, missing_end, - *last_seq, *blacklist_seq - 1, - buf1.buf, buf2.buf); -@@ -1377,7 +1406,7 @@ int bch2_journal_read(struct bch_fs *c, - } - - genradix_for_each(&c->journal_entries, radix_iter, _i) { -- struct bch_replicas_padded replicas = { -+ union bch_replicas_padded replicas = { - .e.data_type = BCH_DATA_journal, - .e.nr_devs = 0, - .e.nr_required = 1, -@@ -1417,7 +1446,7 @@ int bch2_journal_read(struct bch_fs *c, - !bch2_replicas_marked(c, &replicas.e) && - (le64_to_cpu(i->j.seq) == *last_seq || - fsck_err(c, journal_entry_replicas_not_marked, -- "superblock not marked as containing replicas for journal entry %llu\n %s", -+ "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf))) { - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) -@@ -1434,10 +1463,11 @@ int bch2_journal_read(struct bch_fs *c, - - static void journal_advance_devs_to_next_bucket(struct journal *j, - struct dev_alloc_list *devs, -- unsigned sectors, u64 seq) -+ unsigned sectors, __le64 seq) - { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - -+ rcu_read_lock(); - darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) -@@ -1459,6 +1489,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); - } - } -+ rcu_read_unlock(); - } - - static void __journal_write_alloc(struct journal *j, -@@ -1471,7 +1502,8 @@ static void __journal_write_alloc(struct journal *j, - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - darray_for_each(*devs, i) { -- struct bch_dev *ca = rcu_dereference(c->devs[*i]); -+ struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, -+ BCH_DEV_WRITE_REF_journal_write); - if (!ca) - continue; - -@@ -1485,8 +1517,10 @@ static void __journal_write_alloc(struct journal *j, - ca->mi.state != BCH_MEMBER_STATE_rw || - !ja->nr || - bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || -- sectors > ja->sectors_free) -+ sectors > ja->sectors_free) { -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - continue; -+ } - - bch2_dev_stripe_increment(ca, &j->wp.stripe); - -@@ -1509,15 +1543,8 @@ static void __journal_write_alloc(struct journal *j, - } - } - --/** -- * journal_write_alloc - decide where to write next journal entry -- * -- * @j: journal object -- * @w: journal buf (entry to be written) -- * -- * Returns: 0 on success, or -EROFS on failure -- */ --static int journal_write_alloc(struct journal *j, struct journal_buf *w) -+static int journal_write_alloc(struct journal *j, struct journal_buf *w, -+ unsigned *replicas) - { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_devs_mask devs; -@@ -1525,29 +1552,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - unsigned target = c->opts.metadata_target ?: - c->opts.foreground_target; -- unsigned replicas = 0, replicas_want = -- READ_ONCE(c->opts.metadata_replicas); -+ unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); - unsigned replicas_need = min_t(unsigned, replicas_want, - READ_ONCE(c->opts.metadata_replicas_required)); - bool advance_done = false; - -- rcu_read_lock(); -- -- /* We might run more than once if we have to stop and do discards: */ -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); -- bkey_for_each_ptr(ptrs, p) { -- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); -- if (ca) -- replicas += ca->mi.durability; -- } -- - retry_target: - devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - retry_alloc: -- __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); -+ __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - -- if (likely(replicas >= replicas_want)) -+ if (likely(*replicas >= replicas_want)) - goto done; - - if (!advance_done) { -@@ -1556,18 +1572,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) - goto retry_alloc; - } - -- if (replicas < replicas_want && target) { -+ if (*replicas < replicas_want && target) { - /* Retry from all devices: */ - target = 0; - advance_done = false; - goto retry_target; - } - done: -- rcu_read_unlock(); -- - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - -- return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; -+ return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; - } - - static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -@@ -1600,18 +1614,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) - kvfree(new_buf); - } - --static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) --{ -- return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); --} -- - static CLOSURE_CALLBACK(journal_write_done) - { - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); -- struct bch_replicas_padded replicas; -- union journal_res_state old, new; -+ union bch_replicas_padded replicas; - u64 seq = le64_to_cpu(w->data->seq); - int err = 0; - -@@ -1620,17 +1628,27 @@ static CLOSURE_CALLBACK(journal_write_done) - : j->noflush_write_time, j->write_start_time); - - if (!w->devs_written.nr) { -- bch_err(c, "unable to write journal to sufficient devices"); -- err = -EIO; -+ err = -BCH_ERR_journal_write_err; - } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); -- if (bch2_mark_replicas(c, &replicas.e)) -- err = -EIO; -+ err = bch2_mark_replicas(c, &replicas.e); - } - -- if (err) -- bch2_fatal_error(c); -+ if (err && !bch2_journal_error(j)) { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ if (err == -BCH_ERR_journal_write_err) -+ prt_printf(&buf, "unable to write journal to sufficient devices"); -+ else -+ prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); -+ -+ bch2_fs_emergency_read_only2(c, &buf); -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ } - - closure_debug_destroy(cl); - -@@ -1641,7 +1659,23 @@ static CLOSURE_CALLBACK(journal_write_done) - j->err_seq = seq; - w->write_done = true; - -+ if (!j->free_buf || j->free_buf_size < w->buf_size) { -+ swap(j->free_buf, w->data); -+ swap(j->free_buf_size, w->buf_size); -+ } -+ -+ if (w->data) { -+ void *buf = w->data; -+ w->data = NULL; -+ w->buf_size = 0; -+ -+ spin_unlock(&j->lock); -+ kvfree(buf); -+ spin_lock(&j->lock); -+ } -+ - bool completed = false; -+ bool do_discards = false; - - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); -@@ -1650,11 +1684,10 @@ static CLOSURE_CALLBACK(journal_write_done) - if (!w->write_done) - break; - -- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { -+ if (!j->err_seq && !w->noflush) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - -- bch2_do_discards(c); - closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); - } -@@ -1671,16 +1704,6 @@ static CLOSURE_CALLBACK(journal_write_done) - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); - -- old.v = atomic64_read(&j->reservations.counter); -- do { -- new.v = old.v; -- BUG_ON(journal_state_count(new, new.unwritten_idx)); -- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); -- -- new.unwritten_idx++; -- } while (!atomic64_try_cmpxchg(&j->reservations.counter, -- &old.v, new.v)); -- - closure_wake_up(&w->wait); - completed = true; - } -@@ -1695,7 +1718,7 @@ static CLOSURE_CALLBACK(journal_write_done) - } - - if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && -- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { -+ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { - struct journal_buf *buf = journal_cur_buf(j); - long delta = buf->expires - jiffies; - -@@ -1715,6 +1738,9 @@ static CLOSURE_CALLBACK(journal_write_done) - */ - bch2_journal_do_writes(j); - spin_unlock(&j->lock); -+ -+ if (do_discards) -+ bch2_do_discards(c); - } - - static void journal_write_endio(struct bio *bio) -@@ -1724,20 +1750,23 @@ static void journal_write_endio(struct bio *bio) - struct journal *j = &ca->fs->journal; - struct journal_buf *w = j->buf + jbio->buf_idx; - -- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, -+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, -+ jbio->submit_time, !bio->bi_status); -+ -+ if (bio->bi_status) { -+ bch_err_dev_ratelimited(ca, - "error writing journal entry %llu: %s", - le64_to_cpu(w->data->seq), -- bch2_blk_status_to_str(bio->bi_status)) || -- bch2_meta_write_fault("journal")) { -- unsigned long flags; -+ bch2_blk_status_to_str(bio->bi_status)); - -+ unsigned long flags; - spin_lock_irqsave(&j->err_lock, flags); - bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); - spin_unlock_irqrestore(&j->err_lock, flags); - } - - closure_put(&w->io); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - } - - static CLOSURE_CALLBACK(journal_write_submit) -@@ -1748,18 +1777,17 @@ static CLOSURE_CALLBACK(journal_write_submit) - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { -- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); -- if (!ca) { -- /* XXX: fix this */ -- bch_err(c, "missing device for journal write\n"); -- continue; -- } -+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); - - struct journal_device *ja = &ca->journal; -- struct bio *bio = &ja->bio[w->idx]->bio; -+ struct journal_bio *jbio = ja->bio[w->idx]; -+ struct bio *bio = &jbio->bio; -+ -+ jbio->submit_time = local_clock(); -+ - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; -@@ -1791,6 +1819,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - -+ /* -+ * Wait for previous journal writes to comelete; they won't necessarily -+ * be flushed if they're still in flight -+ */ - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - spin_lock(&j->lock); - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { -@@ -1803,8 +1835,9 @@ static CLOSURE_CALLBACK(journal_write_preflush) - } - - if (w->separate_flush) { -- for_each_rw_member(c, ca) { -- percpu_ref_get(&ca->io_ref); -+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { -+ enumerated_ref_get(&ca->io_ref[WRITE], -+ BCH_DEV_WRITE_REF_journal_write); - - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; -@@ -1984,7 +2017,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * - * write anything at all. - */ - if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) -- return -EIO; -+ return error; - - if (error || - w->noflush || -@@ -2013,13 +2046,10 @@ CLOSURE_CALLBACK(bch2_journal_write) - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); -- struct bch_replicas_padded replicas; -- unsigned nr_rw_members = 0; -+ union bch_replicas_padded replicas; -+ unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); - int ret; - -- for_each_rw_member(c, ca) -- nr_rw_members++; -- - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - BUG_ON(!w->write_started); - BUG_ON(w->write_allocated); -@@ -2033,7 +2063,8 @@ CLOSURE_CALLBACK(bch2_journal_write) - - ret = bch2_journal_write_pick_flush(j, w); - spin_unlock(&j->lock); -- if (ret) -+ -+ if (unlikely(ret)) - goto err; - - mutex_lock(&j->buf_lock); -@@ -2041,43 +2072,30 @@ CLOSURE_CALLBACK(bch2_journal_write) - - ret = bch2_journal_write_prep(j, w); - mutex_unlock(&j->buf_lock); -- if (ret) -- goto err; - -- j->entry_bytes_written += vstruct_bytes(w->data); -+ if (unlikely(ret)) -+ goto err; - -+ unsigned replicas_allocated = 0; - while (1) { -- spin_lock(&j->lock); -- ret = journal_write_alloc(j, w); -+ ret = journal_write_alloc(j, w, &replicas_allocated); - if (!ret || !j->can_discard) - break; - -- spin_unlock(&j->lock); - bch2_journal_do_discards(j); - } - -- if (ret && !bch2_journal_error(j)) { -- struct printbuf buf = PRINTBUF; -- buf.atomic++; -- -- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), -- le64_to_cpu(w->data->seq), -- vstruct_sectors(w->data, c->block_bits), -- bch2_err_str(ret)); -- __bch2_journal_debug_to_text(&buf, j); -- spin_unlock(&j->lock); -- bch2_print_string_as_lines(KERN_ERR, buf.buf); -- printbuf_exit(&buf); -- } -- if (ret) -- goto err; -+ if (unlikely(ret)) -+ goto err_allocate_write; - -+ spin_lock(&j->lock); - /* - * write is allocated, no longer need to account for it in - * bch2_journal_space_available(): - */ - w->sectors = 0; - w->write_allocated = true; -+ j->entry_bytes_written += vstruct_bytes(w->data); - - /* - * journal entry has been compacted and allocated, recalculate space -@@ -2089,9 +2107,6 @@ CLOSURE_CALLBACK(bch2_journal_write) - - w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - -- if (c->opts.nochanges) -- goto no_io; -- - /* - * Mark journal replicas before we submit the write to guarantee - * recovery will find the journal entries after a crash. -@@ -2102,15 +2117,33 @@ CLOSURE_CALLBACK(bch2_journal_write) - if (ret) - goto err; - -+ if (c->opts.nochanges) -+ goto no_io; -+ - if (!JSET_NO_FLUSH(w->data)) - continue_at(cl, journal_write_preflush, j->wq); - else - continue_at(cl, journal_write_submit, j->wq); - return; --no_io: -- continue_at(cl, journal_write_done, j->wq); -- return; -+err_allocate_write: -+ if (!bch2_journal_error(j)) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_journal_debug_to_text(&buf, j); -+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), -+ le64_to_cpu(w->data->seq), -+ vstruct_sectors(w->data, c->block_bits), -+ bch2_err_str(ret)); -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ } - err: - bch2_fatal_error(c); -+no_io: -+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { -+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); -+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); -+ } -+ - continue_at(cl, journal_write_done, j->wq); - } -diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h -index 12b39fcb4424..ffa543424e9e 100644 ---- a/fs/bcachefs/journal_io.h -+++ b/fs/bcachefs/journal_io.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_JOURNAL_IO_H - #define _BCACHEFS_JOURNAL_IO_H - --#include "darray.h" -+#include - - void bch2_journal_pos_from_member_info_set(struct bch_fs *); - void bch2_journal_pos_from_member_info_resume(struct bch_fs *); -diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -index d373cd181a7f..70f36f6bc482 100644 ---- a/fs/bcachefs/journal_reclaim.c -+++ b/fs/bcachefs/journal_reclaim.c -@@ -17,6 +17,8 @@ - #include - #include - -+static bool __should_discard_bucket(struct journal *, struct journal_device *); -+ - /* Free space calculations: */ - - static unsigned journal_space_from(struct journal_device *ja, -@@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j) - ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - -- if (ja->discard_idx != ja->dirty_idx_ondisk) -- can_discard = true; -+ can_discard |= __should_discard_bucket(j, ja); - - max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); - nr_online++; -@@ -214,19 +215,21 @@ void bch2_journal_space_available(struct journal *j) - j->can_discard = can_discard; - - if (nr_online < metadata_replicas_required(c)) { -- struct printbuf buf = PRINTBUF; -- buf.atomic++; -- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" -- "rw journal devs:", nr_online, metadata_replicas_required(c)); -- -- rcu_read_lock(); -- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) -- prt_printf(&buf, " %s", ca->name); -- rcu_read_unlock(); -- -- bch_err(c, "%s", buf.buf); -- printbuf_exit(&buf); -- ret = JOURNAL_ERR_insufficient_devices; -+ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { -+ struct printbuf buf = PRINTBUF; -+ buf.atomic++; -+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" -+ "rw journal devs:", nr_online, metadata_replicas_required(c)); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) -+ prt_printf(&buf, " %s", ca->name); -+ rcu_read_unlock(); -+ -+ bch_err(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ ret = -BCH_ERR_insufficient_journal_devices; - goto out; - } - -@@ -240,7 +243,7 @@ void bch2_journal_space_available(struct journal *j) - total = j->space[journal_space_total].total; - - if (!j->space[journal_space_discarded].next_entry) -- ret = JOURNAL_ERR_journal_full; -+ ret = -BCH_ERR_journal_full; - - if ((j->space[journal_space_clean_ondisk].next_entry < - j->space[journal_space_clean_ondisk].total) && -@@ -252,7 +255,10 @@ void bch2_journal_space_available(struct journal *j) - - bch2_journal_set_watermark(j); - out: -- j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; -+ j->cur_entry_sectors = !ret -+ ? round_down(j->space[journal_space_discarded].next_entry, -+ block_sectors(c)) -+ : 0; - j->cur_entry_error = ret; - - if (!ret) -@@ -261,12 +267,19 @@ void bch2_journal_space_available(struct journal *j) - - /* Discards - last part of journal reclaim: */ - --static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) - { -- bool ret; -+ unsigned min_free = max(4, ja->nr / 8); - -+ return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < -+ min_free && -+ ja->discard_idx != ja->dirty_idx_ondisk; -+} -+ -+static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -+{ - spin_lock(&j->lock); -- ret = ja->discard_idx != ja->dirty_idx_ondisk; -+ bool ret = __should_discard_bucket(j, ja); - spin_unlock(&j->lock); - - return ret; -@@ -282,12 +295,12 @@ void bch2_journal_do_discards(struct journal *j) - - mutex_lock(&j->discard_lock); - -- for_each_rw_member(c, ca) { -+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { - struct journal_device *ja = &ca->journal; - - while (should_discard_bucket(j, ja)) { - if (!c->opts.nochanges && -- ca->mi.discard && -+ bch2_discard_opt_enabled(c, ca) && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, -@@ -614,7 +627,8 @@ static u64 journal_seq_to_flush(struct journal *j) - - spin_lock(&j->lock); - -- for_each_rw_member(c, ca) { -+ rcu_read_lock(); -+ for_each_rw_member_rcu(c, ca) { - struct journal_device *ja = &ca->journal; - unsigned nr_buckets, bucket_to_flush; - -@@ -624,12 +638,11 @@ static u64 journal_seq_to_flush(struct journal *j) - /* Try to keep the journal at most half full: */ - nr_buckets = ja->nr / 2; - -- nr_buckets = min(nr_buckets, ja->nr); -- - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; - seq_to_flush = max(seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - } -+ rcu_read_unlock(); - - /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, -@@ -645,7 +658,6 @@ static u64 journal_seq_to_flush(struct journal *j) - * @j: journal object - * @direct: direct or background reclaim? - * @kicked: requested to run since we last ran? -- * Returns: 0 on success, or -EIO if the journal has been shutdown - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. -@@ -685,11 +697,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) - if (kthread && kthread_should_stop()) - break; - -- if (bch2_journal_error(j)) { -- ret = -EIO; -+ ret = bch2_journal_error(j); -+ if (ret) - break; -- } - -+ /* XXX shove journal discards off to another thread */ - bch2_journal_do_discards(j); - - seq_to_flush = journal_seq_to_flush(j); -@@ -952,7 +964,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) - seq = 0; - spin_lock(&j->lock); - while (!ret) { -- struct bch_replicas_padded replicas; -+ union bch_replicas_padded replicas; - - seq = max(seq, journal_last_seq(j)); - if (seq >= j->pin.back) -diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c -index 62b910f2fb27..68b960e08f12 100644 ---- a/fs/bcachefs/journal_sb.c -+++ b/fs/bcachefs/journal_sb.c -@@ -2,8 +2,8 @@ - - #include "bcachefs.h" - #include "journal_sb.h" --#include "darray.h" - -+#include - #include - - /* BCH_SB_FIELD_journal: */ -diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c -index 1f25c111c54c..e463d2d95359 100644 ---- a/fs/bcachefs/journal_seq_blacklist.c -+++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - -- unsigned i; -- for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); -- src < bl->start + nr; -- src++, i = eytzinger0_next(i, nr)) { -+ src = bl->start; -+ eytzinger0_for_each(i, nr) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) - *dst++ = *src; -+ src++; - } - - unsigned new_nr = dst - bl->start; -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -index 1ef3a28ed6ab..51104bbb99da 100644 ---- a/fs/bcachefs/journal_types.h -+++ b/fs/bcachefs/journal_types.h -@@ -12,7 +12,11 @@ - /* btree write buffer steals 8 bits for its own purposes: */ - #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) - --#define JOURNAL_BUF_BITS 2 -+#define JOURNAL_STATE_BUF_BITS 2 -+#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) -+#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) -+ -+#define JOURNAL_BUF_BITS 4 - #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) - #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) - -@@ -82,7 +86,6 @@ struct journal_entry_pin { - - struct journal_res { - bool ref; -- u8 idx; - u16 u64s; - u32 offset; - u64 seq; -@@ -98,9 +101,8 @@ union journal_res_state { - }; - - struct { -- u64 cur_entry_offset:20, -+ u64 cur_entry_offset:22, - idx:2, -- unwritten_idx:2, - buf0_count:10, - buf1_count:10, - buf2_count:10, -@@ -110,13 +112,13 @@ union journal_res_state { - - /* bytes: */ - #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ --#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ -+#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ - - /* - * We stash some journal state as sentinal values in cur_entry_offset: - * note - cur_entry_offset is in units of u64s - */ --#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) -+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) - - #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) - #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -@@ -149,28 +151,10 @@ enum journal_flags { - #undef x - }; - --/* Reasons we may fail to get a journal reservation: */ --#define JOURNAL_ERRORS() \ -- x(ok) \ -- x(retry) \ -- x(blocked) \ -- x(max_in_flight) \ -- x(journal_full) \ -- x(journal_pin_full) \ -- x(journal_stuck) \ -- x(insufficient_devices) -- --enum journal_errors { --#define x(n) JOURNAL_ERR_##n, -- JOURNAL_ERRORS() --#undef x --}; -- --typedef DARRAY(u64) darray_u64; -- - struct journal_bio { - struct bch_dev *ca; - unsigned buf_idx; -+ u64 submit_time; - - struct bio bio; - }; -@@ -199,7 +183,7 @@ struct journal { - * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if - * insufficient devices: - */ -- enum journal_errors cur_entry_error; -+ int cur_entry_error; - unsigned cur_entry_offset_if_blocked; - - unsigned buf_size_want; -@@ -220,6 +204,8 @@ struct journal { - * other is possibly being written out. - */ - struct journal_buf buf[JOURNAL_BUF_NR]; -+ void *free_buf; -+ unsigned free_buf_size; - - spinlock_t lock; - -@@ -237,6 +223,7 @@ struct journal { - /* Sequence number of most recent journal entry (last entry in @pin) */ - atomic64_t seq; - -+ u64 seq_write_started; - /* seq, last_seq from the most recent journal entry successfully written */ - u64 seq_ondisk; - u64 flushed_seq_ondisk; -diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c -index ce794d55818f..2f63fc6d456f 100644 ---- a/fs/bcachefs/lru.c -+++ b/fs/bcachefs/lru.c -@@ -6,6 +6,7 @@ - #include "btree_iter.h" - #include "btree_update.h" - #include "btree_write_buffer.h" -+#include "ec.h" - #include "error.h" - #include "lru.h" - #include "recovery.h" -@@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); - } - --int bch2_lru_change(struct btree_trans *trans, -- u16 lru_id, u64 dev_bucket, -- u64 old_time, u64 new_time) -+int __bch2_lru_change(struct btree_trans *trans, -+ u16 lru_id, u64 dev_bucket, -+ u64 old_time, u64 new_time) - { - if (old_time == new_time) - return 0; -@@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = { - }; - - int bch2_lru_check_set(struct btree_trans *trans, -- u16 lru_id, u64 time, -+ u16 lru_id, -+ u64 dev_bucket, -+ u64 time, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) - { -@@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans, - struct btree_iter lru_iter; - struct bkey_s_c lru_k = - bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, -- lru_pos(lru_id, -- bucket_to_u64(referring_k.k->p), -- time), 0); -+ lru_pos(lru_id, dev_bucket, time), 0); - int ret = bkey_err(lru_k); - if (ret) - return ret; -@@ -100,11 +101,10 @@ int bch2_lru_check_set(struct btree_trans *trans, - goto err; - - if (fsck_err(trans, alloc_key_to_missing_lru_entry, -- "missing %s lru entry\n" -- " %s", -+ "missing %s lru entry\n%s", - bch2_lru_types[lru_type(lru_k)], - (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { -- ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); -+ ret = bch2_lru_set(trans, lru_id, dev_bucket, time); - if (ret) - goto err; - } -@@ -116,57 +116,81 @@ int bch2_lru_check_set(struct btree_trans *trans, - return ret; - } - -+static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) -+{ -+ enum bch_lru_type type = lru_type(lru_k); -+ -+ switch (type) { -+ case BCH_LRU_read: -+ case BCH_LRU_fragmentation: -+ return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); -+ case BCH_LRU_stripes: -+ return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); -+ default: -+ BUG(); -+ } -+} -+ -+static u64 bkey_lru_type_idx(struct bch_fs *c, -+ enum bch_lru_type type, -+ struct bkey_s_c k) -+{ -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ -+ switch (type) { -+ case BCH_LRU_read: -+ a = bch2_alloc_to_v4(k, &a_convert); -+ return alloc_lru_idx_read(*a); -+ case BCH_LRU_fragmentation: { -+ a = bch2_alloc_to_v4(k, &a_convert); -+ -+ rcu_read_lock(); -+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); -+ u64 idx = ca -+ ? alloc_lru_idx_fragmentation(*a, ca) -+ : 0; -+ rcu_read_unlock(); -+ return idx; -+ } -+ case BCH_LRU_stripes: -+ return k.k->type == KEY_TYPE_stripe -+ ? stripe_lru_pos(bkey_s_c_to_stripe(k).v) -+ : 0; -+ default: -+ BUG(); -+ } -+} -+ - static int bch2_check_lru_key(struct btree_trans *trans, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bkey_s_c k; -- struct bch_alloc_v4 a_convert; -- const struct bch_alloc_v4 *a; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; -- enum bch_lru_type type = lru_type(lru_k); -- struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); -- u64 idx; -- int ret; -- -- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); - -- if (fsck_err_on(!ca, -- trans, lru_entry_to_invalid_bucket, -- "lru key points to nonexistent device:bucket %llu:%llu", -- alloc_pos.inode, alloc_pos.offset)) -- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); -+ struct bbpos bp = lru_pos_to_bp(lru_k); - -- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); -- ret = bkey_err(k); -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); -+ int ret = bkey_err(k); - if (ret) - goto err; - -- a = bch2_alloc_to_v4(k, &a_convert); -- -- switch (type) { -- case BCH_LRU_read: -- idx = alloc_lru_idx_read(*a); -- break; -- case BCH_LRU_fragmentation: -- idx = alloc_lru_idx_fragmentation(*a, ca); -- break; -- } -+ enum bch_lru_type type = lru_type(lru_k); -+ u64 idx = bkey_lru_type_idx(c, type, k); - -- if (lru_k.k->type != KEY_TYPE_set || -- lru_pos_time(lru_k.k->p) != idx) { -+ if (lru_pos_time(lru_k.k->p) != idx) { - ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, lru_entry_bad, - "incorrect lru entry: lru %s time %llu\n" -- " %s\n" -- " for %s", -+ "%s\n" -+ "for %s", - bch2_lru_types[type], - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), -@@ -176,7 +200,6 @@ static int bch2_check_lru_key(struct btree_trans *trans, - err: - fsck_err: - bch2_trans_iter_exit(trans, &iter); -- bch2_dev_put(ca); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h -index f31a6cf1514c..8abd0aa2083a 100644 ---- a/fs/bcachefs/lru.h -+++ b/fs/bcachefs/lru.h -@@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) - { - u16 lru_id = l.k->p.inode >> 48; - -- if (lru_id == BCH_LRU_FRAGMENTATION_START) -+ switch (lru_id) { -+ case BCH_LRU_BUCKET_FRAGMENTATION: - return BCH_LRU_fragmentation; -- return BCH_LRU_read; -+ case BCH_LRU_STRIPE_FRAGMENTATION: -+ return BCH_LRU_stripes; -+ default: -+ return BCH_LRU_read; -+ } - } - - int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); -@@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); - - int bch2_lru_del(struct btree_trans *, u16, u64, u64); - int bch2_lru_set(struct btree_trans *, u16, u64, u64); --int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); -+int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); -+ -+static inline int bch2_lru_change(struct btree_trans *trans, -+ u16 lru_id, u64 dev_bucket, -+ u64 old_time, u64 new_time) -+{ -+ return old_time != new_time -+ ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) -+ : 0; -+} - - struct bkey_buf; --int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); -+int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); - - int bch2_check_lrus(struct bch_fs *); - -diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h -index f372cb3b8cda..b7392ad8e41f 100644 ---- a/fs/bcachefs/lru_format.h -+++ b/fs/bcachefs/lru_format.h -@@ -9,7 +9,8 @@ struct bch_lru { - - #define BCH_LRU_TYPES() \ - x(read) \ -- x(fragmentation) -+ x(fragmentation) \ -+ x(stripes) - - enum bch_lru_type { - #define x(n) BCH_LRU_##n, -@@ -17,7 +18,8 @@ enum bch_lru_type { - #undef x - }; - --#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) -+#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) -+#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) - - #define LRU_TIME_BITS 48 - #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) -diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c -index ddc187fb693d..bb7a92270c09 100644 ---- a/fs/bcachefs/migrate.c -+++ b/fs/bcachefs/migrate.c -@@ -4,10 +4,13 @@ - */ - - #include "bcachefs.h" -+#include "backpointers.h" - #include "bkey_buf.h" - #include "btree_update.h" - #include "btree_update_interior.h" -+#include "btree_write_buffer.h" - #include "buckets.h" -+#include "ec.h" - #include "errcode.h" - #include "extents.h" - #include "io_write.h" -@@ -15,11 +18,12 @@ - #include "keylist.h" - #include "migrate.h" - #include "move.h" -+#include "progress.h" - #include "replicas.h" - #include "super-io.h" - - static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, -- unsigned dev_idx, int flags, bool metadata) -+ unsigned dev_idx, unsigned flags, bool metadata) - { - unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; - unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; -@@ -36,11 +40,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - return 0; - } - -+static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, -+ struct btree *b, unsigned dev_idx, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_buf k; -+ -+ bch2_bkey_buf_init(&k); -+ bch2_bkey_buf_copy(&k, c, &b->key); -+ -+ int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: -+ bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); -+ -+ bch_err_fn(c, ret); -+ bch2_bkey_buf_exit(&k, c); -+ return ret; -+} -+ - static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - unsigned dev_idx, -- int flags) -+ unsigned flags) - { - struct bch_fs *c = trans->c; - struct bkey_i *n; -@@ -76,7 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, - return 0; - } - --static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+static int bch2_dev_btree_drop_key(struct btree_trans *trans, -+ struct bkey_s_c_backpointer bp, -+ unsigned dev_idx, -+ struct bkey_buf *last_flushed, -+ unsigned flags) -+{ -+ struct btree_iter iter; -+ struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); -+ int ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret; -+ -+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); -+ -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_dev_usrdata_drop(struct bch_fs *c, -+ struct progress_indicator_state *progress, -+ unsigned dev_idx, unsigned flags) - { - struct btree_trans *trans = bch2_trans_get(c); - enum btree_id id; -@@ -88,8 +129,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - - ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, -- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); -+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); -+ })); - if (ret) - break; - } -@@ -99,7 +142,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - return ret; - } - --static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+static int bch2_dev_metadata_drop(struct bch_fs *c, -+ struct progress_indicator_state *progress, -+ unsigned dev_idx, unsigned flags) - { - struct btree_trans *trans; - struct btree_iter iter; -@@ -123,29 +168,23 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - retry: - ret = 0; - while (bch2_trans_begin(trans), -- (b = bch2_btree_iter_peek_node(&iter)) && -+ (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { -+ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); -+ - if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) - goto next; - -- bch2_bkey_buf_copy(&k, c, &b->key); -- -- ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), -- dev_idx, flags, true); -- if (ret) -- break; -- -- ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); -+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - -- bch_err_msg(c, ret, "updating btree node key"); - if (ret) - break; - next: -- bch2_btree_iter_next_node(&iter); -+ bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; -@@ -167,8 +206,72 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - return ret; - } - --int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, -+ struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, -+ unsigned flags) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, -+ last_flushed); -+ int ret = bkey_err(k); -+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) -+ return 0; -+ if (ret) -+ return ret; -+ -+ if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) -+ goto out; -+ -+ /* -+ * XXX: pass flags arg to invalidate_stripe_to_dev and handle it -+ * properly -+ */ -+ -+ if (bkey_is_btree_ptr(k.k)) -+ ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); -+ else if (k.k->type == KEY_TYPE_stripe) -+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); -+ else -+ ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) - { -- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: -- bch2_dev_metadata_drop(c, dev_idx, flags); -+ struct btree_trans *trans = bch2_trans_get(c); -+ -+ struct bkey_buf last_flushed; -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ -+ int ret = bch2_btree_write_buffer_flush_sync(trans) ?: -+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, -+ POS(dev_idx, 0), -+ POS(dev_idx, U64_MAX), 0, k, -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ if (k.k->type != KEY_TYPE_backpointer) -+ continue; -+ -+ data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), -+ &last_flushed, flags); -+ -+ })); -+ -+ bch2_bkey_buf_exit(&last_flushed, trans->c); -+ bch2_trans_put(trans); -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) -+{ -+ struct progress_indicator_state progress; -+ bch2_progress_init(&progress, c, -+ BIT_ULL(BTREE_ID_extents)| -+ BIT_ULL(BTREE_ID_reflink)); -+ -+ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: -+ bch2_dev_metadata_drop(c, &progress, dev_idx, flags); - } -diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h -index 027efaa0d575..30018140711b 100644 ---- a/fs/bcachefs/migrate.h -+++ b/fs/bcachefs/migrate.h -@@ -2,6 +2,7 @@ - #ifndef _BCACHEFS_MIGRATE_H - #define _BCACHEFS_MIGRATE_H - --int bch2_dev_data_drop(struct bch_fs *, unsigned, int); -+int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); -+int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); - - #endif /* _BCACHEFS_MIGRATE_H */ -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -index 160b4374160a..79f4722621d5 100644 ---- a/fs/bcachefs/move.c -+++ b/fs/bcachefs/move.c -@@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { - NULL - }; - --static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, -+static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- if (trace_move_extent_enabled()) { -+ if (trace_io_move_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); -- trace_move_extent(c, buf.buf); -+ trace_io_move(c, buf.buf); - printbuf_exit(&buf); - } - } - --static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) -+static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) - { -- if (trace_move_extent_read_enabled()) { -+ if (trace_io_move_read_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); -- trace_move_extent_read(c, buf.buf); -+ trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); - } - } -@@ -67,18 +67,14 @@ static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) - struct moving_io { - struct list_head read_list; - struct list_head io_list; -- struct move_bucket_in_flight *b; -+ struct move_bucket *b; - struct closure cl; - bool read_completed; - - unsigned read_sectors; - unsigned write_sectors; - -- struct bch_read_bio rbio; -- - struct data_update write; -- /* Must be last since it is variable size */ -- struct bio_vec bi_inline_vecs[]; - }; - - static void move_free(struct moving_io *io) -@@ -88,43 +84,85 @@ static void move_free(struct moving_io *io) - if (io->b) - atomic_dec(&io->b->count); - -- bch2_data_update_exit(&io->write); -- - mutex_lock(&ctxt->lock); - list_del(&io->io_list); - wake_up(&ctxt->wait); - mutex_unlock(&ctxt->lock); - -+ if (!io->write.data_opts.scrub) { -+ bch2_data_update_exit(&io->write); -+ } else { -+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); -+ kfree(io->write.bvecs); -+ } - kfree(io); - } - - static void move_write_done(struct bch_write_op *op) - { - struct moving_io *io = container_of(op, struct moving_io, write.op); -+ struct bch_fs *c = op->c; - struct moving_context *ctxt = io->write.ctxt; - -- if (io->write.op.error) -+ if (op->error) { -+ if (trace_io_move_write_fail_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_write_op_to_text(&buf, op); -+ trace_io_move_write_fail(c, buf.buf); -+ printbuf_exit(&buf); -+ } -+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); -+ - ctxt->write_error = true; -+ } - -- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -- atomic_dec(&io->write.ctxt->write_ios); -+ atomic_sub(io->write_sectors, &ctxt->write_sectors); -+ atomic_dec(&ctxt->write_ios); - move_free(io); - closure_put(&ctxt->cl); - } - - static void move_write(struct moving_io *io) - { -- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { -+ struct bch_fs *c = io->write.op.c; -+ struct moving_context *ctxt = io->write.ctxt; -+ struct bch_read_bio *rbio = &io->write.rbio; -+ -+ if (ctxt->stats) { -+ if (rbio->bio.bi_status) -+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, -+ &ctxt->stats->sectors_error_uncorrected); -+ else if (rbio->saw_error) -+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, -+ &ctxt->stats->sectors_error_corrected); -+ } -+ -+ /* -+ * If the extent has been bitrotted, we're going to have to give it a -+ * new checksum in order to move it - but the poison bit will ensure -+ * that userspace still gets the appropriate error. -+ */ -+ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && -+ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { -+ struct bch_extent_crc_unpacked crc = rbio->pick.crc; -+ struct nonce nonce = extent_nonce(rbio->version, crc); -+ -+ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, -+ nonce, &rbio->bio); -+ rbio->ret = 0; -+ } -+ -+ if (unlikely(rbio->ret || io->write.data_opts.scrub)) { - move_free(io); - return; - } - -- if (trace_move_extent_write_enabled()) { -- struct bch_fs *c = io->write.op.c; -+ if (trace_io_move_write_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); -- trace_move_extent_write(c, buf.buf); -+ trace_io_move_write(c, buf.buf); - printbuf_exit(&buf); - } - -@@ -132,7 +170,7 @@ static void move_write(struct moving_io *io) - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_inc(&io->write.ctxt->write_ios); - -- bch2_data_update_read_done(&io->write, io->rbio.pick.crc); -+ bch2_data_update_read_done(&io->write); - } - - struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) -@@ -145,7 +183,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx - - static void move_read_endio(struct bio *bio) - { -- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); -+ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); - struct moving_context *ctxt = io->write.ctxt; - - atomic_sub(io->read_sectors, &ctxt->read_sectors); -@@ -250,7 +288,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) - } - - int bch2_move_extent(struct moving_context *ctxt, -- struct move_bucket_in_flight *bucket_in_flight, -+ struct move_bucket *bucket_in_flight, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts io_opts, -@@ -258,14 +296,10 @@ int bch2_move_extent(struct moving_context *ctxt, - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- struct moving_io *io; -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- unsigned sectors = k.k->size, pages; - int ret = -ENOMEM; - -- trace_move_extent2(c, k, &io_opts, &data_opts); -+ trace_io_move2(c, k, &io_opts, &data_opts); -+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); -@@ -273,7 +307,8 @@ int bch2_move_extent(struct moving_context *ctxt, - bch2_data_update_opts_normalize(k, &data_opts); - - if (!data_opts.rewrite_ptrs && -- !data_opts.extra_replicas) { -+ !data_opts.extra_replicas && -+ !data_opts.scrub) { - if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; -@@ -285,13 +320,7 @@ int bch2_move_extent(struct moving_context *ctxt, - */ - bch2_trans_unlock(trans); - -- /* write path might have to decompress data: */ -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); -- -- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -- io = kzalloc(sizeof(struct moving_io) + -- sizeof(struct bio_vec) * pages, GFP_KERNEL); -+ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); - if (!io) - goto err; - -@@ -300,31 +329,27 @@ int bch2_move_extent(struct moving_context *ctxt, - io->read_sectors = k.k->size; - io->write_sectors = k.k->size; - -- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); -- io->write.op.wbio.bio.bi_ioprio = -- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); -- -- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, -- GFP_KERNEL)) -- goto err_free; -+ if (!data_opts.scrub) { -+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, -+ &io_opts, data_opts, iter->btree_id, k); -+ if (ret) -+ goto err_free; - -- io->rbio.c = c; -- io->rbio.opts = io_opts; -- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); -- io->rbio.bio.bi_vcnt = pages; -- io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); -- io->rbio.bio.bi_iter.bi_size = sectors << 9; -+ io->write.op.end_io = move_write_done; -+ } else { -+ bch2_bkey_buf_init(&io->write.k); -+ bch2_bkey_buf_reassemble(&io->write.k, c, k); - -- io->rbio.bio.bi_opf = REQ_OP_READ; -- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); -- io->rbio.bio.bi_end_io = move_read_endio; -+ io->write.op.c = c; -+ io->write.data_opts = data_opts; - -- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, -- io_opts, data_opts, iter->btree_id, k); -- if (ret) -- goto err_free_pages; -+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts); -+ if (ret) -+ goto err_free; -+ } - -- io->write.op.end_io = move_write_done; -+ io->write.rbio.bio.bi_end_io = move_read_endio; -+ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); -@@ -339,9 +364,7 @@ int bch2_move_extent(struct moving_context *ctxt, - atomic_inc(&io->b->count); - } - -- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); -- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); -- trace_move_extent_read2(c, k); -+ trace_io_move_read2(c, k); - - mutex_lock(&ctxt->lock); - atomic_add(io->read_sectors, &ctxt->read_sectors); -@@ -356,39 +379,39 @@ int bch2_move_extent(struct moving_context *ctxt, - * ctxt when doing wakeup - */ - closure_get(&ctxt->cl); -- bch2_read_extent(trans, &io->rbio, -- bkey_start_pos(k.k), -- iter->btree_id, k, 0, -- BCH_READ_NODECODE| -- BCH_READ_LAST_FRAGMENT); -+ __bch2_read_extent(trans, &io->write.rbio, -+ io->write.rbio.bio.bi_iter, -+ bkey_start_pos(k.k), -+ iter->btree_id, k, 0, -+ NULL, -+ BCH_READ_last_fragment, -+ data_opts.scrub ? data_opts.read_dev : -1); - return 0; --err_free_pages: -- bio_free_pages(&io->write.op.wbio.bio); - err_free: - kfree(io); - err: -- if (ret == -BCH_ERR_data_update_done) -+ if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - - if (bch2_err_matches(ret, EROFS) || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - -- count_event(c, move_extent_start_fail); -+ count_event(c, io_move_start_fail); - -- if (trace_move_extent_start_fail_enabled()) { -+ if (trace_io_move_start_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, ": "); - prt_str(&buf, bch2_err_str(ret)); -- trace_move_extent_start_fail(c, buf.buf); -+ trace_io_move_start_fail(c, buf.buf); - printbuf_exit(&buf); - } - return ret; - } - --static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, -+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, - struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ - struct btree_iter *extent_iter, -@@ -399,6 +422,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - -+ if (extent_iter->min_depth) -+ return opts_ret; -+ - if (extent_k.k->type == KEY_TYPE_reflink_v) - goto out; - -@@ -518,11 +544,42 @@ int bch2_move_ratelimit(struct moving_context *ctxt) - return 0; - } - --static int bch2_move_data_btree(struct moving_context *ctxt, -- struct bpos start, -- struct bpos end, -- move_pred_fn pred, void *arg, -- enum btree_id btree_id) -+/* -+ * Move requires non extents iterators, and there's also no need for it to -+ * signal indirect_extent_missing_error: -+ */ -+static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c_reflink_p p) -+{ -+ if (unlikely(REFLINK_P_ERROR(p.v))) -+ return bkey_s_c_null; -+ -+ struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); -+ -+ bch2_trans_iter_init(trans, iter, -+ BTREE_ID_reflink, reflink_pos, -+ BTREE_ITER_not_extents); -+ -+ struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); -+ if (!k.k || bkey_err(k)) { -+ bch2_trans_iter_exit(trans, iter); -+ return k; -+ } -+ -+ if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { -+ bch2_trans_iter_exit(trans, iter); -+ return bkey_s_c_null; -+ } -+ -+ return k; -+} -+ -+int bch2_move_data_btree(struct moving_context *ctxt, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ enum btree_id btree_id, unsigned level) - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; -@@ -548,10 +605,56 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - ctxt->stats->pos = BBPOS(btree_id, start); - } - -+retry_root: - bch2_trans_begin(trans); -- bch2_trans_iter_init(trans, &iter, btree_id, start, -- BTREE_ITER_prefetch| -- BTREE_ITER_all_snapshots); -+ -+ if (level == bch2_btree_id_root(c, btree_id)->level + 1) { -+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, -+ BTREE_ITER_prefetch| -+ BTREE_ITER_not_extents| -+ BTREE_ITER_all_snapshots); -+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter); -+ ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ goto root_err; -+ -+ if (b != btree_node_root(c, b)) { -+ bch2_trans_iter_exit(trans, &iter); -+ goto retry_root; -+ } -+ -+ k = bkey_i_to_s_c(&b->key); -+ -+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, -+ iter.pos, &iter, k); -+ ret = PTR_ERR_OR_ZERO(io_opts); -+ if (ret) -+ goto root_err; -+ -+ memset(&data_opts, 0, sizeof(data_opts)); -+ if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) -+ goto out; -+ -+ -+ if (!data_opts.scrub) -+ ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, -+ k.k->p, data_opts.target, 0); -+ else -+ ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); -+ -+root_err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ bch2_trans_iter_exit(trans, &iter); -+ goto retry_root; -+ } -+ -+ goto out; -+ } -+ -+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, -+ BTREE_ITER_prefetch| -+ BTREE_ITER_not_extents| -+ BTREE_ITER_all_snapshots); - - if (ctxt->rate) - bch2_ratelimit_reset(ctxt->rate); -@@ -561,7 +664,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - - bch2_trans_begin(trans); - -- k = bch2_btree_iter_peek(&iter); -+ k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - -@@ -571,7 +674,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - if (ret) - break; - -- if (bkey_ge(bkey_start_pos(k.k), end)) -+ if (bkey_gt(bkey_start_pos(k.k), end)) - break; - - if (ctxt->stats) -@@ -581,17 +684,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - k.k->type == KEY_TYPE_reflink_p && - REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - - bch2_trans_iter_exit(trans, &reflink_iter); -- k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); -+ k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - -- if (bkey_deleted(k.k)) -+ if (!k.k) - goto next_nondata; - - /* -@@ -612,7 +714,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - continue; - - memset(&data_opts, 0, sizeof(data_opts)); -- if (!pred(c, arg, k, io_opts, &data_opts)) -+ if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) - goto next; - - /* -@@ -622,12 +724,19 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - -- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); -+ if (!level) -+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); -+ else if (!data_opts.scrub) -+ ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, -+ k.k->p, data_opts.target, 0); -+ else -+ ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); -+ - if (ret2) { - if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) - continue; - -- if (ret2 == -ENOMEM) { -+ if (bch2_err_matches(ret2, ENOMEM)) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; -@@ -640,9 +749,10 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); - next_nondata: -- bch2_btree_iter_advance(&iter); -+ if (!bch2_btree_iter_advance(trans, &iter)) -+ break; - } -- -+out: - bch2_trans_iter_exit(trans, &reflink_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); -@@ -672,7 +782,7 @@ int __bch2_move_data(struct moving_context *ctxt, - ret = bch2_move_data_btree(ctxt, - id == start.btree ? start.pos : POS_MIN, - id == end.btree ? end.pos : POS_MAX, -- pred, arg, id); -+ pred, arg, id, 0); - if (ret) - break; - } -@@ -689,21 +799,23 @@ int bch2_move_data(struct bch_fs *c, - bool wait_on_copygc, - move_pred_fn pred, void *arg) - { -- - struct moving_context ctxt; -- int ret; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -- ret = __bch2_move_data(&ctxt, start, end, pred, arg); -+ int ret = __bch2_move_data(&ctxt, start, end, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; - } - --int bch2_evacuate_bucket(struct moving_context *ctxt, -- struct move_bucket_in_flight *bucket_in_flight, -- struct bpos bucket, int gen, -- struct data_update_opts _data_opts) -+static int __bch2_move_data_phys(struct moving_context *ctxt, -+ struct move_bucket *bucket_in_flight, -+ unsigned dev, -+ u64 bucket_start, -+ u64 bucket_end, -+ unsigned data_types, -+ bool copygc, -+ move_pred_fn pred, void *arg) - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; -@@ -712,16 +824,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - struct btree_iter iter = {}, bp_iter = {}; - struct bkey_buf sk; - struct bkey_s_c k; -- struct data_update_opts data_opts; -- unsigned sectors_moved = 0; - struct bkey_buf last_flushed; -+ u64 check_mismatch_done = bucket_start; - int ret = 0; - -- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); -+ struct bch_dev *ca = bch2_dev_tryget(c, dev); - if (!ca) - return 0; - -- trace_bucket_evacuate(c, &bucket); -+ bucket_end = min(bucket_end, ca->mi.nbuckets); -+ -+ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); -+ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); -@@ -732,15 +846,11 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - */ - bch2_trans_begin(trans); - -- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, -- bucket_pos_to_bp_start(ca, bucket), 0); -- -- bch_err_msg(c, ret, "looking up alloc key"); -- if (ret) -- goto err; -+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - - ret = bch2_btree_write_buffer_tryflush(trans); -- bch_err_msg(c, ret, "flushing btree write buffer"); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_msg(c, ret, "flushing btree write buffer"); - if (ret) - goto err; - -@@ -750,122 +860,182 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - - bch2_trans_begin(trans); - -- k = bch2_btree_iter_peek(&bp_iter); -+ k = bch2_btree_iter_peek(trans, &bp_iter); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - -- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) -+ if (!k.k || bkey_gt(k.k->p, bp_end)) - break; - -+ if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { -+ while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { -+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, -+ copygc, &last_flushed); -+ } -+ continue; -+ } -+ - if (k.k->type != KEY_TYPE_backpointer) - goto next; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - -- if (!bp.v->level) { -- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); -- ret = bkey_err(k); -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret) -- goto err; -- if (!k.k) -- goto next; -+ if (ctxt->stats) -+ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - -- bch2_bkey_buf_reassemble(&sk, c, k); -- k = bkey_i_to_s_c(sk.k); -+ if (!(data_types & BIT(bp.v->data_type))) -+ goto next; -+ -+ if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) -+ goto next; -+ -+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); -+ ret = bkey_err(k); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ goto err; -+ if (!k.k) -+ goto next; - -+ if (!bp.v->level) { - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - continue; - } -+ } - -- data_opts = _data_opts; -- data_opts.target = io_opts.background_target; -- data_opts.rewrite_ptrs = 0; -- -- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ -- unsigned i = 0; -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { -- if (p.ptr.dev == bucket.inode) { -- if (p.ptr.cached) { -- bch2_trans_iter_exit(trans, &iter); -- goto next; -- } -- data_opts.rewrite_ptrs |= 1U << i; -- break; -- } -- i++; -- } -- -- ret = bch2_move_extent(ctxt, bucket_in_flight, -- &iter, k, io_opts, data_opts); -+ struct data_update_opts data_opts = {}; -+ if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) { - bch2_trans_iter_exit(trans, &iter); -+ goto next; -+ } - -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret == -ENOMEM) { -- /* memory allocation failure, wait for some IO to finish */ -- bch2_move_ctxt_wait_for_io(ctxt); -- continue; -- } -- if (ret) -- goto err; -- -- if (ctxt->stats) -- atomic64_add(sectors, &ctxt->stats->sectors_seen); -- sectors_moved += sectors; -- } else { -- struct btree *b; -+ if (data_opts.scrub && -+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) { -+ bch2_trans_iter_exit(trans, &iter); -+ ret = -BCH_ERR_device_offline; -+ break; -+ } - -- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); -- ret = PTR_ERR_OR_ZERO(b); -- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) -- goto next; -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret) -- goto err; -- if (!b) -- goto next; -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); - -- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); -+ /* move_extent will drop locks */ -+ unsigned sectors = bp.v->bucket_len; - -- ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -- bch2_trans_iter_exit(trans, &iter); -+ if (!bp.v->level) -+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); -+ else if (!data_opts.scrub) -+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, -+ k.k->p, data_opts.target, 0); -+ else -+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret) -- goto err; -+ bch2_trans_iter_exit(trans, &iter); - -- if (ctxt->rate) -- bch2_ratelimit_increment(ctxt->rate, sectors); -- if (ctxt->stats) { -- atomic64_add(sectors, &ctxt->stats->sectors_seen); -- atomic64_add(sectors, &ctxt->stats->sectors_moved); -- } -- sectors_moved += btree_sectors(c); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret == -ENOMEM) { -+ /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); -+ continue; - } -+ if (ret) -+ goto err; -+ -+ if (ctxt->stats) -+ atomic64_add(sectors, &ctxt->stats->sectors_seen); - next: -- bch2_btree_iter_advance(&bp_iter); -+ bch2_btree_iter_advance(trans, &bp_iter); - } - -- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); -+ while (check_mismatch_done < bucket_end) -+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, -+ copygc, &last_flushed); - err: - bch2_trans_iter_exit(trans, &bp_iter); -- bch2_dev_put(ca); - bch2_bkey_buf_exit(&sk, c); - bch2_bkey_buf_exit(&last_flushed, c); -+ bch2_dev_put(ca); - return ret; - } - -+int bch2_move_data_phys(struct bch_fs *c, -+ unsigned dev, -+ u64 start, -+ u64 end, -+ unsigned data_types, -+ struct bch_ratelimit *rate, -+ struct bch_move_stats *stats, -+ struct write_point_specifier wp, -+ bool wait_on_copygc, -+ move_pred_fn pred, void *arg) -+{ -+ struct moving_context ctxt; -+ -+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); -+ -+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -+ if (ctxt.stats) { -+ ctxt.stats->phys = true; -+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; -+ } -+ -+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, -+ data_types, false, pred, arg); -+ bch2_moving_ctxt_exit(&ctxt); -+ -+ return ret; -+} -+ -+struct evacuate_bucket_arg { -+ struct bpos bucket; -+ int gen; -+ struct data_update_opts data_opts; -+}; -+ -+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, -+ enum btree_id btree, struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ struct evacuate_bucket_arg *arg = _arg; -+ -+ *data_opts = arg->data_opts; -+ -+ unsigned i = 0; -+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { -+ if (ptr->dev == arg->bucket.inode && -+ (arg->gen < 0 || arg->gen == ptr->gen) && -+ !ptr->cached) -+ data_opts->rewrite_ptrs |= BIT(i); -+ i++; -+ } -+ -+ return data_opts->rewrite_ptrs != 0; -+} -+ -+int bch2_evacuate_bucket(struct moving_context *ctxt, -+ struct move_bucket *bucket_in_flight, -+ struct bpos bucket, int gen, -+ struct data_update_opts data_opts) -+{ -+ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; -+ -+ return __bch2_move_data_phys(ctxt, bucket_in_flight, -+ bucket.inode, -+ bucket.offset, -+ bucket.offset + 1, -+ ~0, -+ true, -+ evacuate_bucket_pred, &arg); -+} -+ - typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_update_opts *); -@@ -906,7 +1076,7 @@ static int bch2_move_btree(struct bch_fs *c, - retry: - ret = 0; - while (bch2_trans_begin(trans), -- (b = bch2_btree_iter_peek_node(&iter)) && -+ (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { - if (kthread && kthread_should_stop()) - break; -@@ -920,13 +1090,13 @@ static int bch2_move_btree(struct bch_fs *c, - if (!pred(c, arg, b, &io_opts, &data_opts)) - goto next; - -- ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; -+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - next: -- bch2_btree_iter_next_node(&iter); -+ bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; -@@ -945,7 +1115,7 @@ static int bch2_move_btree(struct bch_fs *c, - } - - static bool rereplicate_pred(struct bch_fs *c, void *arg, -- struct bkey_s_c k, -+ enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -@@ -977,7 +1147,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, - } - - static bool migrate_pred(struct bch_fs *c, void *arg, -- struct bkey_s_c k, -+ enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -@@ -1004,15 +1174,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); --} -- --static bool migrate_btree_pred(struct bch_fs *c, void *arg, -- struct btree *b, -- struct bch_io_opts *io_opts, -- struct data_update_opts *data_opts) --{ -- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -+ return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); - } - - /* -@@ -1068,7 +1230,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) - } - - static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, -- struct bkey_s_c k, -+ enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -@@ -1101,7 +1263,32 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -+ return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), -+ io_opts, data_opts); -+} -+ -+static bool scrub_pred(struct bch_fs *c, void *_arg, -+ enum btree_id btree, struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ struct bch_ioctl_data *arg = _arg; -+ -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == arg->migrate.dev) { -+ if (!p.crc.csum_type) -+ return false; -+ break; -+ } -+ } -+ -+ data_opts->scrub = true; -+ data_opts->read_dev = arg->migrate.dev; -+ return true; - } - - int bch2_data_job(struct bch_fs *c, -@@ -1118,6 +1305,22 @@ int bch2_data_job(struct bch_fs *c, - bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); - - switch (op.op) { -+ case BCH_DATA_OP_scrub: -+ /* -+ * prevent tests from spuriously failing, make sure we see all -+ * btree nodes that need to be repaired -+ */ -+ bch2_btree_interior_updates_flush(c); -+ -+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, -+ op.scrub.data_types, -+ NULL, -+ stats, -+ writepoint_hashed((unsigned long) current), -+ false, -+ scrub_pred, &op) ?: ret; -+ break; -+ - case BCH_DATA_OP_rereplicate: - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, -1); -@@ -1137,14 +1340,14 @@ int bch2_data_job(struct bch_fs *c, - - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); -- ret = bch2_move_btree(c, start, end, -- migrate_btree_pred, &op, stats) ?: ret; -- ret = bch2_move_data(c, start, end, -- NULL, -- stats, -- writepoint_hashed((unsigned long) current), -- true, -- migrate_pred, &op) ?: ret; -+ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, -+ ~0, -+ NULL, -+ stats, -+ writepoint_hashed((unsigned long) current), -+ true, -+ migrate_pred, &op) ?: ret; -+ bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_rewrite_old_nodes: -@@ -1176,17 +1379,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) - prt_newline(out); - printbuf_indent_add(out, 2); - -- prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); -- prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); -- prt_printf(out, "bytes seen: "); -+ prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); -+ prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); -+ prt_printf(out, "bytes seen:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); - prt_newline(out); - -- prt_printf(out, "bytes moved: "); -+ prt_printf(out, "bytes moved:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); - prt_newline(out); - -- prt_printf(out, "bytes raced: "); -+ prt_printf(out, "bytes raced:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); - prt_newline(out); - -@@ -1195,7 +1398,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) - - static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) - { -- struct moving_io *io; -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 32); - - bch2_move_stats_to_text(out, ctxt->stats); - printbuf_indent_add(out, 2); -@@ -1215,8 +1419,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str - printbuf_indent_add(out, 2); - - mutex_lock(&ctxt->lock); -+ struct moving_io *io; - list_for_each_entry(io, &ctxt->ios, io_list) -- bch2_write_op_to_text(out, &io->write.op); -+ bch2_data_update_inflight_to_text(out, &io->write); - mutex_unlock(&ctxt->lock); - - printbuf_indent_sub(out, 4); -diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h -index 51e0505a8156..86b80499ac55 100644 ---- a/fs/bcachefs/move.h -+++ b/fs/bcachefs/move.h -@@ -72,7 +72,7 @@ do { \ - break; \ - } while (1) - --typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, -+typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, struct data_update_opts *); - - extern const char * const bch2_data_ops_strs[]; -@@ -116,12 +116,18 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, - int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); - - int bch2_move_extent(struct moving_context *, -- struct move_bucket_in_flight *, -+ struct move_bucket *, - struct btree_iter *, - struct bkey_s_c, - struct bch_io_opts, - struct data_update_opts); - -+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, -+ struct per_snapshot_io_opts *, struct bpos, -+ struct btree_iter *, struct bkey_s_c); -+ -+int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, -+ move_pred_fn, void *, enum btree_id, unsigned); - int __bch2_move_data(struct moving_context *, - struct bbpos, - struct bbpos, -@@ -135,8 +141,13 @@ int bch2_move_data(struct bch_fs *, - bool, - move_pred_fn, void *); - -+int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, -+ struct bch_ratelimit *, struct bch_move_stats *, -+ struct write_point_specifier, bool, -+ move_pred_fn, void *); -+ - int bch2_evacuate_bucket(struct moving_context *, -- struct move_bucket_in_flight *, -+ struct move_bucket *, - struct bpos, int, - struct data_update_opts); - int bch2_data_job(struct bch_fs *, -diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h -index e22841ef31e4..c5c62cd600de 100644 ---- a/fs/bcachefs/move_types.h -+++ b/fs/bcachefs/move_types.h -@@ -3,33 +3,43 @@ - #define _BCACHEFS_MOVE_TYPES_H - - #include "bbpos_types.h" -+#include "bcachefs_ioctl.h" - - struct bch_move_stats { -- enum bch_data_type data_type; -- struct bbpos pos; - char name[32]; -+ bool phys; -+ enum bch_ioctl_data_event_ret ret; -+ -+ union { -+ struct { -+ enum bch_data_type data_type; -+ struct bbpos pos; -+ }; -+ struct { -+ unsigned dev; -+ u64 offset; -+ }; -+ }; - - atomic64_t keys_moved; - atomic64_t keys_raced; - atomic64_t sectors_seen; - atomic64_t sectors_moved; - atomic64_t sectors_raced; -+ atomic64_t sectors_error_corrected; -+ atomic64_t sectors_error_uncorrected; - }; - - struct move_bucket_key { - struct bpos bucket; -- u8 gen; -+ unsigned gen; - }; - - struct move_bucket { -+ struct move_bucket *next; -+ struct rhash_head hash; - struct move_bucket_key k; - unsigned sectors; --}; -- --struct move_bucket_in_flight { -- struct move_bucket_in_flight *next; -- struct rhash_head hash; -- struct move_bucket bucket; - atomic_t count; - }; - -diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -index 6718dc37c5a3..e7a2a13554d7 100644 ---- a/fs/bcachefs/movinggc.c -+++ b/fs/bcachefs/movinggc.c -@@ -8,6 +8,7 @@ - #include "bcachefs.h" - #include "alloc_background.h" - #include "alloc_foreground.h" -+#include "backpointers.h" - #include "btree_iter.h" - #include "btree_update.h" - #include "btree_write_buffer.h" -@@ -27,47 +28,32 @@ - #include - - struct buckets_in_flight { -- struct rhashtable table; -- struct move_bucket_in_flight *first; -- struct move_bucket_in_flight *last; -- size_t nr; -- size_t sectors; -+ struct rhashtable table; -+ struct move_bucket *first; -+ struct move_bucket *last; -+ size_t nr; -+ size_t sectors; -+ -+ DARRAY(struct move_bucket *) to_evacuate; - }; - - static const struct rhashtable_params bch_move_bucket_params = { -- .head_offset = offsetof(struct move_bucket_in_flight, hash), -- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), -+ .head_offset = offsetof(struct move_bucket, hash), -+ .key_offset = offsetof(struct move_bucket, k), - .key_len = sizeof(struct move_bucket_key), - .automatic_shrinking = true, - }; - --static struct move_bucket_in_flight * --move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) -+static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) - { -- struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); -- int ret; -- -- if (!new) -- return ERR_PTR(-ENOMEM); -- -- new->bucket = b; -- -- ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, -- bch_move_bucket_params); -- if (ret) { -- kfree(new); -- return ERR_PTR(ret); -- } -- - if (!list->first) -- list->first = new; -+ list->first = b; - else -- list->last->next = new; -+ list->last->next = b; - -- list->last = new; -+ list->last = b; - list->nr++; -- list->sectors += b.sectors; -- return new; -+ list->sectors += b->sectors; - } - - static int bch2_bucket_is_movable(struct btree_trans *trans, -@@ -89,9 +75,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, - if (!ca) - goto out; - -+ if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) -+ goto out; -+ - if (ca->mi.state != BCH_MEMBER_STATE_rw || - !bch2_dev_is_online(ca)) -- goto out_put; -+ goto out; - - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); -@@ -100,19 +89,26 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - - ret = lru_idx && lru_idx <= time; --out_put: -- bch2_dev_put(ca); - out: -+ bch2_dev_put(ca); - bch2_trans_iter_exit(trans, &iter); - return ret; - } - -+static void move_bucket_free(struct buckets_in_flight *list, -+ struct move_bucket *b) -+{ -+ int ret = rhashtable_remove_fast(&list->table, &b->hash, -+ bch_move_bucket_params); -+ BUG_ON(ret); -+ kfree(b); -+} -+ - static void move_buckets_wait(struct moving_context *ctxt, - struct buckets_in_flight *list, - bool flush) - { -- struct move_bucket_in_flight *i; -- int ret; -+ struct move_bucket *i; - - while ((i = list->first)) { - if (flush) -@@ -126,12 +122,9 @@ static void move_buckets_wait(struct moving_context *ctxt, - list->last = NULL; - - list->nr--; -- list->sectors -= i->bucket.sectors; -+ list->sectors -= i->sectors; - -- ret = rhashtable_remove_fast(&list->table, &i->hash, -- bch_move_bucket_params); -- BUG_ON(ret); -- kfree(i); -+ move_bucket_free(list, i); - } - - bch2_trans_unlock_long(ctxt->trans); -@@ -143,11 +136,8 @@ static bool bucket_in_flight(struct buckets_in_flight *list, - return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); - } - --typedef DARRAY(struct move_bucket) move_buckets; -- - static int bch2_copygc_get_buckets(struct moving_context *ctxt, -- struct buckets_in_flight *buckets_in_flight, -- move_buckets *buckets) -+ struct buckets_in_flight *buckets_in_flight) - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; -@@ -164,11 +154,9 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, - if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) - return ret; - -- bch2_trans_begin(trans); -- - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, -- lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), -- lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), -+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), -+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), - 0, k, ({ - struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret2 = 0; -@@ -184,20 +172,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, - else if (bucket_in_flight(buckets_in_flight, b.k)) - in_flight++; - else { -- ret2 = darray_push(buckets, b); -+ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); -+ ret2 = b_i ? 0 : -ENOMEM; - if (ret2) - goto err; -+ -+ *b_i = b; -+ -+ ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); -+ if (ret2) { -+ kfree(b_i); -+ goto err; -+ } -+ -+ ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, -+ bch_move_bucket_params); -+ BUG_ON(ret2); -+ - sectors += b.sectors; - } - -- ret2 = buckets->nr >= nr_to_get; -+ ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; - err: - ret2; - })); - - pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", - buckets_in_flight->nr, buckets_in_flight->sectors, -- saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); -+ saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); - - return ret < 0 ? ret : 0; - } -@@ -212,40 +214,30 @@ static int bch2_copygc(struct moving_context *ctxt, - struct data_update_opts data_opts = { - .btree_insert_flags = BCH_WATERMARK_copygc, - }; -- move_buckets buckets = { 0 }; -- struct move_bucket_in_flight *f; - u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); - u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); - int ret = 0; - -- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); -+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); - if (ret) - goto err; - -- darray_for_each(buckets, i) { -+ darray_for_each(buckets_in_flight->to_evacuate, i) { - if (kthread_should_stop() || freezing(current)) - break; - -- f = move_bucket_in_flight_add(buckets_in_flight, *i); -- ret = PTR_ERR_OR_ZERO(f); -- if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ -- ret = 0; -- continue; -- } -- if (ret == -ENOMEM) { /* flush IO, continue later */ -- ret = 0; -- break; -- } -+ struct move_bucket *b = *i; -+ *i = NULL; - -- ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, -- f->bucket.k.gen, data_opts); -+ move_bucket_in_flight_add(buckets_in_flight, b); -+ -+ ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); - if (ret) - goto err; - - *did_work = true; - } - err: -- - /* no entries in LRU btree found, or got to end: */ - if (bch2_err_matches(ret, ENOENT)) - ret = 0; -@@ -255,12 +247,34 @@ static int bch2_copygc(struct moving_context *ctxt, - - sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; - sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; -- trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); -+ trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - -- darray_exit(&buckets); -+ darray_for_each(buckets_in_flight->to_evacuate, i) -+ if (*i) -+ move_bucket_free(buckets_in_flight, *i); -+ darray_exit(&buckets_in_flight->to_evacuate); - return ret; - } - -+static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) -+{ -+ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); -+ struct bch_dev_usage usage; -+ -+ for (unsigned i = 0; i < BCH_DATA_NR; i++) -+ usage.buckets[i] = usage_full.d[i].buckets; -+ -+ s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * -+ ca->mi.bucket_size) >> 1); -+ s64 fragmented = 0; -+ -+ for (unsigned i = 0; i < BCH_DATA_NR; i++) -+ if (data_type_movable(i)) -+ fragmented += usage_full.d[i].fragmented; -+ -+ return max(0LL, fragmented_allowed - fragmented); -+} -+ - /* - * Copygc runs when the amount of fragmented data is above some arbitrary - * threshold: -@@ -275,23 +289,14 @@ static int bch2_copygc(struct moving_context *ctxt, - * often and continually reduce the amount of fragmented space as the device - * fills up. So, we increase the threshold by half the current free space. - */ --unsigned long bch2_copygc_wait_amount(struct bch_fs *c) -+u64 bch2_copygc_wait_amount(struct bch_fs *c) - { -- s64 wait = S64_MAX, fragmented_allowed, fragmented; -- -- for_each_rw_member(c, ca) { -- struct bch_dev_usage usage = bch2_dev_usage_read(ca); -+ u64 wait = U64_MAX; - -- fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * -- ca->mi.bucket_size) >> 1); -- fragmented = 0; -- -- for (unsigned i = 0; i < BCH_DATA_NR; i++) -- if (data_type_movable(i)) -- fragmented += usage.d[i].fragmented; -- -- wait = min(wait, max(0LL, fragmented_allowed - fragmented)); -- } -+ rcu_read_lock(); -+ for_each_rw_member_rcu(c, ca) -+ wait = min(wait, bch2_copygc_dev_wait_amount(ca)); -+ rcu_read_unlock(); - - return wait; - } -@@ -314,9 +319,28 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) - c->copygc_wait_at) << 9); - prt_newline(out); - -- prt_printf(out, "Currently calculated wait:\t"); -- prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); -- prt_newline(out); -+ bch2_printbuf_make_room(out, 4096); -+ -+ rcu_read_lock(); -+ out->atomic++; -+ -+ prt_printf(out, "Currently calculated wait:\n"); -+ for_each_rw_member_rcu(c, ca) { -+ prt_printf(out, " %s:\t", ca->name); -+ prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); -+ prt_newline(out); -+ } -+ -+ struct task_struct *t = rcu_dereference(c->copygc_thread); -+ if (t) -+ get_task_struct(t); -+ --out->atomic; -+ rcu_read_unlock(); -+ -+ if (t) { -+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); -+ put_task_struct(t); -+ } - } - - static int bch2_copygc_thread(void *arg) -@@ -325,22 +349,23 @@ static int bch2_copygc_thread(void *arg) - struct moving_context ctxt; - struct bch_move_stats move_stats; - struct io_clock *clock = &c->io_clock[WRITE]; -- struct buckets_in_flight *buckets; -+ struct buckets_in_flight buckets = {}; - u64 last, wait; -- int ret = 0; - -- buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); -- if (!buckets) -- return -ENOMEM; -- ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); -+ int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); -- if (ret) { -- kfree(buckets); -+ if (ret) - return ret; -- } - - set_freezable(); - -+ /* -+ * Data move operations can't run until after check_snapshots has -+ * completed, and bch2_snapshot_is_ancestor() is available. -+ */ -+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || -+ kthread_should_stop()); -+ - bch2_move_stats_init(&move_stats, "copygc"); - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, - writepoint_ptr(&c->copygc_write_point), -@@ -353,13 +378,13 @@ static int bch2_copygc_thread(void *arg) - cond_resched(); - - if (!c->opts.copygc_enabled) { -- move_buckets_wait(&ctxt, buckets, true); -+ move_buckets_wait(&ctxt, &buckets, true); - kthread_wait_freezable(c->opts.copygc_enabled || - kthread_should_stop()); - } - - if (unlikely(freezing(current))) { -- move_buckets_wait(&ctxt, buckets, true); -+ move_buckets_wait(&ctxt, &buckets, true); - __refrigerator(false); - continue; - } -@@ -370,7 +395,7 @@ static int bch2_copygc_thread(void *arg) - if (wait > clock->max_slop) { - c->copygc_wait_at = last; - c->copygc_wait = last + wait; -- move_buckets_wait(&ctxt, buckets, true); -+ move_buckets_wait(&ctxt, &buckets, true); - trace_and_count(c, copygc_wait, c, wait, last + wait); - bch2_kthread_io_clock_wait(clock, last + wait, - MAX_SCHEDULE_TIMEOUT); -@@ -380,7 +405,7 @@ static int bch2_copygc_thread(void *arg) - c->copygc_wait = 0; - - c->copygc_running = true; -- ret = bch2_copygc(&ctxt, buckets, &did_work); -+ ret = bch2_copygc(&ctxt, &buckets, &did_work); - c->copygc_running = false; - - wake_up(&c->copygc_running_wq); -@@ -391,16 +416,14 @@ static int bch2_copygc_thread(void *arg) - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - -- move_buckets_wait(&ctxt, buckets, true); -+ move_buckets_wait(&ctxt, &buckets, true); - bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), - MAX_SCHEDULE_TIMEOUT); - } - } - -- move_buckets_wait(&ctxt, buckets, true); -- -- rhashtable_destroy(&buckets->table); -- kfree(buckets); -+ move_buckets_wait(&ctxt, &buckets, true); -+ rhashtable_destroy(&buckets.table); - bch2_moving_ctxt_exit(&ctxt); - bch2_move_stats_exit(&move_stats, c); - -diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h -index ea181fef5bc9..b9683d22bab0 100644 ---- a/fs/bcachefs/movinggc.h -+++ b/fs/bcachefs/movinggc.h -@@ -2,9 +2,18 @@ - #ifndef _BCACHEFS_MOVINGGC_H - #define _BCACHEFS_MOVINGGC_H - --unsigned long bch2_copygc_wait_amount(struct bch_fs *); -+u64 bch2_copygc_wait_amount(struct bch_fs *); - void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); - -+static inline void bch2_copygc_wakeup(struct bch_fs *c) -+{ -+ rcu_read_lock(); -+ struct task_struct *p = rcu_dereference(c->copygc_thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ - void bch2_copygc_stop(struct bch_fs *); - int bch2_copygc_start(struct bch_fs *); - void bch2_fs_copygc_init(struct bch_fs *); -diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c -similarity index 52% -rename from fs/bcachefs/fs-common.c -rename to fs/bcachefs/namei.c -index 2c3d46ac70c6..a84b69d6caef 100644 ---- a/fs/bcachefs/fs-common.c -+++ b/fs/bcachefs/namei.c -@@ -4,13 +4,21 @@ - #include "acl.h" - #include "btree_update.h" - #include "dirent.h" --#include "fs-common.h" - #include "inode.h" -+#include "namei.h" - #include "subvolume.h" - #include "xattr.h" - - #include - -+static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) -+{ -+ return (subvol_inum) { -+ .subvol = inode->bi_parent_subvol ?: inum.subvol, -+ .inum = inode->bi_dir, -+ }; -+} -+ - static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) - { - return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; -@@ -28,8 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, - unsigned flags) - { - struct bch_fs *c = trans->c; -- struct btree_iter dir_iter = { NULL }; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter dir_iter = {}; -+ struct btree_iter inode_iter = {}; - subvol_inum new_inum = dir; - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); -@@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans, - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - /* Normal create path - allocate a new inode: */ -- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); -+ bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); - - if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_unlinked; -@@ -123,8 +131,8 @@ int bch2_create_trans(struct btree_trans *trans, - if (ret) - goto err; - -- bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); -- ret = bch2_btree_iter_traverse(&dir_iter); -+ bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); -+ ret = bch2_btree_iter_traverse(trans, &dir_iter); - if (ret) - goto err; - } -@@ -153,16 +161,13 @@ int bch2_create_trans(struct btree_trans *trans, - dir_u->bi_nlink++; - dir_u->bi_mtime = dir_u->bi_ctime = now; - -- ret = bch2_inode_write(trans, &dir_iter, dir_u); -- if (ret) -- goto err; -- -- ret = bch2_dirent_create(trans, dir, &dir_hash, -- dir_type, -- name, -- dir_target, -- &dir_offset, -- STR_HASH_must_create|BTREE_ITER_with_updates); -+ ret = bch2_dirent_create(trans, dir, &dir_hash, -+ dir_type, -+ name, -+ dir_target, -+ &dir_offset, -+ STR_HASH_must_create|BTREE_ITER_with_updates) ?: -+ bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - -@@ -175,9 +180,9 @@ int bch2_create_trans(struct btree_trans *trans, - new_inode->bi_depth = dir_u->bi_depth + 1; - - inode_iter.flags &= ~BTREE_ITER_all_snapshots; -- bch2_btree_iter_set_snapshot(&inode_iter, snapshot); -+ bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - -- ret = bch2_btree_iter_traverse(&inode_iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, new_inode); - err: - bch2_trans_iter_exit(trans, &inode_iter); -@@ -191,8 +196,8 @@ int bch2_link_trans(struct btree_trans *trans, - const struct qstr *name) - { - struct bch_fs *c = trans->c; -- struct btree_iter dir_iter = { NULL }; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter dir_iter = {}; -+ struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - u64 now = bch2_current_time(c); - u64 dir_offset = 0; -@@ -225,7 +230,8 @@ int bch2_link_trans(struct btree_trans *trans, - - ret = bch2_dirent_create(trans, dir, &dir_hash, - mode_to_type(inode_u->bi_mode), -- name, inum.inum, &dir_offset, -+ name, inum.inum, -+ &dir_offset, - STR_HASH_must_create); - if (ret) - goto err; -@@ -249,9 +255,9 @@ int bch2_unlink_trans(struct btree_trans *trans, - bool deleting_subvol) - { - struct bch_fs *c = trans->c; -- struct btree_iter dir_iter = { NULL }; -- struct btree_iter dirent_iter = { NULL }; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter dir_iter = {}; -+ struct btree_iter dirent_iter = {}; -+ struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - subvol_inum inum; - u64 now = bch2_current_time(c); -@@ -297,7 +303,7 @@ int bch2_unlink_trans(struct btree_trans *trans, - if (ret) - goto err; - -- k = bch2_btree_iter_peek_slot(&dirent_iter); -+ k = bch2_btree_iter_peek_slot(trans, &dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -306,8 +312,8 @@ int bch2_unlink_trans(struct btree_trans *trans, - * If we're deleting a subvolume, we need to really delete the - * dirent, not just emit a whiteout in the current snapshot: - */ -- bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); -- ret = bch2_btree_iter_traverse(&dirent_iter); -+ bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); -+ ret = bch2_btree_iter_traverse(trans, &dirent_iter); - if (ret) - goto err; - } else { -@@ -343,6 +349,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, - bool ret = false; - - for (id = 0; id < Inode_opt_nr; id++) { -+ if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold) -+ continue; -+ - /* Skip attributes that were explicitly set on this inode */ - if (dst_u->bi_fields_set & (1 << id)) - continue; -@@ -386,10 +395,10 @@ int bch2_rename_trans(struct btree_trans *trans, - enum bch_rename_mode mode) - { - struct bch_fs *c = trans->c; -- struct btree_iter src_dir_iter = { NULL }; -- struct btree_iter dst_dir_iter = { NULL }; -- struct btree_iter src_inode_iter = { NULL }; -- struct btree_iter dst_inode_iter = { NULL }; -+ struct btree_iter src_dir_iter = {}; -+ struct btree_iter dst_dir_iter = {}; -+ struct btree_iter src_inode_iter = {}; -+ struct btree_iter dst_inode_iter = {}; - struct bch_hash_info src_hash, dst_hash; - subvol_inum src_inum, dst_inum; - u64 src_offset, dst_offset; -@@ -403,8 +412,7 @@ int bch2_rename_trans(struct btree_trans *trans, - - src_hash = bch2_hash_info_init(c, src_dir_u); - -- if (dst_dir.inum != src_dir.inum || -- dst_dir.subvol != src_dir.subvol) { -+ if (!subvol_inum_eq(dst_dir, src_dir)) { - ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_intent); - if (ret) -@@ -417,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans, - } - - ret = bch2_dirent_rename(trans, -- src_dir, &src_hash, -- dst_dir, &dst_hash, -+ src_dir, &src_hash, &src_dir_u->bi_size, -+ dst_dir, &dst_hash, &dst_dir_u->bi_size, - src_name, &src_inum, &src_offset, - dst_name, &dst_inum, &dst_offset, - mode); -@@ -496,32 +504,41 @@ int bch2_rename_trans(struct btree_trans *trans, - } - } - -- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && -- S_ISDIR(src_inode_u->bi_mode)) { -- ret = -EXDEV; -- goto err; -- } -+ if (!subvol_inum_eq(dst_dir, src_dir)) { -+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && -+ S_ISDIR(src_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } - -- if (mode == BCH_RENAME_EXCHANGE && -- bch2_reinherit_attrs(dst_inode_u, src_dir_u) && -- S_ISDIR(dst_inode_u->bi_mode)) { -- ret = -EXDEV; -- goto err; -- } -+ if (mode == BCH_RENAME_EXCHANGE && -+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } - -- if (is_subdir_for_nlink(src_inode_u)) { -- src_dir_u->bi_nlink--; -- dst_dir_u->bi_nlink++; -- } -+ ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: -+ (mode == BCH_RENAME_EXCHANGE -+ ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) -+ : 0); -+ if (ret) -+ goto err; - -- if (S_ISDIR(src_inode_u->bi_mode) && -- !src_inode_u->bi_subvol) -- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; -+ if (is_subdir_for_nlink(src_inode_u)) { -+ src_dir_u->bi_nlink--; -+ dst_dir_u->bi_nlink++; -+ } - -- if (mode == BCH_RENAME_EXCHANGE && -- S_ISDIR(dst_inode_u->bi_mode) && -- !dst_inode_u->bi_subvol) -- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; -+ if (S_ISDIR(src_inode_u->bi_mode) && -+ !src_inode_u->bi_subvol) -+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ S_ISDIR(dst_inode_u->bi_mode) && -+ !dst_inode_u->bi_subvol) -+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; -+ } - - if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { - dst_dir_u->bi_nlink--; -@@ -560,6 +577,8 @@ int bch2_rename_trans(struct btree_trans *trans, - return ret; - } - -+/* inum_to_path */ -+ - static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) - { - bch2_printbuf_make_room(out, n); -@@ -590,31 +609,39 @@ static inline void reverse_bytes(void *b, size_t n) - } - } - --/* XXX: we don't yet attempt to print paths when we don't know the subvol */ --int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) -+static int __bch2_inum_to_path(struct btree_trans *trans, -+ u32 subvol, u64 inum, u32 snapshot, -+ struct printbuf *path) - { - unsigned orig_pos = path->pos; - int ret = 0; - -- while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && -- inum.inum == BCACHEFS_ROOT_INO)) { -+ while (true) { -+ if (!snapshot) { -+ ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); -+ if (ret) -+ goto disconnected; -+ } -+ - struct bch_inode_unpacked inode; -- ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); -+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); - if (ret) - goto disconnected; - -+ if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && -+ inode.bi_inum == BCACHEFS_ROOT_INO) -+ break; -+ - if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = -BCH_ERR_ENOENT_inode_no_backpointer; - goto disconnected; - } - -- inum.subvol = inode.bi_parent_subvol ?: inum.subvol; -- inum.inum = inode.bi_dir; -- -- u32 snapshot; -- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -- if (ret) -- goto disconnected; -+ inum = inode.bi_dir; -+ if (inode.bi_parent_subvol) { -+ subvol = inode.bi_parent_subvol; -+ snapshot = 0; -+ } - - struct btree_iter d_iter; - struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, -@@ -650,3 +677,339 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb - prt_str_reversed(path, "(disconnected)"); - goto out; - } -+ -+int bch2_inum_to_path(struct btree_trans *trans, -+ subvol_inum inum, -+ struct printbuf *path) -+{ -+ return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); -+} -+ -+int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, -+ snapshot_id_list *snapshot_overwrites, -+ struct printbuf *path) -+{ -+ return __bch2_inum_to_path(trans, 0, inum, snapshot, path); -+} -+ -+/* fsck */ -+ -+static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent d, -+ struct bch_inode_unpacked *target, -+ bool in_fsck) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ struct btree_iter bp_iter = {}; -+ int ret = 0; -+ -+ if (inode_points_to_dirent(target, d)) -+ return 0; -+ -+ if (!target->bi_dir && -+ !target->bi_dir_offset) { -+ fsck_err_on(S_ISDIR(target->bi_mode), -+ trans, inode_dir_missing_backpointer, -+ "directory with missing backpointer\n%s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, d.s_c), -+ prt_printf(&buf, "\n"), -+ bch2_inode_unpacked_to_text(&buf, target), -+ buf.buf)); -+ -+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked, -+ trans, inode_unlinked_but_has_dirent, -+ "inode unlinked but has dirent\n%s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, d.s_c), -+ prt_printf(&buf, "\n"), -+ bch2_inode_unpacked_to_text(&buf, target), -+ buf.buf)); -+ -+ target->bi_flags &= ~BCH_INODE_unlinked; -+ target->bi_dir = d.k->p.inode; -+ target->bi_dir_offset = d.k->p.offset; -+ return __bch2_fsck_write_inode(trans, target); -+ } -+ -+ if (bch2_inode_should_have_single_bp(target) && -+ !fsck_err(trans, inode_wrong_backpointer, -+ "dirent points to inode that does not point back:\n%s", -+ (bch2_bkey_val_to_text(&buf, c, d.s_c), -+ prt_newline(&buf), -+ bch2_inode_unpacked_to_text(&buf, target), -+ buf.buf))) -+ goto err; -+ -+ struct bkey_s_c_dirent bp_dirent = -+ bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, -+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), -+ 0, dirent); -+ ret = bkey_err(bp_dirent); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ bool backpointer_exists = !ret; -+ ret = 0; -+ -+ if (!backpointer_exists) { -+ if (fsck_err(trans, inode_wrong_backpointer, -+ "inode %llu:%u has wrong backpointer:\n" -+ "got %llu:%llu\n" -+ "should be %llu:%llu", -+ target->bi_inum, target->bi_snapshot, -+ target->bi_dir, -+ target->bi_dir_offset, -+ d.k->p.inode, -+ d.k->p.offset)) { -+ target->bi_dir = d.k->p.inode; -+ target->bi_dir_offset = d.k->p.offset; -+ ret = __bch2_fsck_write_inode(trans, target); -+ } -+ } else { -+ bch2_bkey_val_to_text(&buf, c, d.s_c); -+ prt_newline(&buf); -+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); -+ -+ if (S_ISDIR(target->bi_mode) || target->bi_subvol) { -+ /* -+ * XXX: verify connectivity of the other dirent -+ * up to the root before removing this one -+ * -+ * Additionally, bch2_lookup would need to cope with the -+ * dirent it found being removed - or should we remove -+ * the other one, even though the inode points to it? -+ */ -+ if (in_fsck) { -+ if (fsck_err(trans, inode_dir_multiple_links, -+ "%s %llu:%u with multiple links\n%s", -+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume", -+ target->bi_inum, target->bi_snapshot, buf.buf)) -+ ret = bch2_fsck_remove_dirent(trans, d.k->p); -+ } else { -+ bch2_fs_inconsistent(c, -+ "%s %llu:%u with multiple links\n%s", -+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume", -+ target->bi_inum, target->bi_snapshot, buf.buf); -+ } -+ -+ goto out; -+ } else { -+ /* -+ * hardlinked file with nlink 0: -+ * We're just adjusting nlink here so check_nlinks() will pick -+ * it up, it ignores inodes with nlink 0 -+ */ -+ if (fsck_err_on(!target->bi_nlink, -+ trans, inode_multiple_links_but_nlink_0, -+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", -+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { -+ target->bi_nlink++; -+ target->bi_flags &= ~BCH_INODE_unlinked; -+ ret = __bch2_fsck_write_inode(trans, target); -+ if (ret) -+ goto err; -+ } -+ } -+ } -+out: -+err: -+fsck_err: -+ bch2_trans_iter_exit(trans, &bp_iter); -+ printbuf_exit(&buf); -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+int __bch2_check_dirent_target(struct btree_trans *trans, -+ struct btree_iter *dirent_iter, -+ struct bkey_s_c_dirent d, -+ struct bch_inode_unpacked *target, -+ bool in_fsck) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); -+ if (ret) -+ goto err; -+ -+ if (fsck_err_on(d.v->d_type != inode_d_type(target), -+ trans, dirent_d_type_wrong, -+ "incorrect d_type: got %s, should be %s:\n%s", -+ bch2_d_type_str(d.v->d_type), -+ bch2_d_type_str(inode_d_type(target)), -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { -+ struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ bkey_reassemble(&n->k_i, d.s_c); -+ n->v.d_type = inode_d_type(target); -+ if (n->v.d_type == DT_SUBVOL) { -+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); -+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); -+ } else { -+ n->v.d_inum = cpu_to_le64(target->bi_inum); -+ } -+ -+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); -+ if (ret) -+ goto err; -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * BCH_INODE_has_case_insensitive: -+ * We have to track whether directories have any descendent directory that is -+ * casefolded - for overlayfs: -+ */ -+ -+static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) -+{ -+ struct btree_iter iter = {}; -+ int ret = 0; -+ -+ while (true) { -+ struct bch_inode_unpacked inode; -+ ret = bch2_inode_peek(trans, &iter, &inode, inum, -+ BTREE_ITER_intent|BTREE_ITER_with_updates); -+ if (ret) -+ break; -+ -+ if (inode.bi_flags & BCH_INODE_has_case_insensitive) -+ break; -+ -+ inode.bi_flags |= BCH_INODE_has_case_insensitive; -+ ret = bch2_inode_write(trans, &iter, &inode); -+ if (ret) -+ break; -+ -+ bch2_trans_iter_exit(trans, &iter); -+ if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) -+ break; -+ -+ inum = parent_inum(inum, &inode); -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *inode) -+{ -+ if (!bch2_inode_casefold(trans->c, inode)) -+ return 0; -+ -+ inode->bi_flags |= BCH_INODE_has_case_insensitive; -+ -+ return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); -+} -+ -+int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ snapshot_id_list *snapshot_overwrites, -+ bool *do_update) -+{ -+ struct printbuf buf = PRINTBUF; -+ bool repairing_parents = false; -+ int ret = 0; -+ -+ if (!S_ISDIR(inode->bi_mode)) { -+ /* -+ * Old versions set bi_casefold for non dirs, but that's -+ * unnecessary and wasteful -+ */ -+ if (inode->bi_casefold) { -+ inode->bi_casefold = 0; -+ *do_update = true; -+ } -+ return 0; -+ } -+ -+ if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) -+ return 0; -+ -+ if (bch2_inode_casefold(trans->c, inode) && -+ !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { -+ prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", -+ inode->bi_inum, inode->bi_snapshot); -+ -+ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, -+ snapshot_overwrites, &buf); -+ if (ret) -+ goto err; -+ -+ if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { -+ inode->bi_flags |= BCH_INODE_has_case_insensitive; -+ *do_update = true; -+ } -+ } -+ -+ if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) -+ goto out; -+ -+ struct bch_inode_unpacked dir = *inode; -+ u32 snapshot = dir.bi_snapshot; -+ -+ while (!(dir.bi_inum == BCACHEFS_ROOT_INO && -+ dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { -+ if (dir.bi_parent_subvol) { -+ ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ snapshot_overwrites = NULL; -+ } -+ -+ ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); -+ if (ret) -+ goto err; -+ -+ if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { -+ prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); -+ -+ ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, -+ snapshot_overwrites, &buf); -+ if (ret) -+ goto err; -+ -+ if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { -+ dir.bi_flags |= BCH_INODE_has_case_insensitive; -+ ret = __bch2_fsck_write_inode(trans, &dir); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * We only need to check the first parent, unless we find an -+ * inconsistency -+ */ -+ if (!repairing_parents) -+ break; -+ } -+out: -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ if (ret) -+ return ret; -+ -+ if (repairing_parents) { -+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_nested; -+ } -+ -+ return 0; -+} -diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h -similarity index 52% -rename from fs/bcachefs/fs-common.h -rename to fs/bcachefs/namei.h -index 2b59210bb5e8..ae6ebc2d0785 100644 ---- a/fs/bcachefs/fs-common.h -+++ b/fs/bcachefs/namei.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _BCACHEFS_FS_COMMON_H --#define _BCACHEFS_FS_COMMON_H -+#ifndef _BCACHEFS_NAMEI_H -+#define _BCACHEFS_NAMEI_H - - #include "dirent.h" - -@@ -43,5 +43,37 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, - struct bch_inode_unpacked *); - - int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -+int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, -+ snapshot_id_list *, struct printbuf *); - --#endif /* _BCACHEFS_FS_COMMON_H */ -+int __bch2_check_dirent_target(struct btree_trans *, -+ struct btree_iter *, -+ struct bkey_s_c_dirent, -+ struct bch_inode_unpacked *, bool); -+ -+static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, -+ struct bkey_s_c_dirent d) -+{ -+ return inode->bi_dir == d.k->p.inode && -+ inode->bi_dir_offset == d.k->p.offset; -+} -+ -+static inline int bch2_check_dirent_target(struct btree_trans *trans, -+ struct btree_iter *dirent_iter, -+ struct bkey_s_c_dirent d, -+ struct bch_inode_unpacked *target, -+ bool in_fsck) -+{ -+ if (likely(inode_points_to_dirent(target, d) && -+ d.v->d_type == inode_d_type(target))) -+ return 0; -+ -+ return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); -+} -+ -+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *); -+int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, -+ snapshot_id_list *, bool *); -+ -+#endif /* _BCACHEFS_NAMEI_H */ -diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c -index 3c21981a4a1c..962218fa68ec 100644 ---- a/fs/bcachefs/nocow_locking.c -+++ b/fs/bcachefs/nocow_locking.c -@@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) - BUG_ON(atomic_read(&l->l[j])); - } - --int bch2_fs_nocow_locking_init(struct bch_fs *c) -+void bch2_fs_nocow_locking_init_early(struct bch_fs *c) - { - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - spin_lock_init(&l->lock); -- -- return 0; - } -diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h -index f9d6a426a960..48b8a003c0d2 100644 ---- a/fs/bcachefs/nocow_locking.h -+++ b/fs/bcachefs/nocow_locking.h -@@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, - void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); - - void bch2_fs_nocow_locking_exit(struct bch_fs *); --int bch2_fs_nocow_locking_init(struct bch_fs *); -+void bch2_fs_nocow_locking_init_early(struct bch_fs *); - - #endif /* _BCACHEFS_NOCOW_LOCKING_H */ -diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c -index 6772faf385a5..b1cf88905b81 100644 ---- a/fs/bcachefs/opts.c -+++ b/fs/bcachefs/opts.c -@@ -7,7 +7,9 @@ - #include "compress.h" - #include "disk_groups.h" - #include "error.h" -+#include "movinggc.h" - #include "opts.h" -+#include "rebalance.h" - #include "recovery_passes.h" - #include "super-io.h" - #include "util.h" -@@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = { - NULL - }; - -+const char * const bch2_degraded_actions[] = { -+ BCH_DEGRADED_ACTIONS() -+ NULL -+}; -+ - const char * const bch2_fsck_fix_opts[] = { - BCH_FIX_ERRORS_OPTS() - NULL -@@ -44,7 +51,7 @@ const char * const __bch2_btree_ids[] = { - NULL - }; - --static const char * const __bch2_csum_types[] = { -+const char * const __bch2_csum_types[] = { - BCH_CSUM_TYPES() - NULL - }; -@@ -163,16 +170,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = { - [DT_SUBVOL] = "subvol", - }; - --u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) --{ -- BUG(); --} -- --void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) --{ -- BUG(); --} -- - void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) - { - #define x(_name, ...) \ -@@ -223,6 +220,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) - } - } - -+/* dummy option, for options that aren't stored in the superblock */ -+typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); -+typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); -+typedef u64 (*member_opt_get_fn)(const struct bch_member *); -+typedef void (*member_opt_set_fn)(struct bch_member *, u64); -+ -+__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; -+__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; -+__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; -+__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; -+ -+#define type_compatible_or_null(_p, _type) \ -+ __builtin_choose_expr( \ -+ __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) -+ - const struct bch_option bch2_opt_table[] = { - #define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 - #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ -@@ -239,15 +251,15 @@ const struct bch_option bch2_opt_table[] = { - - #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ - [Opt_##_name] = { \ -- .attr = { \ -- .name = #_name, \ -- .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ -- }, \ -- .flags = _flags, \ -- .hint = _hint, \ -- .help = _help, \ -- .get_sb = _sb_opt, \ -- .set_sb = SET_##_sb_opt, \ -+ .attr.name = #_name, \ -+ .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ -+ .flags = _flags, \ -+ .hint = _hint, \ -+ .help = _help, \ -+ .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ -+ .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ -+ .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ -+ .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ - _type \ - }, - -@@ -268,20 +280,20 @@ int bch2_opt_lookup(const char *name) - return -1; - } - --struct synonym { -+struct opt_synonym { - const char *s1, *s2; - }; - --static const struct synonym bch_opt_synonyms[] = { -+static const struct opt_synonym bch2_opt_synonyms[] = { - { "quota", "usrquota" }, - }; - - static int bch2_mount_opt_lookup(const char *name) - { -- const struct synonym *i; -+ const struct opt_synonym *i; - -- for (i = bch_opt_synonyms; -- i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); -+ for (i = bch2_opt_synonyms; -+ i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); - i++) - if (!strcmp(name, i->s1)) - name = i->s2; -@@ -289,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name) - return bch2_opt_lookup(name); - } - -+struct opt_val_synonym { -+ const char *opt, *v1, *v2; -+}; -+ -+static const struct opt_val_synonym bch2_opt_val_synonyms[] = { -+ { "degraded", "true", "yes" }, -+ { "degraded", "false", "no" }, -+ { "degraded", "1", "yes" }, -+ { "degraded", "0", "no" }, -+}; -+ -+static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) -+{ -+ const struct opt_val_synonym *i; -+ -+ for (i = bch2_opt_val_synonyms; -+ i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); -+ i++) -+ if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) -+ return i->v2; -+ -+ return val; -+} -+ - int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) - { - if (v < opt->min) { -@@ -332,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c, - { - ssize_t ret; - -+ if (err) -+ printbuf_indent_add_nextline(err, 2); -+ - switch (opt->type) { - case BCH_OPT_BOOL: -- if (val) { -- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); -- if (ret != -BCH_ERR_option_not_bool) { -- *res = ret; -- } else { -- if (err) -- prt_printf(err, "%s: must be bool", opt->attr.name); -- return ret; -- } -+ if (!val) -+ val = "1"; -+ -+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); -+ if (ret != -BCH_ERR_option_not_bool) { -+ *res = ret; - } else { -- *res = 1; -+ if (err) -+ prt_printf(err, "%s: must be bool", opt->attr.name); -+ return ret; - } -- - break; - case BCH_OPT_UINT: - if (!val) { -@@ -355,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c, - return -EINVAL; - } - -- ret = opt->flags & OPT_HUMAN_READABLE -- ? bch2_strtou64_h(val, res) -- : kstrtou64(val, 10, res); -+ if (*val != '-') { -+ ret = opt->flags & OPT_HUMAN_READABLE -+ ? bch2_strtou64_h(val, res) -+ : kstrtou64(val, 10, res); -+ } else { -+ prt_printf(err, "%s: must be a non-negative number", opt->attr.name); -+ return -BCH_ERR_option_negative; -+ } -+ - if (ret < 0) { - if (err) - prt_printf(err, "%s: must be a number", -@@ -475,11 +518,16 @@ void bch2_opts_to_text(struct printbuf *out, - } - } - --int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) -+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) - { - int ret = 0; - - switch (id) { -+ case Opt_state: -+ if (ca) -+ return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); -+ break; -+ - case Opt_compression: - case Opt_background_compression: - ret = bch2_check_set_has_compressed_data(c, v); -@@ -488,19 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) - if (v) - bch2_check_set_feature(c, BCH_FEATURE_ec); - break; -+ default: -+ break; - } - - return ret; - } - --int bch2_opts_check_may_set(struct bch_fs *c) -+int bch2_opts_hooks_pre_set(struct bch_fs *c) - { -- unsigned i; -- int ret; -- -- for (i = 0; i < bch2_opts_nr; i++) { -- ret = bch2_opt_check_may_set(c, i, -- bch2_opt_get_by_id(&c->opts, i)); -+ for (unsigned i = 0; i < bch2_opts_nr; i++) { -+ int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); - if (ret) - return ret; - } -@@ -508,6 +554,61 @@ int bch2_opts_check_may_set(struct bch_fs *c) - return 0; - } - -+void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, -+ struct bch_opts *new_opts, enum bch_opt_id id) -+{ -+ switch (id) { -+ case Opt_foreground_target: -+ if (new_opts->foreground_target && -+ !new_opts->background_target) -+ bch2_set_rebalance_needs_scan(c, inum); -+ break; -+ case Opt_compression: -+ if (new_opts->compression && -+ !new_opts->background_compression) -+ bch2_set_rebalance_needs_scan(c, inum); -+ break; -+ case Opt_background_target: -+ if (new_opts->background_target) -+ bch2_set_rebalance_needs_scan(c, inum); -+ break; -+ case Opt_background_compression: -+ if (new_opts->background_compression) -+ bch2_set_rebalance_needs_scan(c, inum); -+ break; -+ case Opt_rebalance_enabled: -+ bch2_rebalance_wakeup(c); -+ break; -+ case Opt_copygc_enabled: -+ bch2_copygc_wakeup(c); -+ break; -+ case Opt_discard: -+ if (!ca) { -+ mutex_lock(&c->sb_lock); -+ for_each_member_device(c, ca) { -+ struct bch_member *m = -+ bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); -+ SET_BCH_MEMBER_DISCARD(m, c->opts.discard); -+ } -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ break; -+ case Opt_version_upgrade: -+ /* -+ * XXX: in the future we'll likely want to do compatible -+ * upgrades at runtime as well, but right now there's nothing -+ * that does that: -+ */ -+ if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) -+ bch2_sb_upgrade_incompat(c); -+ break; -+ default: -+ break; -+ } -+} -+ - int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, - const char *name, const char *val) -@@ -530,6 +631,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - if (id < 0) - return 0; - -+ /* must have a value for synonym lookup - but OPT_FN is weird */ -+ if (!val && bch2_opt_table[id].type != BCH_OPT_FN) -+ val = "1"; -+ -+ val = bch2_opt_val_synonym_lookup(name, val); -+ - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - -@@ -543,14 +650,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); -- if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { -- prt_printf(parse_later, "%s=%s,", name, val); -- if (parse_later->allocation_failure) { -- ret = -ENOMEM; -- goto out; -+ if (ret == -BCH_ERR_option_needs_open_fs) { -+ ret = 0; -+ -+ if (parse_later) { -+ prt_printf(parse_later, "%s=%s,", name, val); -+ if (parse_later->allocation_failure) -+ ret = -ENOMEM; - } - -- ret = 0; - goto out; - } - -@@ -561,28 +669,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - bch2_opt_set_by_id(opts, id, v); - - ret = 0; -- goto out; -- -+out: -+ printbuf_exit(&err); -+ return ret; - bad_opt: -- pr_err("Bad mount option %s", name); - ret = -BCH_ERR_option_name; - goto out; -- - bad_val: -- pr_err("Invalid mount option %s", err.buf); - ret = -BCH_ERR_option_value; -- --out: -- printbuf_exit(&err); -- return ret; -+ goto out; - } - - int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, -- struct printbuf *parse_later, char *options) -+ struct printbuf *parse_later, char *options, -+ bool ignore_unknown) - { - char *copied_opts, *copied_opts_start; - char *opt, *name, *val; -- int ret; -+ int ret = 0; - - if (!options) - return 0; -@@ -607,24 +711,37 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - val = opt; - - ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); -- if (ret < 0) -- goto out; -+ if (ret == -BCH_ERR_option_name && ignore_unknown) -+ ret = 0; -+ if (ret) { -+ pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); -+ break; -+ } - } - -- ret = 0; -- goto out; -- --out: - kfree(copied_opts_start); - return ret; - } - --u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) -+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) - { - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - -- v = opt->get_sb(sb); -+ if (dev_idx < 0) { -+ v = opt->get_sb(sb); -+ } else { -+ if (WARN(!bch2_member_exists(sb, dev_idx), -+ "tried to set device option %s on nonexistent device %i", -+ opt->attr.name, dev_idx)) -+ return 0; -+ -+ struct bch_member m = bch2_sb_member_get(sb, dev_idx); -+ v = opt->get_member(&m); -+ } -+ -+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS) -+ --v; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; -@@ -641,34 +758,20 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) - */ - int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) - { -- unsigned id; -- -- for (id = 0; id < bch2_opts_nr; id++) { -+ for (unsigned id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - -- if (opt->get_sb == BCH2_NO_SB_OPT) -- continue; -- -- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); -+ if (opt->get_sb) -+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); - } - - return 0; - } - --struct bch_dev_sb_opt_set { -- void (*set_sb)(struct bch_member *, u64); --}; -- --static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { --#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, -- BCH_DEV_OPT_SETTERS() --#undef x --}; -- --void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, -+bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, - const struct bch_option *opt, u64 v) - { -- enum bch_opt_id id = opt - bch2_opt_table; -+ bool changed = false; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v >>= 9; -@@ -679,34 +782,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - v++; - -- if (opt->flags & OPT_FS) { -- if (opt->set_sb != SET_BCH2_NO_SB_OPT) -- opt->set_sb(sb, v); -+ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { -+ changed = v != opt->get_sb(sb); -+ -+ opt->set_sb(sb, v); - } - -- if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { -+ if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) -- return; -+ return false; - - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); -- -- const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; -- if (set->set_sb) -- set->set_sb(m, v); -- else -- pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); -+ changed = v != opt->get_member(m); -+ opt->set_member(m, v); - } -+ -+ return changed; - } - --void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, -+bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, - const struct bch_option *opt, u64 v) - { - mutex_lock(&c->sb_lock); -- __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); -- bch2_write_super(c); -+ bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); -+ if (changed) -+ bch2_write_super(c); - mutex_unlock(&c->sb_lock); -+ return changed; - } - - /* io opts: */ -diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -index 9d397fc2a1f0..2a02606254b3 100644 ---- a/fs/bcachefs/opts.h -+++ b/fs/bcachefs/opts.h -@@ -11,11 +11,13 @@ - struct bch_fs; - - extern const char * const bch2_error_actions[]; -+extern const char * const bch2_degraded_actions[]; - extern const char * const bch2_fsck_fix_opts[]; - extern const char * const bch2_version_upgrade_opts[]; - extern const char * const bch2_sb_features[]; - extern const char * const bch2_sb_compat[]; - extern const char * const __bch2_btree_ids[]; -+extern const char * const __bch2_csum_types[]; - extern const char * const __bch2_csum_opts[]; - extern const char * const __bch2_compression_types[]; - extern const char * const bch2_compression_opts[]; -@@ -50,10 +52,6 @@ static inline const char *bch2_d_type_str(unsigned d_type) - * apply the options from that struct that are defined. - */ - --/* dummy option, for options that aren't stored in the superblock */ --u64 BCH2_NO_SB_OPT(const struct bch_sb *); --void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); -- - /* When can be set: */ - enum opt_flags { - OPT_FS = BIT(0), /* Filesystem option */ -@@ -132,19 +130,24 @@ enum fsck_err_opts { - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 16), \ -- BCH_SB_BLOCK_SIZE, 8, \ -+ BCH_SB_BLOCK_SIZE, 4 << 10, \ - "size", NULL) \ - x(btree_node_size, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 20), \ -- BCH_SB_BTREE_NODE_SIZE, 512, \ -+ BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ - "size", "Btree node size, default 256k") \ - x(errors, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ - NULL, "Action to take on filesystem error") \ -+ x(write_error_timeout, u16, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, 300), \ -+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ -+ NULL, "Number of consecutive write errors allowed before kicking out a device")\ - x(metadata_replicas, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ -@@ -181,6 +184,11 @@ enum fsck_err_opts { - OPT_STR(__bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ -+ x(checksum_err_retry_nr, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(0, 32), \ -+ BCH_SB_CSUM_ERR_RETRY_NR, 3, \ -+ NULL, NULL) \ - x(compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ -@@ -197,7 +205,7 @@ enum fsck_err_opts { - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ - NULL, "Hash function for directory entries and xattrs")\ - x(metadata_target, u16, \ -- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_METADATA_TARGET, 0, \ - "(target)", "Device or label for metadata writes") \ -@@ -221,6 +229,11 @@ enum fsck_err_opts { - OPT_BOOL(), \ - BCH_SB_ERASURE_CODE, false, \ - NULL, "Enable erasure coding (DO NOT USE YET)") \ -+ x(casefold, u8, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT, \ -+ OPT_BOOL(), \ -+ BCH_SB_CASEFOLD, false, \ -+ NULL, "Dirent lookups are casefolded") \ - x(inodes_32bit, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ -@@ -295,24 +308,14 @@ enum fsck_err_opts { - NULL, "Enable project quotas") \ - x(degraded, u8, \ - OPT_FS|OPT_MOUNT, \ -- OPT_BOOL(), \ -- BCH2_NO_SB_OPT, false, \ -+ OPT_STR(bch2_degraded_actions), \ -+ BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ - NULL, "Allow mounting in degraded mode") \ -- x(very_degraded, u8, \ -- OPT_FS|OPT_MOUNT, \ -- OPT_BOOL(), \ -- BCH2_NO_SB_OPT, false, \ -- NULL, "Allow mounting in when data will be missing") \ - x(no_splitbrain_check, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't kick drives out when splitbrain detected")\ -- x(discard, u8, \ -- OPT_FS|OPT_MOUNT|OPT_DEVICE, \ -- OPT_BOOL(), \ -- BCH2_NO_SB_OPT, true, \ -- NULL, "Enable discard/TRIM support") \ - x(verbose, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ -@@ -447,7 +450,7 @@ enum fsck_err_opts { - BCH2_NO_SB_OPT, false, \ - NULL, "Reconstruct alloc btree") \ - x(version_upgrade, u8, \ -- OPT_FS|OPT_MOUNT, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_version_upgrade_opts), \ - BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ - NULL, "Set superblock to latest version,\n" \ -@@ -487,45 +490,56 @@ enum fsck_err_opts { - BCH2_NO_SB_OPT, true, \ - NULL, "Enable rebalance: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ -+ x(rebalance_on_ac_only, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_REBALANCE_AC_ONLY, false, \ -+ NULL, "Enable rebalance while on mains power only\n") \ -+ x(auto_snapshot_deletion, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ -+ "quiet the system when doing performance testing\n")\ - x(no_data_io, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Skip submit_bio() for data reads and writes, " \ - "for performance testing purposes") \ -- x(fs_size, u64, \ -- OPT_DEVICE, \ -+ x(state, u64, \ -+ OPT_DEVICE|OPT_RUNTIME, \ -+ OPT_STR(bch2_member_states), \ -+ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ -+ "state", "rw,ro,failed,spare") \ -+ x(bucket_size, u32, \ -+ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, S64_MAX), \ -- BCH2_NO_SB_OPT, 0, \ -- "size", "Size of filesystem on device") \ -- x(bucket, u32, \ -- OPT_DEVICE, \ -- OPT_UINT(0, S64_MAX), \ -- BCH2_NO_SB_OPT, 0, \ -+ BCH_MEMBER_BUCKET_SIZE, 0, \ - "size", "Specifies the bucket size; must be greater than the btree node size")\ - x(durability, u8, \ -- OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ -+ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ - OPT_UINT(0, BCH_REPLICAS_MAX), \ -- BCH2_NO_SB_OPT, 1, \ -+ BCH_MEMBER_DURABILITY, 1, \ - "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(data_allowed, u8, \ - OPT_DEVICE, \ - OPT_BITFIELD(__bch2_data_types), \ -- BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ -+ BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ - "types", "Allowed data types for this device: journal, btree, and/or user")\ -+ x(discard, u8, \ -+ OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_MEMBER_DISCARD, true, \ -+ NULL, "Enable discard/TRIM support") \ - x(btree_node_prefetch, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ -- NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ -+ NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ - " prefetched sequentially") - --#define BCH_DEV_OPT_SETTERS() \ -- x(discard, BCH_MEMBER_DISCARD) \ -- x(durability, BCH_MEMBER_DURABILITY) \ -- x(data_allowed, BCH_MEMBER_DATA_ALLOWED) -- - struct bch_opts { - #define x(_name, _bits, ...) unsigned _name##_defined:1; - BCH_OPTS() -@@ -582,8 +596,6 @@ struct printbuf; - - struct bch_option { - struct attribute attr; -- u64 (*get_sb)(const struct bch_sb *); -- void (*set_sb)(struct bch_sb *, u64); - enum opt_type type; - enum opt_flags flags; - u64 min, max; -@@ -595,6 +607,12 @@ struct bch_option { - const char *hint; - const char *help; - -+ u64 (*get_sb)(const struct bch_sb *); -+ void (*set_sb)(struct bch_sb *, u64); -+ -+ u64 (*get_member)(const struct bch_member *); -+ void (*set_member)(struct bch_member *, u64); -+ - }; - - extern const struct bch_option bch2_opt_table[]; -@@ -603,12 +621,12 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); - u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); - void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); - --u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); -+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); - int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); --void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); -+bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); - - struct bch_dev; --void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); -+bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); - - int bch2_opt_lookup(const char *); - int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); -@@ -625,12 +643,15 @@ void bch2_opts_to_text(struct printbuf *, - struct bch_fs *, struct bch_sb *, - unsigned, unsigned, unsigned); - --int bch2_opt_check_may_set(struct bch_fs *, int, u64); --int bch2_opts_check_may_set(struct bch_fs *); -+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); -+int bch2_opts_hooks_pre_set(struct bch_fs *); -+void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, -+ struct bch_opts *, enum bch_opt_id); -+ - int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, - struct printbuf *, const char *, const char *); - int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, -- char *); -+ char *, bool); - - /* inode opts: */ - -diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c -index 4cf5a2af1e6f..3302bbc78a09 100644 ---- a/fs/bcachefs/printbuf.c -+++ b/fs/bcachefs/printbuf.c -@@ -276,6 +276,25 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) - buf->has_indent_or_tabstops = true; - } - -+/** -+ * bch2_printbuf_indent_add_nextline() - add to the current indent level for -+ * subsequent lines -+ * -+ * @buf: printbuf to control -+ * @spaces: number of spaces to add to the current indent level -+ * -+ * Subsequent lines - not the current line - will be indented by @spaces more -+ * spaces. -+ */ -+void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces) -+{ -+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) -+ spaces = 0; -+ -+ buf->indent += spaces; -+ buf->has_indent_or_tabstops = true; -+} -+ - /** - * bch2_printbuf_indent_sub() - subtract from the current indent level - * -diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h -index d0dd398baa2b..1ca476adbf6f 100644 ---- a/fs/bcachefs/printbuf.h -+++ b/fs/bcachefs/printbuf.h -@@ -112,6 +112,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *); - int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); - - void bch2_printbuf_indent_add(struct printbuf *, unsigned); -+void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned); - void bch2_printbuf_indent_sub(struct printbuf *, unsigned); - - void bch2_prt_newline(struct printbuf *); -diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c -new file mode 100644 -index 000000000000..d09898566abe ---- /dev/null -+++ b/fs/bcachefs/progress.c -@@ -0,0 +1,61 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bbpos.h" -+#include "disk_accounting.h" -+#include "progress.h" -+ -+void bch2_progress_init(struct progress_indicator_state *s, -+ struct bch_fs *c, -+ u64 btree_id_mask) -+{ -+ memset(s, 0, sizeof(*s)); -+ -+ s->next_print = jiffies + HZ * 10; -+ -+ for (unsigned i = 0; i < BTREE_ID_NR; i++) { -+ if (!(btree_id_mask & BIT_ULL(i))) -+ continue; -+ -+ struct disk_accounting_pos acc; -+ disk_accounting_key_init(acc, btree, .id = i); -+ -+ u64 v; -+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); -+ s->nodes_total += div64_ul(v, btree_sectors(c)); -+ } -+} -+ -+static inline bool progress_update_p(struct progress_indicator_state *s) -+{ -+ bool ret = time_after_eq(jiffies, s->next_print); -+ -+ if (ret) -+ s->next_print = jiffies + HZ * 10; -+ return ret; -+} -+ -+void bch2_progress_update_iter(struct btree_trans *trans, -+ struct progress_indicator_state *s, -+ struct btree_iter *iter, -+ const char *msg) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = path_l(btree_iter_path(trans, iter))->b; -+ -+ s->nodes_seen += b != s->last_node; -+ s->last_node = b; -+ -+ if (progress_update_p(s)) { -+ struct printbuf buf = PRINTBUF; -+ unsigned percent = s->nodes_total -+ ? div64_u64(s->nodes_seen * 100, s->nodes_total) -+ : 0; -+ -+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", -+ msg, percent, s->nodes_seen, s->nodes_total); -+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); -+ -+ bch_info(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+} -diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h -new file mode 100644 -index 000000000000..23fb1811f943 ---- /dev/null -+++ b/fs/bcachefs/progress.h -@@ -0,0 +1,29 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_PROGRESS_H -+#define _BCACHEFS_PROGRESS_H -+ -+/* -+ * Lame progress indicators -+ * -+ * We don't like to use these because they print to the dmesg console, which is -+ * spammy - we much prefer to be wired up to a userspace programm (e.g. via -+ * thread_with_file) and have it print the progress indicator. -+ * -+ * But some code is old and doesn't support that, or runs in a context where -+ * that's not yet practical (mount). -+ */ -+ -+struct progress_indicator_state { -+ unsigned long next_print; -+ u64 nodes_seen; -+ u64 nodes_total; -+ struct btree *last_node; -+}; -+ -+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); -+void bch2_progress_update_iter(struct btree_trans *, -+ struct progress_indicator_state *, -+ struct btree_iter *, -+ const char *); -+ -+#endif /* _BCACHEFS_PROGRESS_H */ -diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c -index 8b857fc33244..3d4755d73af7 100644 ---- a/fs/bcachefs/quota.c -+++ b/fs/bcachefs/quota.c -@@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); - advance: -- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); -+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); - return 0; - } - -diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c -index bef2aa1b8bcd..2cf3d55d0bbc 100644 ---- a/fs/bcachefs/rcu_pending.c -+++ b/fs/bcachefs/rcu_pending.c -@@ -1,6 +1,7 @@ - // SPDX-License-Identifier: GPL-2.0 - #define pr_fmt(fmt) "%s() " fmt "\n", __func__ - -+#include - #include - #include - #include -@@ -9,8 +10,6 @@ - #include - - #include "rcu_pending.h" --#include "darray.h" --#include "util.h" - - #define static_array_for_each(_a, _i) \ - for (typeof(&(_a)[0]) _i = _a; \ -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -index d0a1f5cd5c2b..de1ec9e0caa0 100644 ---- a/fs/bcachefs/rebalance.c -+++ b/fs/bcachefs/rebalance.c -@@ -26,9 +26,8 @@ - - /* bch_extent_rebalance: */ - --static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) - { -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) -@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s - return NULL; - } - -+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -+{ -+ return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); -+} -+ - static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k, -@@ -76,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - -+ rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } -+ rcu_read_unlock(); - - return rewrite_ptrs; - } -@@ -91,17 +97,24 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - -+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) -+ return 0; -+ - return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | - bch2_bkey_ptrs_need_move(c, opts, ptrs); - } - - u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) - { -- const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ -+ const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!opts) - return 0; - -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) -+ return 0; -+ - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; -@@ -121,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) - } - } - incompressible: -- if (opts->background_target) -+ if (opts->background_target) { -+ rcu_read_lock(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) -+ if (!p.ptr.cached && -+ !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) - sectors += p.crc.compressed_size; -+ rcu_read_unlock(); -+ } - - return sectors; - } -@@ -228,7 +245,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -257,7 +274,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_set_rebalance_needs_scan_trans(trans, inum)); -- rebalance_wakeup(c); -+ bch2_rebalance_wakeup(c); - return ret; - } - -@@ -276,7 +293,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); -- k = bch2_btree_iter_peek_slot(&iter); -+ k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -296,7 +313,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, - struct btree_iter *work_iter) - { - return !kthread_should_stop() -- ? bch2_btree_iter_peek(work_iter) -+ ? bch2_btree_iter_peek(trans, work_iter) - : bkey_s_c_null; - } - -@@ -304,7 +321,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) - { -- if (!bch2_bkey_rebalance_opts(k)) -+ if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); -@@ -330,7 +347,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, - work_pos, - BTREE_ITER_all_snapshots); -- struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); - if (bkey_err(k)) - return k; - -@@ -341,7 +358,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; -- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -+ data_opts->write_flags |= BCH_WRITE_only_specified_devs; - - if (!data_opts->rewrite_ptrs) { - /* -@@ -442,22 +459,11 @@ static int do_rebalance_extent(struct moving_context *ctxt, - return ret; - } - --static bool rebalance_pred(struct bch_fs *c, void *arg, -- struct bkey_s_c k, -- struct bch_io_opts *io_opts, -- struct data_update_opts *data_opts) --{ -- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); -- data_opts->target = io_opts->background_target; -- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -- return data_opts->rewrite_ptrs != 0; --} -- - static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) - { - struct btree_trans *trans = ctxt->trans; -+ struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; -- int ret; - - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - ctxt->stats = &r->scan_stats; -@@ -472,11 +478,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) - - r->state = BCH_REBALANCE_scanning; - -- ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: -- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_clear_rebalance_needs_scan(trans, inum, cookie)); -+ struct per_snapshot_io_opts snapshot_io_opts; -+ per_snapshot_io_opts_init(&snapshot_io_opts, c); -+ -+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, -+ r->scan_start.pos, r->scan_end.pos, -+ BTREE_ITER_all_snapshots| -+ BTREE_ITER_not_extents| -+ BTREE_ITER_prefetch, k, ({ -+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - -+ struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, -+ &snapshot_io_opts, iter.pos, &iter, k); -+ PTR_ERR_OR_ZERO(io_opts); -+ })) ?: -+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -+ bch2_clear_rebalance_needs_scan(trans, inum, cookie)); -+ -+ per_snapshot_io_opts_exit(&snapshot_io_opts); - bch2_move_stats_exit(&r->scan_stats, trans->c); -+ -+ /* -+ * Ensure that the rebalance_work entries we created are seen by the -+ * next iteration of do_rebalance(), so we don't end up stuck in -+ * rebalance_wait(): -+ */ -+ atomic64_inc(&r->scan_stats.sectors_seen); -+ bch2_btree_write_buffer_flush_sync(trans); -+ - return ret; - } - -@@ -501,12 +530,19 @@ static void rebalance_wait(struct bch_fs *c) - bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); - } - -+static bool bch2_rebalance_enabled(struct bch_fs *c) -+{ -+ return c->opts.rebalance_enabled && -+ !(c->opts.rebalance_on_ac_only && -+ c->rebalance.on_battery); -+} -+ - static int do_rebalance(struct moving_context *ctxt) - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &c->rebalance; -- struct btree_iter rebalance_work_iter, extent_iter = { NULL }; -+ struct btree_iter rebalance_work_iter, extent_iter = {}; - struct bkey_s_c k; - int ret = 0; - -@@ -520,9 +556,9 @@ static int do_rebalance(struct moving_context *ctxt) - BTREE_ITER_all_snapshots); - - while (!bch2_move_ratelimit(ctxt)) { -- if (!c->opts.rebalance_enabled) { -+ if (!bch2_rebalance_enabled(c)) { - bch2_moving_ctxt_flush_all(ctxt); -- kthread_wait_freezable(c->opts.rebalance_enabled || -+ kthread_wait_freezable(bch2_rebalance_enabled(c) || - kthread_should_stop()); - } - -@@ -547,7 +583,7 @@ static int do_rebalance(struct moving_context *ctxt) - if (ret) - break; - -- bch2_btree_iter_advance(&rebalance_work_iter); -+ bch2_btree_iter_advance(trans, &rebalance_work_iter); - } - - bch2_trans_iter_exit(trans, &extent_iter); -@@ -576,6 +612,13 @@ static int bch2_rebalance_thread(void *arg) - - set_freezable(); - -+ /* -+ * Data move operations can't run until after check_snapshots has -+ * completed, and bch2_snapshot_is_ancestor() is available. -+ */ -+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || -+ kthread_should_stop()); -+ - bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, - writepoint_ptr(&c->rebalance_write_point), - true); -@@ -590,8 +633,20 @@ static int bch2_rebalance_thread(void *arg) - - void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) - { -+ printbuf_tabstop_push(out, 32); -+ - struct bch_fs_rebalance *r = &c->rebalance; - -+ /* print pending work */ -+ struct disk_accounting_pos acc; -+ disk_accounting_key_init(acc, rebalance_work); -+ u64 v; -+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); -+ -+ prt_printf(out, "pending work:\t"); -+ prt_human_readable_u64(out, v << 9); -+ prt_printf(out, "\n\n"); -+ - prt_str(out, bch2_rebalance_state_strs[r->state]); - prt_newline(out); - printbuf_indent_add(out, 2); -@@ -600,15 +655,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) - case BCH_REBALANCE_waiting: { - u64 now = atomic64_read(&c->io_clock[WRITE].now); - -- prt_str(out, "io wait duration: "); -+ prt_printf(out, "io wait duration:\t"); - bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); - prt_newline(out); - -- prt_str(out, "io wait remaining: "); -+ prt_printf(out, "io wait remaining:\t"); - bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); - prt_newline(out); - -- prt_str(out, "duration waited: "); -+ prt_printf(out, "duration waited:\t"); - bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); - prt_newline(out); - break; -@@ -621,6 +676,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) - break; - } - prt_newline(out); -+ -+ rcu_read_lock(); -+ struct task_struct *t = rcu_dereference(c->rebalance.thread); -+ if (t) -+ get_task_struct(t); -+ rcu_read_unlock(); -+ -+ if (t) { -+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); -+ put_task_struct(t); -+ } -+ - printbuf_indent_sub(out, 2); - } - -@@ -635,7 +702,7 @@ void bch2_rebalance_stop(struct bch_fs *c) - c->rebalance.thread = NULL; - - if (p) { -- /* for sychronizing with rebalance_wakeup() */ -+ /* for sychronizing with bch2_rebalance_wakeup() */ - synchronize_rcu(); - - kthread_stop(p); -@@ -666,7 +733,156 @@ int bch2_rebalance_start(struct bch_fs *c) - return 0; - } - --void bch2_fs_rebalance_init(struct bch_fs *c) -+#ifdef CONFIG_POWER_SUPPLY -+#include -+ -+static int bch2_rebalance_power_notifier(struct notifier_block *nb, -+ unsigned long event, void *data) -+{ -+ struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); -+ -+ c->rebalance.on_battery = !power_supply_is_system_supplied(); -+ bch2_rebalance_wakeup(c); -+ return NOTIFY_OK; -+} -+#endif -+ -+void bch2_fs_rebalance_exit(struct bch_fs *c) -+{ -+#ifdef CONFIG_POWER_SUPPLY -+ power_supply_unreg_notifier(&c->rebalance.power_notifier); -+#endif -+} -+ -+int bch2_fs_rebalance_init(struct bch_fs *c) - { -- bch2_pd_controller_init(&c->rebalance.pd); -+ struct bch_fs_rebalance *r = &c->rebalance; -+ -+ bch2_pd_controller_init(&r->pd); -+ -+#ifdef CONFIG_POWER_SUPPLY -+ r->power_notifier.notifier_call = bch2_rebalance_power_notifier; -+ int ret = power_supply_reg_notifier(&r->power_notifier); -+ if (ret) -+ return ret; -+ -+ r->on_battery = !power_supply_is_system_supplied(); -+#endif -+ return 0; -+} -+ -+static int check_rebalance_work_one(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct btree_iter *rebalance_iter, -+ struct bkey_buf *last_flushed) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c extent_k, rebalance_k; -+ struct printbuf buf = PRINTBUF; -+ -+ int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: -+ bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); -+ if (ret) -+ return ret; -+ -+ if (!extent_k.k && -+ extent_iter->btree_id == BTREE_ID_reflink && -+ (!rebalance_k.k || -+ rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { -+ bch2_trans_iter_exit(trans, extent_iter); -+ bch2_trans_iter_init(trans, extent_iter, -+ BTREE_ID_extents, POS_MIN, -+ BTREE_ITER_prefetch| -+ BTREE_ITER_all_snapshots); -+ return -BCH_ERR_transaction_restart_nested; -+ } -+ -+ if (!extent_k.k && !rebalance_k.k) -+ return 1; -+ -+ int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, -+ rebalance_k.k ? rebalance_k.k->p : SPOS_MAX); -+ -+ struct bkey deleted; -+ bkey_init(&deleted); -+ -+ if (cmp < 0) { -+ deleted.p = extent_k.k->p; -+ rebalance_k.k = &deleted; -+ } else if (cmp > 0) { -+ deleted.p = rebalance_k.k->p; -+ extent_k.k = &deleted; -+ } -+ -+ bool should_have_rebalance = -+ bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; -+ bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; -+ -+ if (should_have_rebalance != have_rebalance) { -+ ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); -+ if (ret) -+ return ret; -+ -+ bch2_bkey_val_to_text(&buf, c, extent_k); -+ } -+ -+ if (fsck_err_on(!should_have_rebalance && have_rebalance, -+ trans, rebalance_work_incorrectly_set, -+ "rebalance work incorrectly set\n%s", buf.buf)) { -+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, -+ extent_k.k->p, false); -+ if (ret) -+ goto err; -+ } -+ -+ if (fsck_err_on(should_have_rebalance && !have_rebalance, -+ trans, rebalance_work_incorrectly_unset, -+ "rebalance work incorrectly unset\n%s", buf.buf)) { -+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, -+ extent_k.k->p, true); -+ if (ret) -+ goto err; -+ } -+ -+ if (cmp <= 0) -+ bch2_btree_iter_advance(trans, extent_iter); -+ if (cmp >= 0) -+ bch2_btree_iter_advance(trans, rebalance_iter); -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_rebalance_work(struct bch_fs *c) -+{ -+ struct btree_trans *trans = bch2_trans_get(c); -+ struct btree_iter rebalance_iter, extent_iter; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, &extent_iter, -+ BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_prefetch); -+ bch2_trans_iter_init(trans, &rebalance_iter, -+ BTREE_ID_rebalance_work, POS_MIN, -+ BTREE_ITER_prefetch); -+ -+ struct bkey_buf last_flushed; -+ bch2_bkey_buf_init(&last_flushed); -+ bkey_init(&last_flushed.k->k); -+ -+ while (!ret) { -+ bch2_trans_begin(trans); -+ -+ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ } -+ -+ bch2_bkey_buf_exit(&last_flushed, c); -+ bch2_trans_iter_exit(trans, &extent_iter); -+ bch2_trans_iter_exit(trans, &rebalance_iter); -+ bch2_trans_put(trans); -+ return ret < 0 ? ret : 0; - } -diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h -index 62a3859d3823..5d9214fe1a22 100644 ---- a/fs/bcachefs/rebalance.h -+++ b/fs/bcachefs/rebalance.h -@@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); - int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); - int bch2_set_fs_needs_rebalance(struct bch_fs *); - --static inline void rebalance_wakeup(struct bch_fs *c) -+static inline void bch2_rebalance_wakeup(struct bch_fs *c) - { - struct task_struct *p; - -@@ -52,6 +52,10 @@ void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); - - void bch2_rebalance_stop(struct bch_fs *); - int bch2_rebalance_start(struct bch_fs *); --void bch2_fs_rebalance_init(struct bch_fs *); -+ -+void bch2_fs_rebalance_exit(struct bch_fs *); -+int bch2_fs_rebalance_init(struct bch_fs *); -+ -+int bch2_check_rebalance_work(struct bch_fs *); - - #endif /* _BCACHEFS_REBALANCE_H */ -diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h -index fe5098c17dfc..33d77286f1d5 100644 ---- a/fs/bcachefs/rebalance_types.h -+++ b/fs/bcachefs/rebalance_types.h -@@ -30,6 +30,11 @@ struct bch_fs_rebalance { - struct bbpos scan_start; - struct bbpos scan_end; - struct bch_move_stats scan_stats; -+ -+ bool on_battery; -+#ifdef CONFIG_POWER_SUPPLY -+ struct notifier_block power_notifier; -+#endif - }; - - #endif /* _BCACHEFS_REBALANCE_TYPES_H */ -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -index 71c786cdb192..4fca57575565 100644 ---- a/fs/bcachefs/recovery.c -+++ b/fs/bcachefs/recovery.c -@@ -13,12 +13,13 @@ - #include "disk_accounting.h" - #include "errcode.h" - #include "error.h" --#include "fs-common.h" - #include "journal_io.h" - #include "journal_reclaim.h" - #include "journal_seq_blacklist.h" - #include "logged_ops.h" - #include "move.h" -+#include "movinggc.h" -+#include "namei.h" - #include "quota.h" - #include "rebalance.h" - #include "recovery.h" -@@ -32,8 +33,9 @@ - #include - #include - -- --int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) -+int bch2_btree_lost_data(struct bch_fs *c, -+ struct printbuf *msg, -+ enum btree_id btree) - { - u64 b = BIT_ULL(btree); - int ret = 0; -@@ -42,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (!(c->sb.btrees_lost_data & b)) { -- struct printbuf buf = PRINTBUF; -- bch2_btree_id_to_text(&buf, btree); -- bch_err(c, "flagging btree %s lost data", buf.buf); -- printbuf_exit(&buf); -+ prt_printf(msg, "flagging btree "); -+ bch2_btree_id_to_text(msg, btree); -+ prt_printf(msg, " lost data\n"); -+ - ext->btrees_lost_data |= cpu_to_le64(b); - } - - /* Once we have runtime self healing for topology errors we won't need this: */ -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - - /* Btree node accounting will be off: */ - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - - #ifdef CONFIG_BCACHEFS_DEBUG - /* - * These are much more minor, and don't need to be corrected right away, - * but in debug mode we want the next fsck run to be clean: - */ -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; - #endif - - switch (btree) { - case BTREE_ID_alloc: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); -@@ -77,26 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - goto out; - case BTREE_ID_backpointers: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; - goto out; - case BTREE_ID_need_discard: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_freespace: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_bucket_gens: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_lru: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_accounting: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; -+ goto out; -+ case BTREE_ID_snapshots: -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - default: -- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; -+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - } - out: -@@ -113,11 +119,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree) - } - - /* for -o reconstruct_alloc: */ --static void bch2_reconstruct_alloc(struct bch_fs *c) -+void bch2_reconstruct_alloc(struct bch_fs *c) - { -- bch2_journal_log_msg(c, "dropping alloc info"); -- bch_info(c, "dropping and reconstructing all alloc info"); -- - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - -@@ -159,6 +162,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - -+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); -+ - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -@@ -198,7 +203,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - BTREE_ITER_intent); -- int ret = bch2_btree_iter_traverse(&iter); -+ int ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - -@@ -261,7 +266,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - iter_flags); -- ret = bch2_btree_iter_traverse(&iter); -+ ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - -@@ -270,7 +275,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, - bch2_trans_iter_exit(trans, &iter); - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, 0, iter_flags); -- ret = bch2_btree_iter_traverse(&iter) ?: -+ ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_increase_depth(trans, iter.path, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; -@@ -281,7 +286,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans, - goto out; - - if (k->k->k.type == KEY_TYPE_accounting) { -- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); -+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto out; -+ -+ bkey_copy(n, k->k); - goto out; - } - -@@ -389,9 +399,9 @@ int bch2_journal_replay(struct bch_fs *c) - * Now, replay any remaining keys in the order in which they appear in - * the journal, unpinning those journal entries as we go: - */ -- sort(keys_sorted.data, keys_sorted.nr, -- sizeof(keys_sorted.data[0]), -- journal_sort_seq_cmp, NULL); -+ sort_nonatomic(keys_sorted.data, keys_sorted.nr, -+ sizeof(keys_sorted.data[0]), -+ journal_sort_seq_cmp, NULL); - - darray_for_each(keys_sorted, kp) { - cond_resched(); -@@ -429,7 +439,7 @@ int bch2_journal_replay(struct bch_fs *c) - trans = NULL; - - if (!c->opts.retain_recovery_info && -- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) -+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) - bch2_journal_keys_put_initial(c); - - replay_now_at(j, j->replay_journal_seq_end); -@@ -584,9 +594,6 @@ static int read_btree_roots(struct bch_fs *c) - buf.buf, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) - r->error = 0; -- -- ret = bch2_btree_lost_data(c, i); -- BUG_ON(ret); - } - } - -@@ -666,7 +673,7 @@ static bool check_version_upgrade(struct bch_fs *c) - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - -- bch_info(c, "%s", buf.buf); -+ bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; -@@ -682,7 +689,7 @@ static bool check_version_upgrade(struct bch_fs *c) - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - -- bch_info(c, "%s", buf.buf); -+ bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; -@@ -789,11 +796,11 @@ int bch2_fs_recovery(struct bch_fs *c) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -- if (c->opts.fsck) -- set_bit(BCH_FS_fsck_running, &c->flags); - if (c->sb.clean) - set_bit(BCH_FS_clean_recovery, &c->flags); -- set_bit(BCH_FS_recovery_running, &c->flags); -+ if (c->opts.fsck) -+ set_bit(BCH_FS_in_fsck, &c->flags); -+ set_bit(BCH_FS_in_recovery, &c->flags); - - ret = bch2_blacklist_table_initialize(c); - if (ret) { -@@ -888,8 +895,37 @@ int bch2_fs_recovery(struct bch_fs *c) - if (ret) - goto err; - -- if (c->opts.reconstruct_alloc) -+ ret = bch2_fs_resize_on_mount(c); -+ if (ret) { -+ up_write(&c->state_lock); -+ goto err; -+ } -+ -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { -+ bch_info(c, "filesystem is an unresized image file, mounting ro"); -+ c->opts.read_only = true; -+ } -+ -+ if (!c->opts.read_only && -+ (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { -+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); -+ - bch2_reconstruct_alloc(c); -+ } else if (c->opts.reconstruct_alloc) { -+ bch2_journal_log_msg(c, "dropping alloc info"); -+ bch_info(c, "dropping and reconstructing all alloc info"); -+ -+ bch2_reconstruct_alloc(c); -+ } -+ -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { -+ /* We can't go RW to fix errors without alloc info */ -+ if (c->opts.fix_errors == FSCK_FIX_yes || -+ c->opts.fix_errors == FSCK_FIX_ask) -+ c->opts.fix_errors = FSCK_FIX_no; -+ if (c->opts.errors == BCH_ON_ERROR_fix_safe) -+ c->opts.errors = BCH_ON_ERROR_continue; -+ } - - /* - * After an unclean shutdown, skip then next few journal sequence -@@ -899,7 +935,7 @@ int bch2_fs_recovery(struct bch_fs *c) - * journal sequence numbers: - */ - if (!c->sb.clean) -- journal_seq += 8; -+ journal_seq += JOURNAL_BUF_NR * 4; - - if (blacklist_seq != journal_seq) { - ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", -@@ -932,8 +968,10 @@ int bch2_fs_recovery(struct bch_fs *c) - set_bit(BCH_FS_btree_running, &c->flags); - - ret = bch2_sb_set_upgrade_extra(c); -+ if (ret) -+ goto err; - -- ret = bch2_run_recovery_passes(c); -+ ret = bch2_run_recovery_passes(c, 0); - if (ret) - goto err; - -@@ -944,8 +982,7 @@ int bch2_fs_recovery(struct bch_fs *c) - * multithreaded use: - */ - set_bit(BCH_FS_may_go_rw, &c->flags); -- clear_bit(BCH_FS_fsck_running, &c->flags); -- clear_bit(BCH_FS_recovery_running, &c->flags); -+ clear_bit(BCH_FS_in_fsck, &c->flags); - - /* in case we don't run journal replay, i.e. norecovery mode */ - set_bit(BCH_FS_accounting_replay_done, &c->flags); -@@ -968,9 +1005,8 @@ int bch2_fs_recovery(struct bch_fs *c) - bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_errors_fixed, &c->flags); - -- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; -- -- ret = bch2_run_recovery_passes(c); -+ ret = bch2_run_recovery_passes(c, -+ BCH_RECOVERY_PASS_check_alloc_info); - if (ret) - goto err; - -@@ -1014,7 +1050,7 @@ int bch2_fs_recovery(struct bch_fs *c) - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && -- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && -+ c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && - ext->btrees_lost_data) { - ext->btrees_lost_data = 0; - write_sb = true; -@@ -1075,8 +1111,17 @@ int bch2_fs_recovery(struct bch_fs *c) - return ret; - err: - fsck_err: -- bch2_fs_emergency_read_only(c); -- goto out; -+ { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); -+ bch2_fs_emergency_read_only2(c, &buf); -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ } -+ return ret; - } - - int bch2_fs_initialize(struct bch_fs *c) -@@ -1125,14 +1170,17 @@ int bch2_fs_initialize(struct bch_fs *c) - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ -- bch2_fs_journal_start(&c->journal, 1); -- set_bit(BCH_FS_accounting_replay_done, &c->flags); -- bch2_journal_set_replay_done(&c->journal); -+ ret = bch2_fs_journal_start(&c->journal, 1); -+ if (ret) -+ goto err; - - ret = bch2_fs_read_write_early(c); - if (ret) - goto err; - -+ set_bit(BCH_FS_accounting_replay_done, &c->flags); -+ bch2_journal_set_replay_done(&c->journal); -+ - for_each_member_device(c, ca) { - ret = bch2_dev_usage_init(ca, false); - if (ret) { -@@ -1189,7 +1237,10 @@ int bch2_fs_initialize(struct bch_fs *c) - if (ret) - goto err; - -- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; -+ c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; -+ -+ bch2_copygc_wakeup(c); -+ bch2_rebalance_wakeup(c); - - if (enabled_qtypes(c)) { - ret = bch2_fs_quota_read(c); -@@ -1209,7 +1260,7 @@ int bch2_fs_initialize(struct bch_fs *c) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; -+ c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; - return 0; - err: - bch_err_fn(c, ret); -diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h -index b0d55754b21b..c023f52fc2d6 100644 ---- a/fs/bcachefs/recovery.h -+++ b/fs/bcachefs/recovery.h -@@ -2,7 +2,8 @@ - #ifndef _BCACHEFS_RECOVERY_H - #define _BCACHEFS_RECOVERY_H - --int bch2_btree_lost_data(struct bch_fs *, enum btree_id); -+int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); -+void bch2_reconstruct_alloc(struct bch_fs *); - - int bch2_journal_replay(struct bch_fs *); - -diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c -index 0b3c951c32da..f74f14227137 100644 ---- a/fs/bcachefs/recovery_passes.c -+++ b/fs/bcachefs/recovery_passes.c -@@ -12,6 +12,7 @@ - #include "journal.h" - #include "lru.h" - #include "logged_ops.h" -+#include "movinggc.h" - #include "rebalance.h" - #include "recovery.h" - #include "recovery_passes.h" -@@ -27,6 +28,145 @@ const char * const bch2_recovery_passes[] = { - NULL - }; - -+static const u8 passes_to_stable_map[] = { -+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ -+static const u8 passes_from_stable_map[] = { -+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ -+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) -+{ -+ return passes_to_stable_map[pass]; -+} -+ -+u64 bch2_recovery_passes_to_stable(u64 v) -+{ -+ u64 ret = 0; -+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) -+ if (v & BIT_ULL(i)) -+ ret |= BIT_ULL(passes_to_stable_map[i]); -+ return ret; -+} -+ -+static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) -+{ -+ return pass < ARRAY_SIZE(passes_from_stable_map) -+ ? passes_from_stable_map[pass] -+ : 0; -+} -+ -+u64 bch2_recovery_passes_from_stable(u64 v) -+{ -+ u64 ret = 0; -+ for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) -+ if (v & BIT_ULL(i)) -+ ret |= BIT_ULL(passes_from_stable_map[i]); -+ return ret; -+} -+ -+static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, -+ enum bch_validate_flags flags, struct printbuf *err) -+{ -+ return 0; -+} -+ -+static void bch2_sb_recovery_passes_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_recovery_passes *r = -+ field_to_type(f, recovery_passes); -+ unsigned nr = recovery_passes_nr_entries(r); -+ -+ if (out->nr_tabstops < 1) -+ printbuf_tabstop_push(out, 32); -+ if (out->nr_tabstops < 2) -+ printbuf_tabstop_push(out, 16); -+ -+ prt_printf(out, "Pass\tLast run\tLast runtime\n"); -+ -+ for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { -+ if (!i->last_run) -+ continue; -+ -+ unsigned idx = i - r->start; -+ -+ prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); -+ -+ bch2_prt_datetime(out, le64_to_cpu(i->last_run)); -+ prt_tab(out); -+ -+ bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); -+ prt_newline(out); -+ } -+} -+ -+static void bch2_sb_recovery_pass_complete(struct bch_fs *c, -+ enum bch_recovery_pass pass, -+ s64 start_time) -+{ -+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); -+ s64 end_time = ktime_get_real_seconds(); -+ -+ mutex_lock(&c->sb_lock); -+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -+ __clear_bit_le64(stable, ext->recovery_passes_required); -+ -+ struct bch_sb_field_recovery_passes *r = -+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes); -+ -+ if (stable >= recovery_passes_nr_entries(r)) { -+ unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); -+ -+ r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); -+ if (!r) { -+ bch_err(c, "error creating recovery_passes sb section"); -+ goto out; -+ } -+ } -+ -+ r->start[stable].last_run = cpu_to_le64(end_time); -+ r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time)); -+out: -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+} -+ -+static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) -+{ -+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); -+ bool ret = false; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ struct bch_sb_field_recovery_passes *r = -+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes); -+ -+ if (stable < recovery_passes_nr_entries(r)) { -+ struct recovery_pass_entry *i = r->start + stable; -+ -+ /* -+ * Ratelimit if the last runtime was more than 1% of the time -+ * since we last ran -+ */ -+ ret = (u64) le32_to_cpu(i->last_runtime) * 100 > -+ ktime_get_real_seconds() - le64_to_cpu(i->last_run); -+ } -+ -+ return ret; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { -+ .validate = bch2_sb_recovery_passes_validate, -+ .to_text = bch2_sb_recovery_passes_to_text -+}; -+ - /* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ - static int bch2_recovery_pass_empty(struct bch_fs *c) - { -@@ -46,11 +186,36 @@ static int bch2_set_may_go_rw(struct bch_fs *c) - - set_bit(BCH_FS_may_go_rw, &c->flags); - -- if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) -+ if (keys->nr || -+ !c->opts.read_only || -+ !c->sb.clean || -+ c->opts.recovery_passes || -+ (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { -+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); -+ bch2_reconstruct_alloc(c); -+ } -+ - return bch2_fs_read_write_early(c); -+ } - return 0; - } - -+/* -+ * Make sure root inode is readable while we're still in recovery and can rewind -+ * for repair: -+ */ -+static int bch2_lookup_root_inode(struct bch_fs *c) -+{ -+ subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; -+ struct bch_inode_unpacked inode_u; -+ struct bch_subvolume subvol; -+ -+ return bch2_trans_do(c, -+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: -+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); -+} -+ - struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -@@ -62,255 +227,348 @@ static struct recovery_pass_fn recovery_pass_fns[] = { - #undef x - }; - --static const u8 passes_to_stable_map[] = { --#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, -- BCH_RECOVERY_PASSES() --#undef x --}; -+static u64 bch2_recovery_passes_match(unsigned flags) -+{ -+ u64 ret = 0; - --static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) -+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) -+ if (recovery_pass_fns[i].when & flags) -+ ret |= BIT_ULL(i); -+ return ret; -+} -+ -+u64 bch2_fsck_recovery_passes(void) - { -- return passes_to_stable_map[pass]; -+ return bch2_recovery_passes_match(PASS_FSCK); - } - --u64 bch2_recovery_passes_to_stable(u64 v) -+static void bch2_run_async_recovery_passes(struct bch_fs *c) - { -- u64 ret = 0; -- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) -- if (v & BIT_ULL(i)) -- ret |= BIT_ULL(passes_to_stable_map[i]); -- return ret; -+ if (!down_trylock(&c->recovery.run_lock)) -+ return; -+ -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) -+ goto unlock; -+ -+ if (queue_work(system_long_wq, &c->recovery.work)) -+ return; -+ -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -+unlock: -+ up(&c->recovery.run_lock); - } - --u64 bch2_recovery_passes_from_stable(u64 v) -+static bool recovery_pass_needs_set(struct bch_fs *c, -+ enum bch_recovery_pass pass, -+ enum bch_run_recovery_pass_flags *flags) - { -- static const u8 map[] = { --#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, -- BCH_RECOVERY_PASSES() --#undef x -- }; -+ struct bch_fs_recovery *r = &c->recovery; -+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); -+ bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); - -- u64 ret = 0; -- for (unsigned i = 0; i < ARRAY_SIZE(map); i++) -- if (v & BIT_ULL(i)) -- ret |= BIT_ULL(map[i]); -- return ret; -+ if ((*flags & RUN_RECOVERY_PASS_ratelimit) && -+ !bch2_recovery_pass_want_ratelimit(c, pass)) -+ *flags &= ~RUN_RECOVERY_PASS_ratelimit; -+ -+ /* -+ * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do -+ * anything if the pass has already run: these mean we need a prior pass -+ * to run before we continue to repair, we don't expect that pass to fix -+ * the damage we encountered. -+ * -+ * Otherwise, we run run_explicit_recovery_pass when we find damage, so -+ * it should run again even if it's already run: -+ */ -+ -+ if (persistent -+ ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) -+ : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) -+ return true; -+ -+ if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && -+ (r->passes_ratelimiting & BIT_ULL(pass))) -+ return true; -+ -+ return false; - } - - /* - * For when we need to rewind recovery passes and run a pass we skipped: - */ --static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, -- enum bch_recovery_pass pass) -+int __bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ struct printbuf *out, -+ enum bch_recovery_pass pass, -+ enum bch_run_recovery_pass_flags flags) - { -- if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) -- return -BCH_ERR_not_in_recovery; -+ struct bch_fs_recovery *r = &c->recovery; -+ int ret = 0; - -- if (c->recovery_passes_complete & BIT_ULL(pass)) -- return 0; -+ lockdep_assert_held(&c->sb_lock); - -- bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); -+ bch2_printbuf_make_room(out, 1024); -+ out->atomic++; - -- if (pass < BCH_RECOVERY_PASS_set_may_go_rw && -- c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { -- if (print) -- bch_info(c, "need recovery pass %s (%u), but already rw", -- bch2_recovery_passes[pass], pass); -- return -BCH_ERR_cannot_rewind_recovery; -- } -+ unsigned long lockflags; -+ spin_lock_irqsave(&r->lock, lockflags); - -- if (print) -- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", -- bch2_recovery_passes[pass], pass, -- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); -+ if (!recovery_pass_needs_set(c, pass, &flags)) -+ goto out; - -- c->opts.recovery_passes |= BIT_ULL(pass); -+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); -+ bool rewind = in_recovery && r->curr_pass > pass; -+ bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - -- if (c->curr_recovery_pass > pass) { -- c->next_recovery_pass = pass; -- c->recovery_passes_complete &= (1ULL << pass) >> 1; -- return -BCH_ERR_restart_recovery; -- } else { -- return 0; -+ if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { -+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); - } --} - --int bch2_run_explicit_recovery_pass(struct bch_fs *c, -- enum bch_recovery_pass pass) --{ -- unsigned long flags; -- spin_lock_irqsave(&c->recovery_pass_lock, flags); -- int ret = __bch2_run_explicit_recovery_pass(c, pass); -- spin_unlock_irqrestore(&c->recovery_pass_lock, flags); -- return ret; --} -+ if (pass < BCH_RECOVERY_PASS_set_may_go_rw && -+ (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { -+ prt_printf(out, "need recovery pass %s (%u), but already rw\n", -+ bch2_recovery_passes[pass], pass); -+ ret = -BCH_ERR_cannot_rewind_recovery; -+ goto out; -+ } - --int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, -- enum bch_recovery_pass pass) --{ -- lockdep_assert_held(&c->sb_lock); -+ if (ratelimit) -+ r->passes_ratelimiting |= BIT_ULL(pass); -+ else -+ r->passes_ratelimiting &= ~BIT_ULL(pass); - -- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); -+ if (in_recovery && !ratelimit) { -+ prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", -+ bch2_recovery_passes[pass], pass, -+ bch2_recovery_passes[r->curr_pass], r->curr_pass, -+ rewind ? " - rewinding" : ""); - -- return bch2_run_explicit_recovery_pass(c, pass); --} -- --int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, -- enum bch_recovery_pass pass) --{ -- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); -+ r->passes_to_run |= BIT_ULL(pass); - -- mutex_lock(&c->sb_lock); -- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -+ if (rewind) { -+ r->next_pass = pass; -+ r->passes_complete &= (1ULL << pass) >> 1; -+ ret = -BCH_ERR_restart_recovery; -+ } -+ } else { -+ prt_printf(out, "scheduling recovery pass %s (%u)%s\n", -+ bch2_recovery_passes[pass], pass, -+ ratelimit ? " - ratelimiting" : ""); - -- if (!test_bit_le64(s, ext->recovery_passes_required)) { -- __set_bit_le64(s, ext->recovery_passes_required); -- bch2_write_super(c); -+ struct recovery_pass_fn *p = recovery_pass_fns + pass; -+ if (p->when & PASS_ONLINE) -+ bch2_run_async_recovery_passes(c); - } -- mutex_unlock(&c->sb_lock); -- -- return bch2_run_explicit_recovery_pass(c, pass); -+out: -+ spin_unlock_irqrestore(&r->lock, lockflags); -+ --out->atomic; -+ return ret; - } - --static void bch2_clear_recovery_pass_required(struct bch_fs *c, -- enum bch_recovery_pass pass) -+int bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ struct printbuf *out, -+ enum bch_recovery_pass pass, -+ enum bch_run_recovery_pass_flags flags) - { -- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); -+ int ret = 0; - -- mutex_lock(&c->sb_lock); -- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); -+ scoped_guard(mutex, &c->sb_lock) { -+ if (!recovery_pass_needs_set(c, pass, &flags)) -+ return 0; - -- if (test_bit_le64(s, ext->recovery_passes_required)) { -- __clear_bit_le64(s, ext->recovery_passes_required); -+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } -- mutex_unlock(&c->sb_lock); --} -- --u64 bch2_fsck_recovery_passes(void) --{ -- u64 ret = 0; - -- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) -- if (recovery_pass_fns[i].when & PASS_FSCK) -- ret |= BIT_ULL(i); - return ret; - } - --static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -+int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) - { -- struct recovery_pass_fn *p = recovery_pass_fns + pass; -+ enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; - -- if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) -- return false; -- if (c->opts.recovery_passes & BIT_ULL(pass)) -- return true; -- if ((p->when & PASS_FSCK) && c->opts.fsck) -- return true; -- if ((p->when & PASS_UNCLEAN) && !c->sb.clean) -- return true; -- if (p->when & PASS_ALWAYS) -- return true; -- return false; -+ if (!recovery_pass_needs_set(c, pass, &flags)) -+ return 0; -+ -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ mutex_lock(&c->sb_lock); -+ int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, -+ RUN_RECOVERY_PASS_nopersistent); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_print_str(c, KERN_NOTICE, buf.buf); -+ printbuf_exit(&buf); -+ return ret; - } - - static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) - { -+ struct bch_fs_recovery *r = &c->recovery; - struct recovery_pass_fn *p = recovery_pass_fns + pass; -- int ret; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); -- ret = p->fn(c); -- if (ret) -+ -+ s64 start_time = ktime_get_real_seconds(); -+ int ret = p->fn(c); -+ -+ r->passes_to_run &= ~BIT_ULL(pass); -+ -+ if (ret) { -+ r->passes_failing |= BIT_ULL(pass); - return ret; -+ } -+ -+ r->passes_failing = 0; -+ -+ if (!test_bit(BCH_FS_error, &c->flags)) -+ bch2_sb_recovery_pass_complete(c, pass, start_time); -+ - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; - } - --int bch2_run_online_recovery_passes(struct bch_fs *c) -+static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, -+ bool online) - { -+ struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - -- down_read(&c->state_lock); -+ spin_lock_irq(&r->lock); - -- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { -- struct recovery_pass_fn *p = recovery_pass_fns + i; -+ if (online) -+ orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); - -- if (!(p->when & PASS_ONLINE)) -- continue; -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) -+ orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); - -- ret = bch2_run_recovery_pass(c, i); -- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { -- i = c->curr_recovery_pass; -- continue; -+ /* -+ * A failed recovery pass will be retried after another pass succeeds - -+ * but not this iteration. -+ * -+ * This is because some passes depend on repair done by other passes: we -+ * may want to retry, but we don't want to loop on failing passes. -+ */ -+ -+ orig_passes_to_run &= ~r->passes_failing; -+ -+ r->passes_to_run = orig_passes_to_run; -+ -+ while (r->passes_to_run) { -+ unsigned prev_done = r->pass_done; -+ unsigned pass = __ffs64(r->passes_to_run); -+ r->curr_pass = pass; -+ r->next_pass = r->curr_pass + 1; -+ r->passes_to_run &= ~BIT_ULL(pass); -+ -+ spin_unlock_irq(&r->lock); -+ -+ int ret2 = bch2_run_recovery_pass(c, pass) ?: -+ bch2_journal_flush(&c->journal); -+ -+ spin_lock_irq(&r->lock); -+ -+ if (r->next_pass < r->curr_pass) { -+ /* Rewind: */ -+ r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); -+ } else if (!ret2) { -+ r->pass_done = max(r->pass_done, pass); -+ r->passes_complete |= BIT_ULL(pass); -+ } else { -+ ret = ret2; - } -- if (ret) -+ -+ if (ret && !online) - break; -+ -+ if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && -+ r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { -+ bch2_copygc_wakeup(c); -+ bch2_rebalance_wakeup(c); -+ } - } - -- up_read(&c->state_lock); -+ clear_bit(BCH_FS_in_recovery, &c->flags); -+ spin_unlock_irq(&r->lock); - - return ret; - } - --int bch2_run_recovery_passes(struct bch_fs *c) -+static void bch2_async_recovery_passes_work(struct work_struct *work) - { -- int ret = 0; -+ struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); -+ struct bch_fs_recovery *r = &c->recovery; -+ -+ __bch2_run_recovery_passes(c, -+ c->sb.recovery_passes_required & ~r->passes_ratelimiting, -+ true); -+ -+ up(&r->run_lock); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -+} -+ -+int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) -+{ -+ return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); -+} -+ -+int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) -+{ -+ u64 passes = -+ bch2_recovery_passes_match(PASS_ALWAYS) | -+ (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | -+ (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | -+ c->opts.recovery_passes | -+ c->sb.recovery_passes_required; - - /* - * We can't allow set_may_go_rw to be excluded; that would cause us to - * use the journal replay keys for updates where it's not expected. - */ - c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; -+ passes &= ~c->opts.recovery_passes_exclude; - -- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { -- c->next_recovery_pass = c->curr_recovery_pass + 1; -+ passes &= ~(BIT_ULL(from) - 1); - -- spin_lock_irq(&c->recovery_pass_lock); -- unsigned pass = c->curr_recovery_pass; -- -- if (c->opts.recovery_pass_last && -- c->curr_recovery_pass > c->opts.recovery_pass_last) { -- spin_unlock_irq(&c->recovery_pass_lock); -- break; -- } -+ down(&c->recovery.run_lock); -+ int ret = __bch2_run_recovery_passes(c, passes, false); -+ up(&c->recovery.run_lock); - -- if (!should_run_recovery_pass(c, pass)) { -- c->curr_recovery_pass++; -- c->recovery_pass_done = max(c->recovery_pass_done, pass); -- spin_unlock_irq(&c->recovery_pass_lock); -- continue; -- } -- spin_unlock_irq(&c->recovery_pass_lock); -+ return ret; -+} - -- ret = bch2_run_recovery_pass(c, pass) ?: -- bch2_journal_flush(&c->journal); -+static void prt_passes(struct printbuf *out, const char *msg, u64 passes) -+{ -+ prt_printf(out, "%s:\t", msg); -+ prt_bitflags(out, bch2_recovery_passes, passes); -+ prt_newline(out); -+} - -- if (!ret && !test_bit(BCH_FS_error, &c->flags)) -- bch2_clear_recovery_pass_required(c, pass); -- -- spin_lock_irq(&c->recovery_pass_lock); -- if (c->next_recovery_pass < c->curr_recovery_pass) { -- /* -- * bch2_run_explicit_recovery_pass() was called: we -- * can't always catch -BCH_ERR_restart_recovery because -- * it may have been called from another thread (btree -- * node read completion) -- */ -- ret = 0; -- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); -- } else { -- c->recovery_passes_complete |= BIT_ULL(pass); -- c->recovery_pass_done = max(c->recovery_pass_done, pass); -- } -- c->curr_recovery_pass = c->next_recovery_pass; -- spin_unlock_irq(&c->recovery_pass_lock); -+void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_recovery *r = &c->recovery; -+ -+ printbuf_tabstop_push(out, 32); -+ prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); -+ prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & -+ bch2_recovery_passes_match(PASS_ONLINE)); -+ prt_passes(out, "Complete passes", r->passes_complete); -+ prt_passes(out, "Failing passes", r->passes_failing); -+ -+ if (r->curr_pass) { -+ prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); -+ prt_passes(out, "Current passes", r->passes_to_run); - } -+} - -- return ret; -+void bch2_fs_recovery_passes_init(struct bch_fs *c) -+{ -+ spin_lock_init(&c->recovery.lock); -+ sema_init(&c->recovery.run_lock, 1); -+ -+ INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); - } -diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h -index 7d7339c8fa29..dc0d2014ff9b 100644 ---- a/fs/bcachefs/recovery_passes.h -+++ b/fs/bcachefs/recovery_passes.h -@@ -3,16 +3,32 @@ - - extern const char * const bch2_recovery_passes[]; - -+extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; -+ - u64 bch2_recovery_passes_to_stable(u64 v); - u64 bch2_recovery_passes_from_stable(u64 v); - - u64 bch2_fsck_recovery_passes(void); - --int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); --int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); --int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); -+enum bch_run_recovery_pass_flags { -+ RUN_RECOVERY_PASS_nopersistent = BIT(0), -+ RUN_RECOVERY_PASS_ratelimit = BIT(1), -+}; -+ -+int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); -+ -+int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, -+ enum bch_recovery_pass, -+ enum bch_run_recovery_pass_flags); -+int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, -+ enum bch_recovery_pass, -+ enum bch_run_recovery_pass_flags); -+ -+int bch2_run_online_recovery_passes(struct bch_fs *, u64); -+int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); -+ -+void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); - --int bch2_run_online_recovery_passes(struct bch_fs *); --int bch2_run_recovery_passes(struct bch_fs *); -+void bch2_fs_recovery_passes_init(struct bch_fs *); - - #endif /* _BCACHEFS_RECOVERY_PASSES_H */ -diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h -new file mode 100644 -index 000000000000..c434eafbca19 ---- /dev/null -+++ b/fs/bcachefs/recovery_passes_format.h -@@ -0,0 +1,104 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H -+#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H -+ -+#define PASS_SILENT BIT(0) -+#define PASS_FSCK BIT(1) -+#define PASS_UNCLEAN BIT(2) -+#define PASS_ALWAYS BIT(3) -+#define PASS_ONLINE BIT(4) -+#define PASS_ALLOC BIT(5) -+#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define PASS_FSCK_DEBUG BIT(1) -+#else -+#define PASS_FSCK_DEBUG 0 -+#endif -+ -+/* -+ * Passes may be reordered, but the second field is a persistent identifier and -+ * must never change: -+ */ -+#define BCH_RECOVERY_PASSES() \ -+ x(recovery_pass_empty, 41, PASS_SILENT) \ -+ x(scan_for_btree_nodes, 37, 0) \ -+ x(check_topology, 4, 0) \ -+ x(accounting_read, 39, PASS_ALWAYS) \ -+ x(alloc_read, 0, PASS_ALWAYS) \ -+ x(stripes_read, 1, 0) \ -+ x(initialize_subvolumes, 2, 0) \ -+ x(snapshots_read, 3, PASS_ALWAYS) \ -+ x(check_allocations, 5, PASS_FSCK_ALLOC) \ -+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ -+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ -+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ -+ x(journal_replay, 9, PASS_ALWAYS) \ -+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ -+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ -+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ -+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ -+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ -+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ -+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ -+ x(bucket_gens_init, 17, 0) \ -+ x(reconstruct_snapshots, 38, 0) \ -+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ -+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ -+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ -+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ -+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ -+ x(fs_upgrade_for_subvolumes, 22, 0) \ -+ x(check_inodes, 24, PASS_FSCK) \ -+ x(check_extents, 25, PASS_FSCK) \ -+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ -+ x(check_dirents, 27, PASS_FSCK) \ -+ x(check_xattrs, 28, PASS_FSCK) \ -+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ -+ x(check_unreachable_inodes, 40, PASS_FSCK) \ -+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ -+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ -+ x(check_nlinks, 31, PASS_FSCK) \ -+ x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ -+ x(resume_logged_ops, 23, PASS_ALWAYS) \ -+ x(delete_dead_inodes, 32, PASS_ALWAYS) \ -+ x(fix_reflink_p, 33, 0) \ -+ x(set_fs_needs_rebalance, 34, 0) \ -+ x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) -+ -+/* We normally enumerate recovery passes in the order we run them: */ -+enum bch_recovery_pass { -+#define x(n, id, when) BCH_RECOVERY_PASS_##n, -+ BCH_RECOVERY_PASSES() -+#undef x -+ BCH_RECOVERY_PASS_NR -+}; -+ -+/* But we also need stable identifiers that can be used in the superblock */ -+enum bch_recovery_pass_stable { -+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ -+struct recovery_pass_entry { -+ __le64 last_run; -+ __le32 last_runtime; -+ __le32 flags; -+}; -+ -+struct bch_sb_field_recovery_passes { -+ struct bch_sb_field field; -+ struct recovery_pass_entry start[]; -+}; -+ -+static inline unsigned -+recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) -+{ -+ return r -+ ? ((vstruct_end(&r->field) - (void *) &r->start[0]) / -+ sizeof(struct recovery_pass_entry)) -+ : 0; -+} -+ -+#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ -diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h -index 418557960ed6..aa9526938cc3 100644 ---- a/fs/bcachefs/recovery_passes_types.h -+++ b/fs/bcachefs/recovery_passes_types.h -@@ -2,79 +2,26 @@ - #ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H - #define _BCACHEFS_RECOVERY_PASSES_TYPES_H - --#define PASS_SILENT BIT(0) --#define PASS_FSCK BIT(1) --#define PASS_UNCLEAN BIT(2) --#define PASS_ALWAYS BIT(3) --#define PASS_ONLINE BIT(4) -- --#ifdef CONFIG_BCACHEFS_DEBUG --#define PASS_FSCK_DEBUG BIT(1) --#else --#define PASS_FSCK_DEBUG 0 --#endif -- --/* -- * Passes may be reordered, but the second field is a persistent identifier and -- * must never change: -- */ --#define BCH_RECOVERY_PASSES() \ -- x(recovery_pass_empty, 41, PASS_SILENT) \ -- x(scan_for_btree_nodes, 37, 0) \ -- x(check_topology, 4, 0) \ -- x(accounting_read, 39, PASS_ALWAYS) \ -- x(alloc_read, 0, PASS_ALWAYS) \ -- x(stripes_read, 1, PASS_ALWAYS) \ -- x(initialize_subvolumes, 2, 0) \ -- x(snapshots_read, 3, PASS_ALWAYS) \ -- x(check_allocations, 5, PASS_FSCK) \ -- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ -- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ -- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ -- x(journal_replay, 9, PASS_ALWAYS) \ -- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ -- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ -- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ -- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ -- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ -- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ -- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ -- x(bucket_gens_init, 17, 0) \ -- x(reconstruct_snapshots, 38, 0) \ -- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ -- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ -- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ -- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ -- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ -- x(fs_upgrade_for_subvolumes, 22, 0) \ -- x(check_inodes, 24, PASS_FSCK) \ -- x(check_extents, 25, PASS_FSCK) \ -- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ -- x(check_dirents, 27, PASS_FSCK) \ -- x(check_xattrs, 28, PASS_FSCK) \ -- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ -- x(check_unreachable_inodes, 40, PASS_FSCK) \ -- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ -- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ -- x(check_nlinks, 31, PASS_FSCK) \ -- x(resume_logged_ops, 23, PASS_ALWAYS) \ -- x(delete_dead_inodes, 32, PASS_ALWAYS) \ -- x(fix_reflink_p, 33, 0) \ -- x(set_fs_needs_rebalance, 34, 0) -- --/* We normally enumerate recovery passes in the order we run them: */ --enum bch_recovery_pass { --#define x(n, id, when) BCH_RECOVERY_PASS_##n, -- BCH_RECOVERY_PASSES() --#undef x -- BCH_RECOVERY_PASS_NR --}; -- --/* But we also need stable identifiers that can be used in the superblock */ --enum bch_recovery_pass_stable { --#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, -- BCH_RECOVERY_PASSES() --#undef x -+struct bch_fs_recovery { -+ /* -+ * Two different uses: -+ * "Has this fsck pass?" - i.e. should this type of error be an -+ * emergency read-only -+ * And, in certain situations fsck will rewind to an earlier pass: used -+ * for signaling to the toplevel code which pass we want to run now. -+ */ -+ enum bch_recovery_pass curr_pass; -+ enum bch_recovery_pass next_pass; -+ /* never rewinds version of curr_pass */ -+ enum bch_recovery_pass pass_done; -+ u64 passes_to_run; -+ /* bitmask of recovery passes that we actually ran */ -+ u64 passes_complete; -+ u64 passes_failing; -+ u64 passes_ratelimiting; -+ spinlock_t lock; -+ struct semaphore run_lock; -+ struct work_struct work; - }; - - #endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -index 441e648f28b5..3a13dbcab6ba 100644 ---- a/fs/bcachefs/reflink.c -+++ b/fs/bcachefs/reflink.c -@@ -3,6 +3,7 @@ - #include "bkey_buf.h" - #include "btree_update.h" - #include "buckets.h" -+#include "enumerated_ref.h" - #include "error.h" - #include "extents.h" - #include "inode.h" -@@ -185,12 +186,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, - BUG_ON(missing_start < refd_start); - BUG_ON(missing_end > refd_end); - -- if (fsck_err(trans, reflink_p_to_missing_reflink_v, -- "pointer to missing indirect extent\n" -- " %s\n" -- " missing range %llu-%llu", -- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), -- missing_start, missing_end)) { -+ struct bpos missing_pos = bkey_start_pos(p.k); -+ missing_pos.offset += missing_start - live_start; -+ -+ prt_printf(&buf, "pointer to missing indirect extent in "); -+ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); -+ if (ret) -+ goto err; -+ -+ prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); -+ bch2_bkey_val_to_text(&buf, c, p.s_c); -+ -+ prt_printf(&buf, "\nmissing reflink btree range %llu-%llu", -+ missing_start, missing_end); -+ -+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - ret = PTR_ERR_OR_ZERO(new); - if (ret) -@@ -314,10 +324,10 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); - if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); -- prt_printf(&buf, "\n "); -+ prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - log_fsck_err(trans, reflink_refcount_underflow, -- "indirect extent refcount underflow while marking\n %s", -+ "indirect extent refcount underflow while marking\n%s", - buf.buf); - goto next; - } -@@ -486,7 +496,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, - bool reflink_p_may_update_opts_field) - { - struct bch_fs *c = trans->c; -- struct btree_iter reflink_iter = { NULL }; -+ struct btree_iter reflink_iter = {}; - struct bkey_s_c k; - struct bkey_i *r_v; - struct bkey_i_reflink_p *r_p; -@@ -498,7 +508,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, - - bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_intent); -- k = bch2_btree_iter_peek_prev(&reflink_iter); -+ k = bch2_btree_iter_peek_prev(trans, &reflink_iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -560,12 +570,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, - return ret; - } - --static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) -+static struct bkey_s_c get_next_src(struct btree_trans *trans, -+ struct btree_iter *iter, struct bpos end) - { - struct bkey_s_c k; - int ret; - -- for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { -+ for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { - if (bkey_extent_is_unwritten(k)) - continue; - -@@ -574,7 +585,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) - } - - if (bkey_ge(iter->pos, end)) -- bch2_btree_iter_set_pos(iter, end); -+ bch2_btree_iter_set_pos(trans, iter, end); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; - } - -@@ -597,10 +608,10 @@ s64 bch2_remap_range(struct bch_fs *c, - u64 dst_done = 0; - u32 dst_snapshot, src_snapshot; - bool reflink_p_may_update_opts_field = -- bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); -+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); - int ret = 0, ret2 = 0; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) - return -BCH_ERR_erofs_no_writes; - - bch2_check_set_feature(c, BCH_FEATURE_reflink); -@@ -638,27 +649,27 @@ s64 bch2_remap_range(struct bch_fs *c, - if (ret) - continue; - -- bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); -+ bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); - - ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, - &dst_snapshot); - if (ret) - continue; - -- bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); -+ bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); - - if (dst_inum.inum < src_inum.inum) { - /* Avoid some lock cycle transaction restarts */ -- ret = bch2_btree_iter_traverse(&dst_iter); -+ ret = bch2_btree_iter_traverse(trans, &dst_iter); - if (ret) - continue; - } - - dst_done = dst_iter.pos.offset - dst_start.offset; - src_want = POS(src_start.inode, src_start.offset + dst_done); -- bch2_btree_iter_set_pos(&src_iter, src_want); -+ bch2_btree_iter_set_pos(trans, &src_iter, src_want); - -- src_k = get_next_src(&src_iter, src_end); -+ src_k = get_next_src(trans, &src_iter, src_end); - ret = bkey_err(src_k); - if (ret) - continue; -@@ -729,7 +740,7 @@ s64 bch2_remap_range(struct bch_fs *c, - - do { - struct bch_inode_unpacked inode_u; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter inode_iter = {}; - - bch2_trans_begin(trans); - -@@ -751,7 +762,7 @@ s64 bch2_remap_range(struct bch_fs *c, - bch2_bkey_buf_exit(&new_src, c); - bch2_bkey_buf_exit(&new_dst, c); - -- bch2_write_ref_put(c, BCH_WRITE_REF_reflink); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); - - return dst_done ?: ret ?: ret2; - } -@@ -786,8 +797,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" -- " %s\n" -- " should be %u", -+ "%s\n" -+ "should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); -diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c -index 6992e7469112..2b4b8445d418 100644 ---- a/fs/bcachefs/sb-counters.c -+++ b/fs/bcachefs/sb-counters.c -@@ -5,7 +5,13 @@ - - /* BCH_SB_FIELD_counters */ - --static const char * const bch2_counter_names[] = { -+static const u8 counters_to_stable_map[] = { -+#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+}; -+ -+const char * const bch2_counter_names[] = { - #define x(t, n, ...) (#t), - BCH_PERSISTENT_COUNTERS() - #undef x -@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) - return 0; - - return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; --}; -+} - - static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) - { - return 0; --}; -+} - - static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - -- for (unsigned i = 0; i < nr; i++) -- prt_printf(out, "%s \t%llu\n", -- i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", -- le64_to_cpu(ctrs->d[i])); --}; -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ if (stable < nr) -+ prt_printf(out, "%s \t%llu\n", -+ bch2_counter_names[i], -+ le64_to_cpu(ctrs->d[stable])); -+ } -+} - - int bch2_sb_counters_to_cpu(struct bch_fs *c) - { - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); -- unsigned int i; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); -- u64 val = 0; - -- for (i = 0; i < BCH_COUNTER_NR; i++) -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) - c->counters_on_mount[i] = 0; - -- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { -- val = le64_to_cpu(ctrs->d[i]); -- percpu_u64_set(&c->counters[i], val); -- c->counters_on_mount[i] = val; -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ if (stable < nr) { -+ u64 v = le64_to_cpu(ctrs->d[stable]); -+ percpu_u64_set(&c->counters[i], v); -+ c->counters_on_mount[i] = v; -+ } - } -+ - return 0; --}; -+} - - int bch2_sb_counters_from_cpu(struct bch_fs *c) - { - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - struct bch_sb_field_counters *ret; -- unsigned int i; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_field_resize(&c->disk_sb, counters, -- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); -- -+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - if (ret) { - ctrs = ret; - nr = bch2_sb_counter_nr_entries(ctrs); - } - } - -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ if (stable < nr) -+ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); -+ } - -- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) -- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); - return 0; - } - -@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { - .validate = bch2_sb_counters_validate, - .to_text = bch2_sb_counters_to_text, - }; -+ -+#ifndef NO_BCACHEFS_CHARDEV -+long bch2_ioctl_query_counters(struct bch_fs *c, -+ struct bch_ioctl_query_counters __user *user_arg) -+{ -+ struct bch_ioctl_query_counters arg; -+ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); -+ if (ret) -+ return ret; -+ -+ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || -+ arg.pad) -+ return -EINVAL; -+ -+ arg.nr = min(arg.nr, BCH_COUNTER_NR); -+ ret = put_user(arg.nr, &user_arg->nr); -+ if (ret) -+ return ret; -+ -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ -+ if (stable < arg.nr) { -+ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) -+ ? percpu_u64_get(&c->counters[i]) -+ : c->counters_on_mount[i]; -+ -+ ret = put_user(v, &user_arg->d[stable]); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+#endif -diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h -index 81f8aec9fcb1..a4329ad8dd1b 100644 ---- a/fs/bcachefs/sb-counters.h -+++ b/fs/bcachefs/sb-counters.h -@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); - void bch2_fs_counters_exit(struct bch_fs *); - int bch2_fs_counters_init(struct bch_fs *); - -+extern const char * const bch2_counter_names[]; - extern const struct bch_sb_field_ops bch_sb_field_ops_counters; - -+long bch2_ioctl_query_counters(struct bch_fs *, -+ struct bch_ioctl_query_counters __user *); -+ - #endif // _BCACHEFS_SB_COUNTERS_H -diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h -index fdcf598f08b1..7c0c9c842b4e 100644 ---- a/fs/bcachefs/sb-counters_format.h -+++ b/fs/bcachefs/sb-counters_format.h -@@ -9,10 +9,26 @@ enum counters_flags { - - #define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0, TYPE_SECTORS) \ -+ x(io_read_inline, 80, TYPE_SECTORS) \ -+ x(io_read_hole, 81, TYPE_SECTORS) \ -+ x(io_read_promote, 30, TYPE_COUNTER) \ -+ x(io_read_bounce, 31, TYPE_COUNTER) \ -+ x(io_read_split, 33, TYPE_COUNTER) \ -+ x(io_read_reuse_race, 34, TYPE_COUNTER) \ -+ x(io_read_retry, 32, TYPE_COUNTER) \ -+ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ - x(io_write, 1, TYPE_SECTORS) \ - x(io_move, 2, TYPE_SECTORS) \ -+ x(io_move_read, 35, TYPE_SECTORS) \ -+ x(io_move_write, 36, TYPE_SECTORS) \ -+ x(io_move_finish, 37, TYPE_SECTORS) \ -+ x(io_move_fail, 38, TYPE_COUNTER) \ -+ x(io_move_write_fail, 82, TYPE_COUNTER) \ -+ x(io_move_start_fail, 39, TYPE_COUNTER) \ -+ x(io_move_created_rebalance, 83, TYPE_COUNTER) \ - x(bucket_invalidate, 3, TYPE_COUNTER) \ - x(bucket_discard, 4, TYPE_COUNTER) \ -+ x(bucket_discard_fast, 79, TYPE_COUNTER) \ - x(bucket_alloc, 5, TYPE_COUNTER) \ - x(bucket_alloc_fail, 6, TYPE_COUNTER) \ - x(btree_cache_scan, 7, TYPE_COUNTER) \ -@@ -38,16 +54,6 @@ enum counters_flags { - x(journal_reclaim_finish, 27, TYPE_COUNTER) \ - x(journal_reclaim_start, 28, TYPE_COUNTER) \ - x(journal_write, 29, TYPE_COUNTER) \ -- x(read_promote, 30, TYPE_COUNTER) \ -- x(read_bounce, 31, TYPE_COUNTER) \ -- x(read_split, 33, TYPE_COUNTER) \ -- x(read_retry, 32, TYPE_COUNTER) \ -- x(read_reuse_race, 34, TYPE_COUNTER) \ -- x(move_extent_read, 35, TYPE_SECTORS) \ -- x(move_extent_write, 36, TYPE_SECTORS) \ -- x(move_extent_finish, 37, TYPE_SECTORS) \ -- x(move_extent_fail, 38, TYPE_COUNTER) \ -- x(move_extent_start_fail, 39, TYPE_COUNTER) \ - x(copygc, 40, TYPE_COUNTER) \ - x(copygc_wait, 41, TYPE_COUNTER) \ - x(gc_gens_end, 42, TYPE_COUNTER) \ -@@ -95,6 +101,13 @@ enum bch_persistent_counters { - BCH_COUNTER_NR - }; - -+enum bch_persistent_counters_stable { -+#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+ BCH_COUNTER_STABLE_NR -+}; -+ - struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c -index 051214fdc735..d4e22c43eeb2 100644 ---- a/fs/bcachefs/sb-downgrade.c -+++ b/fs/bcachefs/sb-downgrade.c -@@ -6,12 +6,13 @@ - */ - - #include "bcachefs.h" --#include "darray.h" - #include "recovery_passes.h" - #include "sb-downgrade.h" - #include "sb-errors.h" - #include "super-io.h" - -+#include -+ - #define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) - - /* -@@ -20,6 +21,10 @@ - * x(version, recovery_passes, errors...) - */ - #define UPGRADE_TABLE() \ -+ x(snapshot_2, \ -+ RECOVERY_PASS_ALL_FSCK, \ -+ BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \ -+ BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \ - x(backpointers, \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, \ -@@ -90,7 +95,17 @@ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ -- BCH_FSCK_ERR_accounting_key_junk_at_end) -+ BCH_FSCK_ERR_accounting_key_junk_at_end) \ -+ x(cached_backpointers, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ -+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ -+ x(stripe_backpointers, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ -+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ -+ x(inode_has_case_insensitive, \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ -+ BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ -+ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) - - #define DOWNGRADE_TABLE() \ - x(bucket_stripe_sectors, \ -@@ -364,6 +379,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c) - if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - continue; - -+ if (src->version < c->sb.version_incompat) -+ continue; -+ - struct bch_sb_field_downgrade_entry *dst; - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; - -diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h -index b86ec013d7d7..4036a20c6adc 100644 ---- a/fs/bcachefs/sb-errors_format.h -+++ b/fs/bcachefs/sb-errors_format.h -@@ -5,8 +5,7 @@ - enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, -- FSCK_NO_RATELIMIT = 1 << 2, -- FSCK_AUTOFIX = 1 << 3, -+ FSCK_AUTOFIX = 1 << 2, - }; - - #define BCH_SB_ERRS() \ -@@ -47,7 +46,7 @@ enum bch_fsck_flags { - x(btree_node_unsupported_version, 34, 0) \ - x(btree_node_bset_older_than_sb_min, 35, 0) \ - x(btree_node_bset_newer_than_sb, 36, 0) \ -- x(btree_node_data_missing, 37, 0) \ -+ x(btree_node_data_missing, 37, FSCK_AUTOFIX) \ - x(btree_node_bset_after_end, 38, 0) \ - x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ - x(btree_node_replicas_data_mismatch, 40, 0) \ -@@ -179,6 +178,7 @@ enum bch_fsck_flags { - x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_nonce_mismatch, 162, 0) \ - x(ptr_stripe_redundant, 163, 0) \ -+ x(extent_flags_not_at_start, 306, 0) \ - x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ - x(reflink_v_pos_bad, 292, 0) \ -@@ -205,10 +205,11 @@ enum bch_fsck_flags { - x(snapshot_bad_depth, 184, 0) \ - x(snapshot_bad_skiplist, 185, 0) \ - x(subvol_pos_bad, 186, 0) \ -- x(subvol_not_master_and_not_snapshot, 187, 0) \ -+ x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \ - x(subvol_to_missing_root, 188, 0) \ -- x(subvol_root_wrong_bi_subvol, 189, 0) \ -+ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ - x(bkey_in_missing_snapshot, 190, 0) \ -+ x(bkey_in_deleted_snapshot, 315, 0) \ - x(inode_pos_inode_nonzero, 191, 0) \ - x(inode_pos_blockdev_range, 192, 0) \ - x(inode_alloc_cursor_inode_bad, 301, 0) \ -@@ -216,6 +217,7 @@ enum bch_fsck_flags { - x(inode_str_hash_invalid, 194, 0) \ - x(inode_v3_fields_start_bad, 195, 0) \ - x(inode_snapshot_mismatch, 196, 0) \ -+ x(snapshot_key_missing_inode_snapshot, 314, 0) \ - x(inode_unlinked_but_clean, 197, 0) \ - x(inode_unlinked_but_nlink_nonzero, 198, 0) \ - x(inode_unlinked_and_not_open, 281, 0) \ -@@ -236,6 +238,11 @@ enum bch_fsck_flags { - x(inode_has_child_snapshots_wrong, 287, 0) \ - x(inode_unreachable, 210, FSCK_AUTOFIX) \ - x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ -+ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ -+ x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \ -+ x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ -+ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ -+ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ - x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ - x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ - x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ -@@ -259,6 +266,7 @@ enum bch_fsck_flags { - x(dirent_to_overwritten_inode, 302, 0) \ - x(dirent_to_missing_subvol, 230, 0) \ - x(dirent_to_itself, 231, 0) \ -+ x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \ - x(quota_type_invalid, 232, 0) \ - x(xattr_val_size_too_small, 233, 0) \ - x(xattr_val_size_too_big, 234, 0) \ -@@ -290,14 +298,15 @@ enum bch_fsck_flags { - x(btree_node_bkey_bad_u64s, 260, 0) \ - x(btree_node_topology_empty_interior_node, 261, 0) \ - x(btree_ptr_v2_min_key_bad, 262, 0) \ -- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ -- x(snapshot_node_missing, 264, 0) \ -+ x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ -+ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ - x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ - x(sb_clean_entry_overrun, 267, 0) \ - x(btree_ptr_v2_written_0, 268, 0) \ - x(subvol_snapshot_bad, 269, 0) \ - x(subvol_inode_bad, 270, 0) \ -+ x(subvol_missing, 308, FSCK_AUTOFIX) \ - x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ - x(accounting_mismatch, 272, FSCK_AUTOFIX) \ - x(accounting_replicas_not_marked, 273, 0) \ -@@ -310,11 +319,16 @@ enum bch_fsck_flags { - x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ - x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ -+ x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ - x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ - x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ -- x(MAX, 304, 0) -+ x(dirent_cf_name_too_big, 304, 0) \ -+ x(dirent_stray_data_after_cf_name, 305, 0) \ -+ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ -+ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ -+ x(MAX, 319, 0) - - enum bch_sb_error_id { - #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h -index 40325239c3b0..3b28871d23ed 100644 ---- a/fs/bcachefs/sb-errors_types.h -+++ b/fs/bcachefs/sb-errors_types.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_SB_ERRORS_TYPES_H - #define _BCACHEFS_SB_ERRORS_TYPES_H - --#include "darray.h" -+#include - - struct bch_sb_error_entry_cpu { - u64 id:16, -diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c -index 116131f95815..3398906660a5 100644 ---- a/fs/bcachefs/sb-members.c -+++ b/fs/bcachefs/sb-members.c -@@ -5,19 +5,41 @@ - #include "disk_groups.h" - #include "error.h" - #include "opts.h" -+#include "recovery_passes.h" - #include "replicas.h" - #include "sb-members.h" - #include "super-io.h" - --void bch2_dev_missing(struct bch_fs *c, unsigned dev) -+int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) -+{ -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); -+ bch2_bkey_val_to_text(&buf, c, k); -+ -+ bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); -+ -+ int ret = bch2_run_explicit_recovery_pass(c, &buf, -+ BCH_RECOVERY_PASS_check_allocations, 0); -+ -+ if (print) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) - { - if (dev != BCH_SB_MEMBER_INVALID) - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); - } - --void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) -+void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) - { -- bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); -+ bch2_fs_inconsistent(ca->fs, -+ "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", -+ bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets); - } - - #define x(t, n, ...) [n] = #t, -@@ -117,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - -+ if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { -+ bch2_sb_field_resize(disk_sb, members_v1, 0); -+ return 0; -+ } -+ - mi1 = bch2_sb_field_resize(disk_sb, members_v1, - DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * - disk_sb->sb->nr_devices, sizeof(u64))); -@@ -168,6 +195,12 @@ static int validate_member(struct printbuf *err, - return -BCH_ERR_invalid_sb_members; - } - -+ if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && -+ sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { -+ prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ - return 0; - } - -@@ -189,17 +222,11 @@ static void member_to_text(struct printbuf *out, - printbuf_indent_add(out, 2); - - prt_printf(out, "Label:\t"); -- if (BCH_MEMBER_GROUP(&m)) { -- unsigned idx = BCH_MEMBER_GROUP(&m) - 1; -- -- if (idx < disk_groups_nr(gi)) -- prt_printf(out, "%s (%u)", -- gi->entries[idx].label, idx); -- else -- prt_printf(out, "(bad disk labels section)"); -- } else { -+ if (BCH_MEMBER_GROUP(&m)) -+ bch2_disk_path_to_text_sb(out, sb, -+ BCH_MEMBER_GROUP(&m) - 1); -+ else - prt_printf(out, "(none)"); -- } - prt_newline(out); - - prt_printf(out, "UUID:\t"); -@@ -266,6 +293,7 @@ static void member_to_text(struct printbuf *out, - - prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); - prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); -+ prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); - - printbuf_indent_sub(out, 2); - } -@@ -491,6 +519,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) - unsigned u64s; - int best = -1; - u64 best_last_mount = 0; -+ unsigned nr_deleted = 0; - - if (dev_idx < BCH_SB_MEMBERS_MAX) - goto have_slot; -@@ -501,7 +530,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) - continue; - - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); -- if (bch2_member_alive(&m)) -+ -+ nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); -+ -+ if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) - continue; - - u64 last_mount = le64_to_cpu(m.last_mount); -@@ -515,6 +547,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) - goto have_slot; - } - -+ if (nr_deleted) -+ bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", -+ nr_deleted); -+ - return -BCH_ERR_ENOSPC_sb_members; - have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); -@@ -530,3 +566,22 @@ int bch2_sb_member_alloc(struct bch_fs *c) - c->disk_sb.sb->nr_devices = nr_devices; - return dev_idx; - } -+ -+void bch2_sb_members_clean_deleted(struct bch_fs *c) -+{ -+ mutex_lock(&c->sb_lock); -+ bool write_sb = false; -+ -+ for (unsigned i = 0; i < c->sb.nr_devices; i++) { -+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); -+ -+ if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { -+ memset(&m->uuid, 0, sizeof(m->uuid)); -+ write_sb = true; -+ } -+ } -+ -+ if (write_sb) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+} -diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h -index 762083b564ee..09b751a75020 100644 ---- a/fs/bcachefs/sb-members.h -+++ b/fs/bcachefs/sb-members.h -@@ -2,8 +2,10 @@ - #ifndef _BCACHEFS_SB_MEMBERS_H - #define _BCACHEFS_SB_MEMBERS_H - --#include "darray.h" - #include "bkey_types.h" -+#include "enumerated_ref.h" -+ -+#include - - extern char * const bch2_member_error_strs[]; - -@@ -20,10 +22,22 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); - - static inline bool bch2_dev_is_online(struct bch_dev *ca) - { -- return !percpu_ref_is_zero(&ca->io_ref); -+ return !enumerated_ref_is_zero(&ca->io_ref[READ]); -+} -+ -+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); -+ -+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) -+{ -+ rcu_read_lock(); -+ struct bch_dev *ca = bch2_dev_rcu(c, dev); -+ bool ret = ca && bch2_dev_is_online(ca); -+ rcu_read_unlock(); -+ -+ return ret; - } - --static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+static inline bool bch2_dev_is_healthy(struct bch_dev *ca) - { - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -@@ -92,6 +106,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * - for (struct bch_dev *_ca = NULL; \ - (_ca = __bch2_next_dev((_c), _ca, (_mask)));) - -+#define for_each_online_member_rcu(_c, _ca) \ -+ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) -+ -+#define for_each_rw_member_rcu(_c, _ca) \ -+ for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) -+ - static inline void bch2_dev_get(struct bch_dev *ca) - { - #ifdef CONFIG_BCACHEFS_DEBUG -@@ -144,33 +164,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev - - static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - struct bch_dev *ca, -- unsigned state_mask) -+ unsigned state_mask, -+ int rw, unsigned ref_idx) - { - rcu_read_lock(); - if (ca) -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[rw], ref_idx); - - while ((ca = __bch2_next_dev(c, ca, NULL)) && - (!((1 << ca->mi.state) & state_mask) || -- !percpu_ref_tryget(&ca->io_ref))) -+ !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) - ; - rcu_read_unlock(); - - return ca; - } - --#define __for_each_online_member(_c, _ca, state_mask) \ -+#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ - for (struct bch_dev *_ca = NULL; \ -- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) -+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) - --#define for_each_online_member(c, ca) \ -- __for_each_online_member(c, ca, ~0) -+#define for_each_online_member(c, ca, ref_idx) \ -+ __for_each_online_member(c, ca, ~0, READ, ref_idx) - --#define for_each_rw_member(c, ca) \ -- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) -+#define for_each_rw_member(c, ca, ref_idx) \ -+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) - --#define for_each_readable_member(c, ca) \ -- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) -+#define for_each_readable_member(c, ca, ref_idx) \ -+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) - - static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) - { -@@ -205,13 +226,15 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de - : NULL; - } - --void bch2_dev_missing(struct bch_fs *, unsigned); -+int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); -+ -+void bch2_dev_missing_atomic(struct bch_fs *, unsigned); - - static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) - { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (unlikely(!ca)) -- bch2_dev_missing(c, dev); -+ bch2_dev_missing_atomic(c, dev); - return ca; - } - -@@ -229,27 +252,30 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) - { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); - if (unlikely(!ca)) -- bch2_dev_missing(c, dev); -+ bch2_dev_missing_atomic(c, dev); - return ca; - } - - static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) - { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); -- if (ca && !bucket_valid(ca, bucket.offset)) { -+ if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_put(ca); - ca = NULL; - } - return ca; - } - --void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); -+void bch2_dev_bucket_missing(struct bch_dev *, u64); - - static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) - { -- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); -- if (!ca) -- bch2_dev_bucket_missing(c, bucket); -+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); -+ if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { -+ bch2_dev_bucket_missing(ca, bucket.offset); -+ bch2_dev_put(ca); -+ ca = NULL; -+ } - return ca; - } - -@@ -269,11 +295,14 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev - return bch2_dev_tryget(c, dev_idx); - } - --static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) -+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, -+ int rw, unsigned ref_idx) - { -+ might_sleep(); -+ - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); -- if (ca && !percpu_ref_tryget(&ca->io_ref)) -+ if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) - ca = NULL; - rcu_read_unlock(); - -@@ -283,27 +312,17 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, - return ca; - - if (ca) -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[rw], ref_idx); - return NULL; - } - --/* XXX kill, move to struct bch_fs */ --static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) --{ -- struct bch_devs_mask devs; -- -- memset(&devs, 0, sizeof(devs)); -- for_each_online_member(c, ca) -- __set_bit(ca->dev_idx, devs.d); -- return devs; --} -- - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; - - static inline bool bch2_member_alive(struct bch_member *m) - { -- return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); -+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && -+ !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); - } - - static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) -@@ -333,6 +352,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) - ? BCH_MEMBER_DURABILITY(mi) - 1 - : 1, - .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), -+ .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), - .valid = bch2_member_alive(mi), - .btree_bitmap_shift = mi->btree_bitmap_shift, - .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), -@@ -363,5 +383,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); - void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); - - int bch2_sb_member_alloc(struct bch_fs *); -+void bch2_sb_members_clean_deleted(struct bch_fs *); - - #endif /* _BCACHEFS_SB_MEMBERS_H */ -diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h -index 2adf1221a440..fb72ad730518 100644 ---- a/fs/bcachefs/sb-members_format.h -+++ b/fs/bcachefs/sb-members_format.h -@@ -13,6 +13,10 @@ - */ - #define BCH_SB_MEMBER_INVALID 255 - -+#define BCH_SB_MEMBER_DELETED_UUID \ -+ UUID_INIT(0xffffffff, 0xffff, 0xffff, \ -+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -+ - #define BCH_MIN_NR_NBUCKETS (1 << 6) - - #define BCH_IOPS_MEASUREMENTS() \ -@@ -79,6 +83,7 @@ struct bch_member { - - #define BCH_MEMBER_V1_BYTES 56 - -+LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16) - LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) - /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ - LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -@@ -87,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) - LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) - LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) -+LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, -+ struct bch_member, flags, 31, 32) - - #if 0 - LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h -index c0eda888fe39..d6443e186872 100644 ---- a/fs/bcachefs/sb-members_types.h -+++ b/fs/bcachefs/sb-members_types.h -@@ -13,6 +13,7 @@ struct bch_member_cpu { - u8 data_allowed; - u8 durability; - u8 freespace_initialized; -+ u8 resize_on_mount; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c -index c54091a28909..00d62d1190ef 100644 ---- a/fs/bcachefs/snapshot.c -+++ b/fs/bcachefs/snapshot.c -@@ -1,11 +1,13 @@ - // SPDX-License-Identifier: GPL-2.0 - - #include "bcachefs.h" -+#include "bbpos.h" - #include "bkey_buf.h" - #include "btree_cache.h" - #include "btree_key_cache.h" - #include "btree_update.h" - #include "buckets.h" -+#include "enumerated_ref.h" - #include "errcode.h" - #include "error.h" - #include "fs.h" -@@ -141,13 +143,14 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) - rcu_read_lock(); - struct snapshot_table *t = rcu_dereference(c->snapshots); - -- if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { -+ if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) { - ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); - goto out; - } - -- while (id && id < ancestor - IS_ANCESTOR_BITMAP) -- id = get_ancestor_below(t, id, ancestor); -+ if (likely(ancestor >= IS_ANCESTOR_BITMAP)) -+ while (id && id < ancestor - IS_ANCESTOR_BITMAP) -+ id = get_ancestor_below(t, id, ancestor); - - ret = id && id < ancestor - ? test_ancestor_bitmap(t, id, ancestor) -@@ -208,9 +211,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - -- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", -- BCH_SNAPSHOT_SUBVOL(s.v), -- BCH_SNAPSHOT_DELETED(s.v), -+ if (BCH_SNAPSHOT_SUBVOL(s.v)) -+ prt_str(out, "subvol "); -+ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) -+ prt_str(out, "will_delete "); -+ if (BCH_SNAPSHOT_DELETED(s.v)) -+ prt_str(out, "deleted "); -+ -+ prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", - le32_to_cpu(s.v->parent), - le32_to_cpu(s.v->children[0]), - le32_to_cpu(s.v->children[1]), -@@ -280,6 +288,16 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - return ret; - } - -+static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) -+{ -+ mutex_lock(&c->snapshot_table_lock); -+ int ret = snapshot_t_mut(c, id) -+ ? 0 -+ : -BCH_ERR_ENOMEM_mark_snapshot; -+ mutex_unlock(&c->snapshot_table_lock); -+ return ret; -+} -+ - static int __bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, -@@ -301,7 +319,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - -- t->live = true; -+ t->state = !BCH_SNAPSHOT_DELETED(s.v) -+ ? SNAPSHOT_ID_live -+ : SNAPSHOT_ID_deleted; - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); -@@ -326,9 +346,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); - -- if (BCH_SNAPSHOT_DELETED(s.v)) { -+ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); -- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) -+ if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) - bch2_delete_dead_snapshots_async(c); - } - } else { -@@ -389,22 +409,31 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) - return 0; - } - --static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -+u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, -+ snapshot_id_list *skip) - { -- u32 id = snapshot_root; -- u32 subvol = 0, s; -- -+ u32 id, subvol = 0, s; -+retry: -+ id = snapshot_root; - rcu_read_lock(); -- while (id) { -- s = snapshot_t(c, id)->subvol; -- -- if (s && (!subvol || s < subvol)) -- subvol = s; -+ while (id && bch2_snapshot_exists(c, id)) { -+ if (!(skip && snapshot_list_has_id(skip, id))) { -+ s = snapshot_t(c, id)->subvol; - -+ if (s && (!subvol || s < subvol)) -+ subvol = s; -+ } - id = bch2_snapshot_tree_next(c, id); -+ if (id == snapshot_root) -+ break; - } - rcu_read_unlock(); - -+ if (!subvol && skip) { -+ skip = NULL; -+ goto retry; -+ } -+ - return subvol; - } - -@@ -436,7 +465,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - if (!ret && !found) { - struct bkey_i_subvolume *u; - -- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); -+ *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); - - u = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), -@@ -484,7 +513,7 @@ static int check_snapshot_tree(struct btree_trans *trans, - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - trans, snapshot_tree_to_missing_snapshot, -- "snapshot tree points to missing/incorrect snapshot:\n %s", -+ "snapshot tree points to missing/incorrect snapshot:\n%s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), - prt_newline(&buf), - ret -@@ -504,19 +533,19 @@ static int check_snapshot_tree(struct btree_trans *trans, - - if (fsck_err_on(ret, - trans, snapshot_tree_to_missing_subvol, -- "snapshot tree points to missing subvolume:\n %s", -+ "snapshot tree points to missing subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor(c, - le32_to_cpu(subvol.snapshot), - root_id), - trans, snapshot_tree_to_wrong_subvol, -- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", -+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - trans, snapshot_tree_to_snapshot_subvol, -- "snapshot tree points to snapshot subvolume:\n %s", -+ "snapshot tree points to snapshot subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - struct bkey_i_snapshot_tree *u; -@@ -653,7 +682,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, - u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u) ?: - bch2_snapshot_tree_create(trans, root_id, -- bch2_snapshot_tree_oldest_subvol(c, root_id), -+ bch2_snapshot_oldest_subvol(c, root_id, NULL), - &tree_id); - if (ret) - goto err; -@@ -698,6 +727,9 @@ static int check_snapshot(struct btree_trans *trans, - memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); - -+ if (BCH_SNAPSHOT_DELETED(&s)) -+ return 0; -+ - id = le32_to_cpu(s.parent); - if (id) { - ret = bch2_snapshot_lookup(trans, id, &v); -@@ -735,7 +767,7 @@ static int check_snapshot(struct btree_trans *trans, - } - - bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && -- !BCH_SNAPSHOT_DELETED(&s); -+ !BCH_SNAPSHOT_WILL_DELETE(&s); - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); -@@ -755,7 +787,7 @@ static int check_snapshot(struct btree_trans *trans, - } else { - if (fsck_err_on(s.subvol, - trans, snapshot_should_not_have_subvol, -- "snapshot should not point to subvol:\n %s", -+ "snapshot should not point to subvol:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); -@@ -773,7 +805,7 @@ static int check_snapshot(struct btree_trans *trans, - - if (fsck_err_on(!ret, - trans, snapshot_to_bad_snapshot_tree, -- "snapshot points to missing/incorrect tree:\n %s", -+ "snapshot points to missing/incorrect tree:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, k, &s); - if (ret) -@@ -785,7 +817,7 @@ static int check_snapshot(struct btree_trans *trans, - - if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, - trans, snapshot_bad_depth, -- "snapshot with incorrect depth field, should be %u:\n %s", -+ "snapshot with incorrect depth field, should be %u:\n%s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); -@@ -802,7 +834,7 @@ static int check_snapshot(struct btree_trans *trans, - - if (fsck_err_on(!ret, - trans, snapshot_bad_skiplist, -- "snapshot with bad skiplist field:\n %s", -+ "snapshot with bad skiplist field:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); -@@ -842,9 +874,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) - { - struct bch_fs *c = trans->c; - -- if (bch2_snapshot_exists(c, id)) -- return 0; -- - /* Do we need to reconstruct the snapshot_tree entry as well? */ - struct btree_iter iter; - struct bkey_s_c k; -@@ -889,9 +918,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) - } - bch2_trans_iter_exit(trans, &iter); - -- return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: -- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, -- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); -+ return bch2_snapshot_table_make_room(c, id) ?: -+ bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); - } - - /* Figure out which snapshot nodes belong in the same tree: */ -@@ -989,7 +1017,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) - snapshot_id_list_to_text(&buf, t); - - darray_for_each(*t, id) { -- if (fsck_err_on(!bch2_snapshot_exists(c, *id), -+ if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, - trans, snapshot_node_missing, - "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { - if (t->nr > 1) { -@@ -1014,22 +1042,38 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) - return ret; - } - --int bch2_check_key_has_snapshot(struct btree_trans *trans, -- struct btree_iter *iter, -- struct bkey_s_c k) -+int __bch2_check_key_has_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) - { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; -+ enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); -+ -+ /* Snapshot was definitively deleted, this error is marked autofix */ -+ if (fsck_err_on(state == SNAPSHOT_ID_deleted, -+ trans, bkey_in_deleted_snapshot, -+ "key in deleted snapshot %s, delete?", -+ (bch2_btree_id_to_text(&buf, iter->btree_id), -+ prt_char(&buf, ' '), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -+ ret = bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_internal_snapshot_node) ?: 1; - -- if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), -+ /* -+ * Snapshot missing: we should have caught this with btree_lost_data and -+ * kicked off reconstruct_snapshots, so if we end up here we have no -+ * idea what happened: -+ */ -+ if (fsck_err_on(state == SNAPSHOT_ID_empty, - trans, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, -- BTREE_UPDATE_internal_snapshot_node) ?: 1; -+ BTREE_UPDATE_internal_snapshot_node) ?: 1; - fsck_err: - printbuf_exit(&buf); - return ret; -@@ -1053,10 +1097,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) - } - - /* already deleted? */ -- if (BCH_SNAPSHOT_DELETED(&s->v)) -+ if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) - goto err; - -- SET_BCH_SNAPSHOT_DELETED(&s->v, true); -+ SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; - err: -@@ -1073,27 +1117,28 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) - static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; -- struct btree_iter c_iter = (struct btree_iter) { NULL }; -- struct btree_iter tree_iter = (struct btree_iter) { NULL }; -- struct bkey_s_c_snapshot s; -+ struct btree_iter iter, p_iter = {}; -+ struct btree_iter c_iter = {}; -+ struct btree_iter tree_iter = {}; - u32 parent_id, child_id; - unsigned i; - int ret = 0; - -- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), -- BTREE_ITER_intent, snapshot); -- ret = bkey_err(s); -+ struct bkey_i_snapshot *s = -+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_intent, snapshot); -+ ret = PTR_ERR_OR_ZERO(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - -- BUG_ON(s.v->children[1]); -+ BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); -+ BUG_ON(s->v.children[1]); - -- parent_id = le32_to_cpu(s.v->parent); -- child_id = le32_to_cpu(s.v->children[0]); -+ parent_id = le32_to_cpu(s->v.parent); -+ child_id = le32_to_cpu(s->v.children[0]); - - if (parent_id) { - struct bkey_i_snapshot *parent; -@@ -1151,24 +1196,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) - */ - struct bkey_i_snapshot_tree *s_t; - -- BUG_ON(s.v->children[1]); -+ BUG_ON(s->v.children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, -- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), -+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - -- if (s.v->children[0]) { -- s_t->v.root_snapshot = s.v->children[0]; -+ if (s->v.children[0]) { -+ s_t->v.root_snapshot = s->v.children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - -- ret = bch2_btree_delete_at(trans, &iter, 0); -+ if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { -+ SET_BCH_SNAPSHOT_DELETED(&s->v, true); -+ s->v.parent = 0; -+ s->v.children[0] = 0; -+ s->v.children[1] = 0; -+ s->v.subvol = 0; -+ s->v.tree = 0; -+ s->v.depth = 0; -+ s->v.skip[0] = 0; -+ s->v.skip[1] = 0; -+ s->v.skip[2] = 0; -+ } else { -+ s->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&s->k, 0); -+ } - err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); -@@ -1192,13 +1251,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_intent); -- k = bch2_btree_iter_peek(&iter); -+ k = bch2_btree_iter_peek(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { -- k = bch2_btree_iter_prev_slot(&iter); -+ k = bch2_btree_iter_prev_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; -@@ -1338,12 +1397,6 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - * that key to snapshot leaf nodes, where we can mutate it - */ - --struct snapshot_interior_delete { -- u32 id; -- u32 live_child; --}; --typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; -- - static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) - { - darray_for_each(*l, i) -@@ -1377,28 +1430,34 @@ static unsigned __live_child(struct snapshot_table *t, u32 id, - return 0; - } - --static unsigned live_child(struct bch_fs *c, u32 id, -- snapshot_id_list *delete_leaves, -- interior_delete_list *delete_interior) -+static unsigned live_child(struct bch_fs *c, u32 id) - { -+ struct snapshot_delete *d = &c->snapshot_delete; -+ - rcu_read_lock(); - u32 ret = __live_child(rcu_dereference(c->snapshots), id, -- delete_leaves, delete_interior); -+ &d->delete_leaves, &d->delete_interior); - rcu_read_unlock(); - return ret; - } - -+static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) -+{ -+ return snapshot_list_has_id(&d->delete_leaves, id) || -+ interior_delete_has_id(&d->delete_interior, id) != 0; -+} -+ - static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, -- struct bkey_s_c k, -- snapshot_id_list *delete_leaves, -- interior_delete_list *delete_interior) -+ struct bkey_s_c k) - { -- if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) -+ struct snapshot_delete *d = &trans->c->snapshot_delete; -+ -+ if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - -- u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); -+ u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); - if (live_child) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); -@@ -1429,49 +1488,208 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, - return 0; - } - -+static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_delete *d = &c->snapshot_delete; -+ -+ u64 inum = iter->btree_id != BTREE_ID_inodes -+ ? iter->pos.inode -+ : iter->pos.offset; -+ -+ if (*prev_inum == inum) -+ return false; -+ -+ *prev_inum = inum; -+ -+ bool ret = !snapshot_list_has_id(&d->deleting_from_trees, -+ bch2_snapshot_tree(c, iter->pos.snapshot)); -+ if (unlikely(ret)) { -+ struct bpos pos = iter->pos; -+ pos.snapshot = 0; -+ if (iter->btree_id != BTREE_ID_inodes) -+ pos.offset = U64_MAX; -+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); -+ } -+ -+ return ret; -+} -+ -+static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_delete *d = &c->snapshot_delete; -+ -+ for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { -+ struct disk_reservation res = { 0 }; -+ u64 prev_inum = 0; -+ -+ d->pos.pos = POS_MIN; -+ -+ if (!btree_type_has_snapshots(d->pos.btree)) -+ continue; -+ -+ int ret = for_each_btree_key_commit(trans, iter, -+ d->pos.btree, POS_MIN, -+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, -+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ d->pos.pos = iter.pos; -+ -+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) -+ continue; -+ -+ delete_dead_snapshots_process_key(trans, &iter, k); -+ })); -+ -+ bch2_disk_reservation_put(c, &res); -+ -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, -+ struct bpos start, struct bpos end) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_delete *d = &c->snapshot_delete; -+ struct disk_reservation res = { 0 }; -+ -+ d->pos.btree = btree; -+ d->pos.pos = POS_MIN; -+ -+ int ret = for_each_btree_key_max_commit(trans, iter, -+ btree, start, end, -+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, -+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ d->pos.pos = iter.pos; -+ delete_dead_snapshots_process_key(trans, &iter, k); -+ })); -+ -+ bch2_disk_reservation_put(c, &res); -+ return ret; -+} -+ -+static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_delete *d = &c->snapshot_delete; -+ struct disk_reservation res = { 0 }; -+ u64 prev_inum = 0; -+ int ret = 0; -+ -+ struct btree_iter iter; -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); -+ -+ while (1) { -+ struct bkey_s_c k; -+ ret = lockrestart_do(trans, -+ bkey_err(k = bch2_btree_iter_peek(trans, &iter))); -+ if (ret) -+ break; -+ -+ if (!k.k) -+ break; -+ -+ d->pos.btree = iter.btree_id; -+ d->pos.pos = iter.pos; -+ -+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) -+ continue; -+ -+ if (snapshot_id_dying(d, k.k->p.snapshot)) { -+ struct bpos start = POS(k.k->p.offset, 0); -+ struct bpos end = POS(k.k->p.offset, U64_MAX); -+ -+ ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: -+ delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: -+ delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); -+ } else { -+ bch2_btree_iter_advance(trans, &iter); -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret) -+ goto err; -+ -+ prev_inum = 0; -+ ret = for_each_btree_key_commit(trans, iter, -+ BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, -+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ d->pos.btree = iter.btree_id; -+ d->pos.pos = iter.pos; -+ -+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) -+ continue; -+ -+ delete_dead_snapshots_process_key(trans, &iter, k); -+ })); -+err: -+ bch2_disk_reservation_put(c, &res); -+ return ret; -+} -+ - /* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ --static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, -- snapshot_id_list *delete_leaves, -- interior_delete_list *delete_interior) -+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) - { - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bch_fs *c = trans->c; -+ struct snapshot_delete *d = &c->snapshot_delete; - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - unsigned live_children = 0; -+ int ret = 0; - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - return 0; - -+ if (BCH_SNAPSHOT_DELETED(s.v)) -+ return 0; -+ -+ mutex_lock(&d->progress_lock); - for (unsigned i = 0; i < 2; i++) { - u32 child = le32_to_cpu(s.v->children[i]); - - live_children += child && -- !snapshot_list_has_id(delete_leaves, child); -+ !snapshot_list_has_id(&d->delete_leaves, child); - } - -+ u32 tree = bch2_snapshot_tree(c, s.k->p.offset); -+ - if (live_children == 0) { -- return snapshot_list_add(c, delete_leaves, s.k->p.offset); -+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: -+ snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); - } else if (live_children == 1) { -- struct snapshot_interior_delete d = { -+ struct snapshot_interior_delete n = { - .id = s.k->p.offset, -- .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), -+ .live_child = live_child(c, s.k->p.offset), - }; - -- if (!d.live_child) { -- bch_err(c, "error finding live child of snapshot %u", d.id); -- return -EINVAL; -+ if (!n.live_child) { -+ bch_err(c, "error finding live child of snapshot %u", n.id); -+ ret = -EINVAL; -+ } else { -+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: -+ darray_push(&d->delete_interior, n); - } -- -- return darray_push(delete_interior, d); -- } else { -- return 0; - } -+ mutex_unlock(&d->progress_lock); -+ -+ return ret; - } - - static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, -@@ -1500,6 +1718,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - struct bkey_i_snapshot *s; - int ret; - -+ if (!bch2_snapshot_exists(c, k.k->p.offset)) -+ return 0; -+ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - -@@ -1547,39 +1768,56 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - return bch2_trans_update(trans, iter, &s->k_i, 0); - } - --int bch2_delete_dead_snapshots(struct bch_fs *c) -+static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) - { -- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) -+ prt_printf(out, "deleting from trees"); -+ darray_for_each(d->deleting_from_trees, i) -+ prt_printf(out, " %u", *i); -+ -+ prt_printf(out, "deleting leaves"); -+ darray_for_each(d->delete_leaves, i) -+ prt_printf(out, " %u", *i); -+ prt_newline(out); -+ -+ prt_printf(out, "interior"); -+ darray_for_each(d->delete_interior, i) -+ prt_printf(out, " %u->%u", i->id, i->live_child); -+ prt_newline(out); -+} -+ -+int __bch2_delete_dead_snapshots(struct bch_fs *c) -+{ -+ struct snapshot_delete *d = &c->snapshot_delete; -+ int ret = 0; -+ -+ if (!mutex_trylock(&d->lock)) - return 0; - -+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) -+ goto out_unlock; -+ - struct btree_trans *trans = bch2_trans_get(c); -- snapshot_id_list delete_leaves = {}; -- interior_delete_list delete_interior = {}; -- int ret = 0; - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ -+ d->running = true; -+ d->pos = BBPOS_MIN; -+ - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, -- check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); -+ check_should_delete_snapshot(trans, k)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; - -- if (!delete_leaves.nr && !delete_interior.nr) -+ if (!d->delete_leaves.nr && !d->delete_interior.nr) - goto err; - - { - struct printbuf buf = PRINTBUF; -- prt_printf(&buf, "deleting leaves"); -- darray_for_each(delete_leaves, i) -- prt_printf(&buf, " %u", *i); -- -- prt_printf(&buf, " interior"); -- darray_for_each(delete_interior, i) -- prt_printf(&buf, " %u->%u", i->id, i->live_child); -+ bch2_snapshot_delete_nodes_to_text(&buf, d); - - ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); - printbuf_exit(&buf); -@@ -1587,29 +1825,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) - goto err; - } - -- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { -- struct disk_reservation res = { 0 }; -- -- if (!btree_type_has_snapshots(btree)) -- continue; -- -- ret = for_each_btree_key_commit(trans, iter, -- btree, POS_MIN, -- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, -- &res, NULL, BCH_TRANS_COMMIT_no_enospc, -- delete_dead_snapshots_process_key(trans, &iter, k, -- &delete_leaves, -- &delete_interior)); -- -- bch2_disk_reservation_put(c, &res); -- -- if (!bch2_err_matches(ret, EROFS)) -- bch_err_msg(c, ret, "deleting keys from dying snapshots"); -- if (ret) -- goto err; -- } -+ ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) -+ ? delete_dead_snapshot_keys_v2(trans) -+ : delete_dead_snapshot_keys_v1(trans); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch_err_msg(c, ret, "deleting keys from dying snapshots"); -+ if (ret) -+ goto err; - -- darray_for_each(delete_leaves, i) { -+ darray_for_each(d->delete_leaves, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - if (!bch2_err_matches(ret, EROFS)) -@@ -1626,11 +1850,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); -+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); - if (ret) - goto err; - -- darray_for_each(delete_interior, i) { -+ darray_for_each(d->delete_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, i->id)); - if (!bch2_err_matches(ret, EROFS)) -@@ -1639,33 +1863,66 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) - goto err; - } - err: -- darray_exit(&delete_interior); -- darray_exit(&delete_leaves); -+ mutex_lock(&d->progress_lock); -+ darray_exit(&d->deleting_from_trees); -+ darray_exit(&d->delete_interior); -+ darray_exit(&d->delete_leaves); -+ d->running = false; -+ mutex_unlock(&d->progress_lock); - bch2_trans_put(trans); -+out_unlock: -+ mutex_unlock(&d->lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; - } - -+int bch2_delete_dead_snapshots(struct bch_fs *c) -+{ -+ if (!c->opts.auto_snapshot_deletion) -+ return 0; -+ -+ return __bch2_delete_dead_snapshots(c); -+} -+ - void bch2_delete_dead_snapshots_work(struct work_struct *work) - { -- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); -+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); - - set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); - - bch2_delete_dead_snapshots(c); -- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); - } - - void bch2_delete_dead_snapshots_async(struct bch_fs *c) - { -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) -+ if (!c->opts.auto_snapshot_deletion) -+ return; -+ -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) - return; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - -- if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) -- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+ if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work)) -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct snapshot_delete *d = &c->snapshot_delete; -+ -+ if (!d->running) { -+ prt_str(out, "(not running)"); -+ return; -+ } -+ -+ mutex_lock(&d->progress_lock); -+ bch2_snapshot_delete_nodes_to_text(out, d); -+ -+ bch2_bbpos_to_text(out, d->pos); -+ mutex_unlock(&d->progress_lock); - } - - int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, -@@ -1706,7 +1963,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct - return 0; - - struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); -- if (BCH_SNAPSHOT_DELETED(snap.v) || -+ if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || - interior_snapshot_needs_delete(snap)) - set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - -@@ -1735,10 +1992,6 @@ int bch2_snapshots_read(struct bch_fs *c) - BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && - test_bit(BCH_FS_may_go_rw, &c->flags)); - -- if (bch2_err_matches(ret, EIO) || -- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) -- ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); -- - return ret; - } - -@@ -1746,3 +1999,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) - { - kvfree(rcu_dereference_protected(c->snapshots, true)); - } -+ -+void bch2_fs_snapshots_init_early(struct bch_fs *c) -+{ -+ INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); -+ mutex_init(&c->snapshot_delete.lock); -+ mutex_init(&c->snapshot_delete.progress_lock); -+ mutex_init(&c->snapshots_unlinked_lock); -+} -diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h -index 00373cf32e7b..382a171f5413 100644 ---- a/fs/bcachefs/snapshot.h -+++ b/fs/bcachefs/snapshot.h -@@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) - return id; - } - -+u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); - u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); - - static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -@@ -119,21 +120,26 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) - return id; - } - --static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) -+static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) - { - const struct snapshot_t *s = snapshot_t(c, id); -- return s ? s->live : 0; -+ return s ? s->state : SNAPSHOT_ID_empty; - } - --static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) -+static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) - { - rcu_read_lock(); -- bool ret = __bch2_snapshot_exists(c, id); -+ enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); - rcu_read_unlock(); - - return ret; - } - -+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; -+} -+ - static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) - { - rcu_read_lock(); -@@ -240,10 +246,19 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, - int bch2_check_snapshot_trees(struct bch_fs *); - int bch2_check_snapshots(struct bch_fs *); - int bch2_reconstruct_snapshots(struct bch_fs *); --int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); -+ -+int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); -+ -+static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) -+ ? 0 -+ : __bch2_check_key_has_snapshot(trans, iter, k); -+} - - int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); --void bch2_delete_dead_snapshots_work(struct work_struct *); - - int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); - -@@ -258,7 +273,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - return __bch2_key_has_snapshot_overwrites(trans, id, pos); - } - -+int __bch2_delete_dead_snapshots(struct bch_fs *); -+int bch2_delete_dead_snapshots(struct bch_fs *); -+void bch2_delete_dead_snapshots_work(struct work_struct *); -+void bch2_delete_dead_snapshots_async(struct bch_fs *); -+void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); -+ - int bch2_snapshots_read(struct bch_fs *); - void bch2_fs_snapshots_exit(struct bch_fs *); -+void bch2_fs_snapshots_init_early(struct bch_fs *); - - #endif /* _BCACHEFS_SNAPSHOT_H */ -diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h -index aabcd3a74cd9..9bccae1f3590 100644 ---- a/fs/bcachefs/snapshot_format.h -+++ b/fs/bcachefs/snapshot_format.h -@@ -15,10 +15,10 @@ struct bch_snapshot { - bch_le128 btime; - }; - --LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) -- -+LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) - /* True if a subvolume points to this snapshot node: */ - LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) -+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) - - /* - * Snapshot trees: -diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h -new file mode 100644 -index 000000000000..31f96d1cf5f4 ---- /dev/null -+++ b/fs/bcachefs/snapshot_types.h -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SNAPSHOT_TYPES_H -+#define _BCACHEFS_SNAPSHOT_TYPES_H -+ -+#include "bbpos_types.h" -+#include "subvolume_types.h" -+ -+#include -+ -+typedef DARRAY(u32) snapshot_id_list; -+ -+#define IS_ANCESTOR_BITMAP 128 -+ -+struct snapshot_t { -+ enum snapshot_id_state { -+ SNAPSHOT_ID_empty, -+ SNAPSHOT_ID_live, -+ SNAPSHOT_ID_deleted, -+ } state; -+ u32 parent; -+ u32 skip[3]; -+ u32 depth; -+ u32 children[2]; -+ u32 subvol; /* Nonzero only if a subvolume points to this node: */ -+ u32 tree; -+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -+}; -+ -+struct snapshot_table { -+ struct rcu_head rcu; -+ size_t nr; -+#ifndef RUST_BINDGEN -+ DECLARE_FLEX_ARRAY(struct snapshot_t, s); -+#else -+ struct snapshot_t s[0]; -+#endif -+}; -+ -+struct snapshot_interior_delete { -+ u32 id; -+ u32 live_child; -+}; -+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; -+ -+struct snapshot_delete { -+ struct mutex lock; -+ struct work_struct work; -+ -+ struct mutex progress_lock; -+ snapshot_id_list deleting_from_trees; -+ snapshot_id_list delete_leaves; -+ interior_delete_list delete_interior; -+ -+ bool running; -+ struct bbpos pos; -+}; -+ -+#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ -diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c -index d78451c2a0c6..0cbf5508a32c 100644 ---- a/fs/bcachefs/str_hash.c -+++ b/fs/bcachefs/str_hash.c -@@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, - for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); -- unsigned u64s = BKEY_U64s + dirent_val_u64s(len); -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); - - if (u64s > U8_MAX) - return -EINVAL; -@@ -101,17 +101,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans, - } - } - --static int repair_inode_hash_info(struct btree_trans *trans, -- struct bch_inode_unpacked *snapshot_root) -+/* -+ * str_hash lookups across snapshots break in wild ways if hash_info in -+ * different snapshot versions doesn't match - so if we find one mismatch, check -+ * them all -+ */ -+int bch2_repair_inode_hash_info(struct btree_trans *trans, -+ struct bch_inode_unpacked *snapshot_root) - { -+ struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; -+ struct printbuf buf = PRINTBUF; -+ bool need_commit = false; - int ret = 0; - -- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, -- SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), -- BTREE_ITER_all_snapshots, k, ret) { -- if (k.k->p.offset != snapshot_root->bi_inum) -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, -+ POS(0, snapshot_root->bi_inum), -+ BTREE_ITER_all_snapshots, k, ret) { -+ if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) - break; - if (!bkey_is_inode(k.k)) - continue; -@@ -121,19 +129,72 @@ static int repair_inode_hash_info(struct btree_trans *trans, - if (ret) - break; - -- if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || -- INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), -- trans, inode_snapshot_mismatch, -- "inode hash info in different snapshots don't match")) { -+ if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && -+ INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); -+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); -+ -+ BUG_ON(hash1.type != hash2.type || -+ memcmp(&hash1.siphash_key, -+ &hash2.siphash_key, -+ sizeof(hash1.siphash_key))); -+#endif -+ continue; -+ } -+ -+ printbuf_reset(&buf); -+ prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", -+ snapshot_root->bi_inum, -+ inode.bi_snapshot, -+ snapshot_root->bi_snapshot); -+ -+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); -+ prt_printf(&buf, " %llx\n", inode.bi_hash_seed); -+ -+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); -+ prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); -+ -+ if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { - inode.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); -- ret = __bch2_fsck_write_inode(trans, &inode) ?: -- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -- -BCH_ERR_transaction_restart_nested; -- break; -+ -+ ret = __bch2_fsck_write_inode(trans, &inode); -+ if (ret) -+ break; -+ need_commit = true; - } - } -+ -+ if (ret) -+ goto err; -+ -+ if (!need_commit) { -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", -+ snapshot_root->bi_inum); -+ -+ prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); -+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); -+ prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); -+#if 0 -+ prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); -+ bch2_prt_str_hash_type(&buf, hash_info->type); -+ prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); -+#endif -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ ret = -BCH_ERR_fsck_repair_unimplemented; -+ goto err; -+ } -+ -+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -+ -BCH_ERR_transaction_restart_nested; -+err: - fsck_err: -+ printbuf_exit(&buf); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -145,46 +206,18 @@ static int repair_inode_hash_info(struct btree_trans *trans, - static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) - { -- struct bch_fs *c = trans->c; -- struct btree_iter iter; -- struct bkey_s_c k; -- int ret = 0; -- -- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), -- BTREE_ITER_all_snapshots, k, ret) { -- if (k.k->p.offset != inum) -- break; -- if (bkey_is_inode(k.k)) -- goto found; -- } -- bch_err(c, "%s(): inum %llu not found", __func__, inum); -- ret = -BCH_ERR_fsck_repair_unimplemented; -- goto err; --found:; -- struct bch_inode_unpacked inode; -- ret = bch2_inode_unpack(k, &inode); -+ struct bch_inode_unpacked snapshot_root; -+ int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); - if (ret) -- goto err; -+ return ret; -+ -+ struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); -+ if (hash_info->type != hash_root.type || -+ memcmp(&hash_info->siphash_key, -+ &hash_root.siphash_key, -+ sizeof(hash_root.siphash_key))) -+ ret = bch2_repair_inode_hash_info(trans, &snapshot_root); - -- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); -- if (hash_info->type != hash2.type || -- memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { -- ret = repair_inode_hash_info(trans, &inode); -- if (!ret) { -- bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" -- "%u %llx %llx\n" -- "%u %llx %llx", -- hash_info->type, -- hash_info->siphash_key.k0, -- hash_info->siphash_key.k1, -- hash2.type, -- hash2.siphash_key.k0, -- hash2.siphash_key.k1); -- ret = -BCH_ERR_fsck_repair_unimplemented; -- } -- } --err: -- bch2_trans_iter_exit(trans, &iter); - return ret; - } - -@@ -195,7 +228,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, - struct btree_iter *k_iter, struct bkey_s_c hash_k) - { - struct bch_fs *c = trans->c; -- struct btree_iter iter = { NULL }; -+ struct btree_iter iter = {}; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - int ret = 0; -@@ -232,7 +265,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, - goto out; - - if (fsck_err(trans, hash_table_key_wrong_offset, -- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", -+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { -diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h -index 55a4ac7bf220..6762b3627e1b 100644 ---- a/fs/bcachefs/str_hash.h -+++ b/fs/bcachefs/str_hash.h -@@ -12,7 +12,6 @@ - #include "super.h" - - #include --#include - #include - - static inline enum bch_str_hash_type -@@ -33,7 +32,9 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) - } - - struct bch_hash_info { -+ u32 inum_snapshot; - u8 type; -+ struct unicode_map *cf_encoding; - /* - * For crc32 or crc64 string hashes the first key value of - * the siphash_key (k0) is used as the key. -@@ -44,20 +45,20 @@ struct bch_hash_info { - static inline struct bch_hash_info - bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) - { -- /* XXX ick */ - struct bch_hash_info info = { -- .type = INODE_STR_HASH(bi), -- .siphash_key = { .k0 = bi->bi_hash_seed } -+ .inum_snapshot = bi->bi_snapshot, -+ .type = INODE_STR_HASH(bi), -+#ifdef CONFIG_UNICODE -+ .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, -+#endif -+ .siphash_key = { .k0 = bi->bi_hash_seed } - }; - - if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { -- SHASH_DESC_ON_STACK(desc, c->sha256); - u8 digest[SHA256_DIGEST_SIZE]; - -- desc->tfm = c->sha256; -- -- crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, -- sizeof(bi->bi_hash_seed), digest); -+ sha256((const u8 *)&bi->bi_hash_seed, -+ sizeof(bi->bi_hash_seed), digest); - memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - } - -@@ -231,11 +232,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, - struct bkey_s_c k; - int ret; - -- bch2_trans_copy_iter(&iter, start); -+ bch2_trans_copy_iter(trans, &iter, start); - -- bch2_btree_iter_advance(&iter); -+ bch2_btree_iter_advance(trans, &iter); - -- for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { -+ for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { - if (k.k->type != desc.key_type && - k.k->type != KEY_TYPE_hash_whiteout) - break; -@@ -280,7 +281,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, - } - - if (!slot.path && !(flags & STR_HASH_must_replace)) -- bch2_trans_copy_iter(&slot, iter); -+ bch2_trans_copy_iter(trans, &slot, iter); - - if (k.k->type != KEY_TYPE_hash_whiteout) - goto not_found; -@@ -393,6 +394,8 @@ int bch2_hash_delete(struct btree_trans *trans, - return ret; - } - -+int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); -+ - struct snapshots_seen; - int __bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, -diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c -index b7b96283c316..35c9f86a73c1 100644 ---- a/fs/bcachefs/subvolume.c -+++ b/fs/bcachefs/subvolume.c -@@ -3,9 +3,11 @@ - #include "bcachefs.h" - #include "btree_key_cache.h" - #include "btree_update.h" -+#include "enumerated_ref.h" - #include "errcode.h" - #include "error.h" - #include "fs.h" -+#include "recovery_passes.h" - #include "snapshot.h" - #include "subvolume.h" - -@@ -13,6 +15,22 @@ - - static int bch2_subvolume_delete(struct btree_trans *, u32); - -+static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) -+{ -+ struct printbuf buf = PRINTBUF; -+ bch2_log_msg_start(c, &buf); -+ -+ prt_printf(&buf, "missing subvolume %u", subvolid); -+ bool print = bch2_count_fsck_err(c, subvol_missing, &buf); -+ -+ int ret = bch2_run_explicit_recovery_pass(c, &buf, -+ BCH_RECOVERY_PASS_check_inodes, 0); -+ if (print) -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ return ret; -+} -+ - static struct bpos subvolume_children_pos(struct bkey_s_c k) - { - if (k.k->type != KEY_TYPE_subvolume) -@@ -44,8 +62,8 @@ static int check_subvol(struct btree_trans *trans, - ret = bch2_snapshot_lookup(trans, snapid, &snapshot); - - if (bch2_err_matches(ret, ENOENT)) -- bch_err(c, "subvolume %llu points to nonexistent snapshot %u", -- k.k->p.offset, snapid); -+ return bch2_run_print_explicit_recovery_pass(c, -+ BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - if (ret) - return ret; - -@@ -275,7 +293,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) - struct btree_iter iter; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); -- struct bkey_s_c k = bch2_btree_iter_peek(&iter); -+ struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); - bch2_trans_iter_exit(trans, &iter); - - return bkey_err(k) ?: k.k && k.k->p.inode == subvol -@@ -291,9 +309,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached| - BTREE_ITER_with_updates, subvolume, s); -- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && -- inconsistent_if_not_found, -- trans->c, "missing subvolume %u", subvol); -+ if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) -+ ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; - return ret; - } - -@@ -343,8 +360,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - subvolume); - ret = bkey_err(subvol); - -- bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c, -- "missing subvolume %u", subvolid); -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - - if (likely(!ret)) - *snapid = le32_to_cpu(subvol.v->snapshot); -@@ -417,8 +434,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) - BTREE_ITER_cached|BTREE_ITER_intent, - subvolume); - int ret = bkey_err(subvol); -- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -- "missing subvolume %u", subvolid); -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - if (ret) - goto err; - -@@ -478,13 +495,11 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor - { - struct bch_fs *c = container_of(work, struct bch_fs, - snapshot_wait_for_pagecache_and_delete_work); -- snapshot_id_list s; -- u32 *id; - int ret = 0; - - while (!ret) { - mutex_lock(&c->snapshots_unlinked_lock); -- s = c->snapshots_unlinked; -+ snapshot_id_list s = c->snapshots_unlinked; - darray_init(&c->snapshots_unlinked); - mutex_unlock(&c->snapshots_unlinked_lock); - -@@ -493,7 +508,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor - - bch2_evict_subvolume_inodes(c, &s); - -- for (id = s.data; id < s.data + s.nr; id++) { -+ darray_for_each(s, id) { - ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - bch_err_msg(c, ret, "deleting subvolume %u", *id); - if (ret) -@@ -503,7 +518,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor - darray_exit(&s); - } - -- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); - } - - struct subvolume_unlink_hook { -@@ -526,11 +541,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans - if (ret) - return ret; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) - return -EROFS; - - if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) -- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); - return 0; - } - -@@ -554,13 +569,13 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(n); -- if (unlikely(ret)) { -- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -- "missing subvolume %u", subvolid); -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; -+ if (unlikely(ret)) - return ret; -- } - - SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); -+ n->v.fs_path_parent = 0; - bch2_trans_iter_exit(trans, &iter); - return ret; - } -@@ -573,7 +588,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, - bool ro) - { - struct bch_fs *c = trans->c; -- struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; -+ struct btree_iter dst_iter, src_iter = {}; - struct bkey_i_subvolume *new_subvol = NULL; - struct bkey_i_subvolume *src_subvol = NULL; - u32 parent = 0, new_nodes[2], snapshot_subvols[2]; -@@ -596,11 +611,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, - BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(src_subvol); -- if (unlikely(ret)) { -- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -- "subvolume %u not found", src_subvolid); -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; -+ if (unlikely(ret)) - goto err; -- } - - parent = le32_to_cpu(src_subvol->v.snapshot); - } -@@ -714,11 +728,8 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) - return ret; - } - --int bch2_fs_subvolumes_init(struct bch_fs *c) -+void bch2_fs_subvolumes_init_early(struct bch_fs *c) - { -- INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); - INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, - bch2_subvolume_wait_for_pagecache_and_delete); -- mutex_init(&c->snapshots_unlinked_lock); -- return 0; - } -diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h -index 910f6196700e..771ade03a348 100644 ---- a/fs/bcachefs/subvolume.h -+++ b/fs/bcachefs/subvolume.h -@@ -2,7 +2,6 @@ - #ifndef _BCACHEFS_SUBVOLUME_H - #define _BCACHEFS_SUBVOLUME_H - --#include "darray.h" - #include "subvolume_types.h" - - int bch2_check_subvols(struct bch_fs *); -@@ -33,16 +32,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); - int bch2_subvol_is_ro(struct bch_fs *, u32); - - static inline struct bkey_s_c --bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, -- u32 subvolid, unsigned flags) -+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end, u32 subvolid, unsigned flags) - { - u32 snapshot; -- int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); -+ int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); - if (ret) - return bkey_s_c_err(ret); - -- bch2_btree_iter_set_snapshot(iter, snapshot); -- return bch2_btree_iter_peek_max_type(iter, end, flags); -+ bch2_btree_iter_set_snapshot(trans, iter, snapshot); -+ return bch2_btree_iter_peek_max_type(trans, iter, end, flags); - } - - #define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ -@@ -53,14 +52,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ -- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ -+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ - _end, _subvolid, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ -- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ -+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -@@ -77,15 +76,12 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos - _end, _subvolid, _flags, _k, _do); \ - }) - --int bch2_delete_dead_snapshots(struct bch_fs *); --void bch2_delete_dead_snapshots_async(struct bch_fs *); -- - int bch2_subvolume_unlink(struct btree_trans *, u32); - int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); - - int bch2_initialize_subvolumes(struct bch_fs *); - int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); - --int bch2_fs_subvolumes_init(struct bch_fs *); -+void bch2_fs_subvolumes_init_early(struct bch_fs *); - - #endif /* _BCACHEFS_SUBVOLUME_H */ -diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h -index 1549d6daf7af..9d634b906dcd 100644 ---- a/fs/bcachefs/subvolume_types.h -+++ b/fs/bcachefs/subvolume_types.h -@@ -2,33 +2,6 @@ - #ifndef _BCACHEFS_SUBVOLUME_TYPES_H - #define _BCACHEFS_SUBVOLUME_TYPES_H - --#include "darray.h" -- --typedef DARRAY(u32) snapshot_id_list; -- --#define IS_ANCESTOR_BITMAP 128 -- --struct snapshot_t { -- bool live; -- u32 parent; -- u32 skip[3]; -- u32 depth; -- u32 children[2]; -- u32 subvol; /* Nonzero only if a subvolume points to this node: */ -- u32 tree; -- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; --}; -- --struct snapshot_table { -- struct rcu_head rcu; -- size_t nr; --#ifndef RUST_BINDGEN -- DECLARE_FLEX_ARRAY(struct snapshot_t, s); --#else -- struct snapshot_t s[0]; --#endif --}; -- - typedef struct { - /* we can't have padding in this struct: */ - u64 subvol; -diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c -index a81a7b6c0989..6687b9235d3c 100644 ---- a/fs/bcachefs/super-io.c -+++ b/fs/bcachefs/super-io.c -@@ -25,9 +25,6 @@ - #include - #include - --static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { --}; -- - struct bch2_metadata_version { - u16 version; - const char *name; -@@ -69,19 +66,38 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta - return v; - } - --bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) -+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) - { -- bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && -- version <= c->sb.version_incompat_allowed; -+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && -+ version <= c->sb.version_incompat_allowed) -+ ? 0 -+ : -BCH_ERR_may_not_use_incompat_feature; - -- if (ret) { -- mutex_lock(&c->sb_lock); -+ mutex_lock(&c->sb_lock); -+ if (!ret) { - SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); - bch2_write_super(c); -- mutex_unlock(&c->sb_lock); -+ } else { -+ darray_for_each(c->incompat_versions_requested, i) -+ if (version == *i) -+ goto out; -+ -+ darray_push(&c->incompat_versions_requested, version); -+ struct printbuf buf = PRINTBUF; -+ prt_str(&buf, "requested incompat feature "); -+ bch2_version_to_text(&buf, version); -+ prt_str(&buf, " currently not enabled, allowed up to "); -+ bch2_version_to_text(&buf, version); -+ prt_printf(&buf, "\n set version_upgrade=incompat to enable"); -+ -+ bch_notice(c, "%s", buf.buf); -+ printbuf_exit(&buf); - } - -+out: -+ mutex_unlock(&c->sb_lock); -+ - return ret; - } - -@@ -245,11 +261,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, - - /* XXX: we're not checking that offline device have enough space */ - -- for_each_online_member(c, ca) { -+ for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) { - struct bch_sb_handle *dev_sb = &ca->disk_sb; - - if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); - return NULL; - } - } -@@ -366,39 +382,40 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) - return 0; - } - --static int bch2_sb_validate(struct bch_sb_handle *disk_sb, -- enum bch_validate_flags flags, struct printbuf *out) -+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, -+ enum bch_validate_flags flags, struct printbuf *out) - { -- struct bch_sb *sb = disk_sb->sb; -- struct bch_sb_field_members_v1 *mi; - enum bch_opt_id opt_id; -- u16 block_size; - int ret; - - ret = bch2_sb_compatible(sb, out); - if (ret) - return ret; - -- if (sb->features[1] || -- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { -- prt_printf(out, "Filesystem has incompatible features"); -+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); -+ unsigned incompat_bit = 0; -+ if (incompat) -+ incompat_bit = __ffs64(incompat); -+ else if (sb->features[1]) -+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); -+ -+ if (incompat_bit) { -+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", -+ incompat_bit, -+ bch2_sb_features[BCH_FEATURE_NR - 1], -+ BCH_FEATURE_NR - 1); - return -BCH_ERR_invalid_sb_features; - } - - if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || - BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { -- prt_printf(out, "Filesystem has incompatible version"); -+ prt_str(out, "Filesystem has incompatible version "); -+ bch2_version_to_text(out, le16_to_cpu(sb->version)); -+ prt_str(out, ", current version "); -+ bch2_version_to_text(out, bcachefs_metadata_version_current); - return -BCH_ERR_invalid_sb_features; - } - -- block_size = le16_to_cpu(sb->block_size); -- -- if (block_size > PAGE_SECTORS) { -- prt_printf(out, "Block size too big (got %u, max %u)", -- block_size, PAGE_SECTORS); -- return -BCH_ERR_invalid_sb_block_size; -- } -- - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { - prt_printf(out, "Bad user UUID (got zeroes)"); - return -BCH_ERR_invalid_sb_uuid; -@@ -409,6 +426,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - return -BCH_ERR_invalid_sb_uuid; - } - -+ if (!(flags & BCH_VALIDATE_write) && -+ le64_to_cpu(sb->offset) != read_offset) { -+ prt_printf(out, "Bad sb offset (got %llu, read from %llu)", -+ le64_to_cpu(sb->offset), read_offset); -+ return -BCH_ERR_invalid_sb_offset; -+ } -+ - if (!sb->nr_devices || - sb->nr_devices > BCH_SB_MEMBERS_MAX) { - prt_printf(out, "Bad number of member devices %u (max %u)", -@@ -444,6 +468,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); - } - -+ if (sb->nr_devices > 1) -+ SET_BCH_SB_MULTI_DEVICE(sb, true); -+ - if (!flags) { - /* - * Been seeing a bug where these are getting inexplicably -@@ -464,6 +491,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) - SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); -+ -+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) -+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); -+ -+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && -+ !BCH_SB_CSUM_ERR_RETRY_NR(sb)) -+ SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); - } - - #ifdef __KERNEL__ -@@ -474,8 +508,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { - const struct bch_option *opt = bch2_opt_table + opt_id; - -- if (opt->get_sb != BCH2_NO_SB_OPT) { -- u64 v = bch2_opt_from_sb(sb, opt_id); -+ if (opt->get_sb) { -+ u64 v = bch2_opt_from_sb(sb, opt_id, -1); - - prt_printf(out, "Invalid option "); - ret = bch2_opt_validate(opt, v, out); -@@ -505,14 +539,17 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - } - } - -+ struct bch_sb_field *mi = -+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: -+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); -+ - /* members must be validated first: */ -- mi = bch2_sb_field_get(sb, members_v1); - if (!mi) { - prt_printf(out, "Invalid superblock: member info area missing"); - return -BCH_ERR_invalid_sb_members_missing; - } - -- ret = bch2_sb_field_validate(sb, &mi->field, flags, out); -+ ret = bch2_sb_field_validate(sb, mi, flags, out); - if (ret) - return ret; - -@@ -581,11 +618,15 @@ static void bch2_sb_update(struct bch_fs *c) - - c->sb.features = le64_to_cpu(src->features[0]); - c->sb.compat = le64_to_cpu(src->compat[0]); -+ c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); - - memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) { -+ c->sb.recovery_passes_required = -+ bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); -+ - le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, - sizeof(c->sb.errors_silent) * 8); - c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); -@@ -755,7 +796,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, - memset(sb, 0, sizeof(*sb)); - sb->mode = BLK_OPEN_READ; - sb->have_bio = true; -- sb->holder = kmalloc(1, GFP_KERNEL); -+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); - if (!sb->holder) - return -ENOMEM; - -@@ -881,7 +922,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, - - sb->have_layout = true; - -- ret = bch2_sb_validate(sb, 0, &err); -+ ret = bch2_sb_validate(sb->sb, offset, 0, &err); - if (ret) { - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); -@@ -918,19 +959,19 @@ static void write_super_endio(struct bio *bio) - { - struct bch_dev *ca = bio->bi_private; - -+ bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); -+ - /* XXX: return errors directly */ - -- if (bch2_dev_io_err_on(bio->bi_status, ca, -- bio_data_dir(bio) -- ? BCH_MEMBER_ERROR_write -- : BCH_MEMBER_ERROR_read, -- "superblock %s error: %s", -+ if (bio->bi_status) { -+ bch_err_dev_ratelimited(ca, "superblock %s error: %s", - str_write_read(bio_data_dir(bio)), -- bch2_blk_status_to_str(bio->bi_status))) -+ bch2_blk_status_to_str(bio->bi_status)); - ca->sb_write_error = 1; -+ } - - closure_put(&ca->fs->sb_write); -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - } - - static void read_back_super(struct bch_fs *c, struct bch_dev *ca) -@@ -948,7 +989,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - -- percpu_ref_get(&ca->io_ref); -+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); - } - -@@ -974,7 +1015,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], - bio_sectors(bio)); - -- percpu_ref_get(&ca->io_ref); -+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); - } - -@@ -991,7 +1032,7 @@ int bch2_write_super(struct bch_fs *c) - - trace_and_count(c, write_super, c, _RET_IP_); - -- if (c->opts.very_degraded) -+ if (c->opts.degraded == BCH_DEGRADED_very) - degraded_flags |= BCH_FORCE_IF_LOST; - - lockdep_assert_held(&c->sb_lock); -@@ -999,13 +1040,20 @@ int bch2_write_super(struct bch_fs *c) - closure_init_stack(cl); - memset(&sb_written, 0, sizeof(sb_written)); - -- for_each_online_member(c, ca) { -+ /* -+ * Note: we do writes to RO devices here, and we might want to change -+ * that in the future. -+ * -+ * For now, we expect to be able to call write_super() when we're not -+ * yet RW: -+ */ -+ for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { - ret = darray_push(&online_devices, ca); - if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { -- percpu_ref_put(&ca->io_ref); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - goto out; - } -- percpu_ref_get(&ca->io_ref); -+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - } - - /* Make sure we're using the new magic numbers: */ -@@ -1038,7 +1086,7 @@ int bch2_write_super(struct bch_fs *c) - darray_for_each(online_devices, ca) { - printbuf_reset(&err); - -- ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); -+ ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); - if (ret) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); - goto out; -@@ -1064,7 +1112,8 @@ int bch2_write_super(struct bch_fs *c) - prt_str(&buf, ")"); - bch2_fs_fatal_error(c, ": %s", buf.buf); - printbuf_exit(&buf); -- return -BCH_ERR_sb_not_downgraded; -+ ret = -BCH_ERR_sb_not_downgraded; -+ goto out; - } - - darray_for_each(online_devices, ca) { -@@ -1166,12 +1215,12 @@ int bch2_write_super(struct bch_fs *c) - !can_mount_with_written), c, - ": Unable to write superblock to sufficient devices (from %ps)", - (void *) _RET_IP_)) -- ret = -1; -+ ret = -BCH_ERR_erofs_sb_err; - out: - /* Make new options visible after they're persistent: */ - bch2_sb_update(c); - darray_for_each(online_devices, ca) -- percpu_ref_put(&(*ca)->io_ref); -+ enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); - darray_exit(&online_devices); - printbuf_exit(&err); - return ret; -@@ -1223,15 +1272,39 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) - bch2_sb_field_resize(&c->disk_sb, downgrade, 0); - - c->disk_sb.sb->version = cpu_to_le16(new_version); -- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - - if (incompat) { -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); -- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); - } - } - -+void bch2_sb_upgrade_incompat(struct bch_fs *c) -+{ -+ mutex_lock(&c->sb_lock); -+ if (c->sb.version == c->sb.version_incompat_allowed) -+ goto unlock; -+ -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "Now allowing incompatible features up to "); -+ bch2_version_to_text(&buf, c->sb.version); -+ prt_str(&buf, ", previously allowed up to "); -+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed); -+ prt_newline(&buf); -+ -+ bch_notice(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); -+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, -+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); -+ bch2_write_super(c); -+unlock: -+ mutex_unlock(&c->sb_lock); -+} -+ - static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) - { -@@ -1459,8 +1532,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, - for (id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - -- if (opt->get_sb != BCH2_NO_SB_OPT) { -- u64 v = bch2_opt_from_sb(sb, id); -+ if (opt->get_sb) { -+ u64 v = bch2_opt_from_sb(sb, id, -1); - - prt_printf(out, "%s:\t", opt->attr.name); - bch2_opt_to_text(out, NULL, sb, opt, v, -diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h -index b4cff9ebdebb..a3b7a90f2533 100644 ---- a/fs/bcachefs/super-io.h -+++ b/fs/bcachefs/super-io.h -@@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version) - void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); - enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); - --bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); - --static inline bool bch2_request_incompat_feature(struct bch_fs *c, -- enum bcachefs_metadata_version version) -+static inline int bch2_request_incompat_feature(struct bch_fs *c, -+ enum bcachefs_metadata_version version) - { - return likely(version <= c->sb.version_incompat) -- ? true -+ ? 0 - : bch2_set_version_incompat(c, version); - } - -@@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); - void bch2_free_super(struct bch_sb_handle *); - int bch2_sb_realloc(struct bch_sb_handle *, unsigned); - -+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); -+ - int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); - int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); - int bch2_write_super(struct bch_fs *); -@@ -105,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) - - bool bch2_check_version_downgrade(struct bch_fs *); - void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); -+void bch2_sb_upgrade_incompat(struct bch_fs *); - - void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c -index 0459c875e189..24658bf450ab 100644 ---- a/fs/bcachefs/super.c -+++ b/fs/bcachefs/super.c -@@ -10,6 +10,8 @@ - #include "bcachefs.h" - #include "alloc_background.h" - #include "alloc_foreground.h" -+#include "async_objs.h" -+#include "backpointers.h" - #include "bkey_sort.h" - #include "btree_cache.h" - #include "btree_gc.h" -@@ -28,6 +30,7 @@ - #include "disk_accounting.h" - #include "disk_groups.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "errcode.h" - #include "error.h" - #include "fs.h" -@@ -48,6 +51,7 @@ - #include "quota.h" - #include "rebalance.h" - #include "recovery.h" -+#include "recovery_passes.h" - #include "replicas.h" - #include "sb-clean.h" - #include "sb-counters.h" -@@ -70,26 +74,37 @@ - #include - #include - #include --#include - - MODULE_LICENSE("GPL"); - MODULE_AUTHOR("Kent Overstreet "); - MODULE_DESCRIPTION("bcachefs filesystem"); --MODULE_SOFTDEP("pre: crc32c"); --MODULE_SOFTDEP("pre: crc64"); --MODULE_SOFTDEP("pre: sha256"); --MODULE_SOFTDEP("pre: chacha20"); --MODULE_SOFTDEP("pre: poly1305"); --MODULE_SOFTDEP("pre: xxhash"); - --const char * const bch2_fs_flag_strs[] = { -+typedef DARRAY(struct bch_sb_handle) bch_sb_handles; -+ - #define x(n) #n, -+const char * const bch2_fs_flag_strs[] = { - BCH_FS_FLAGS() --#undef x - NULL - }; - --void bch2_print_str(struct bch_fs *c, const char *str) -+const char * const bch2_write_refs[] = { -+ BCH_WRITE_REFS() -+ NULL -+}; -+ -+const char * const bch2_dev_read_refs[] = { -+ BCH_DEV_READ_REFS() -+ NULL -+}; -+ -+const char * const bch2_dev_write_refs[] = { -+ BCH_DEV_WRITE_REFS() -+ NULL -+}; -+#undef x -+ -+static void __bch2_print_str(struct bch_fs *c, const char *prefix, -+ const char *str, bool nonblocking) - { - #ifdef __KERNEL__ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); -@@ -99,7 +114,17 @@ void bch2_print_str(struct bch_fs *c, const char *str) - return; - } - #endif -- bch2_print_string_as_lines(KERN_ERR, str); -+ bch2_print_string_as_lines(KERN_ERR, str, nonblocking); -+} -+ -+void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) -+{ -+ __bch2_print_str(c, prefix, str, false); -+} -+ -+void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) -+{ -+ __bch2_print_str(c, prefix, str, true); - } - - __printf(2, 0) -@@ -188,7 +213,9 @@ static void bch2_dev_unlink(struct bch_dev *); - static void bch2_dev_free(struct bch_dev *); - static int bch2_dev_alloc(struct bch_fs *, unsigned); - static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -+static void bch2_dev_io_ref_stop(struct bch_dev *, int); - static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -+static int bch2_fs_init_rw(struct bch_fs *); - - struct bch_fs *bch2_dev_to_fs(dev_t dev) - { -@@ -297,19 +324,19 @@ static void __bch2_fs_read_only(struct bch_fs *c) - /* - * After stopping journal: - */ -- for_each_member_device(c, ca) -+ for_each_member_device(c, ca) { -+ bch2_dev_io_ref_stop(ca, WRITE); - bch2_dev_allocator_remove(c, ca); -+ } - } - --#ifndef BCH_WRITE_REF_DEBUG --static void bch2_writes_disabled(struct percpu_ref *writes) -+static void bch2_writes_disabled(struct enumerated_ref *writes) - { - struct bch_fs *c = container_of(writes, struct bch_fs, writes); - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); - } --#endif - - void bch2_fs_read_only(struct bch_fs *c) - { -@@ -327,12 +354,7 @@ void bch2_fs_read_only(struct bch_fs *c) - * writes will return -EROFS: - */ - set_bit(BCH_FS_going_ro, &c->flags); --#ifndef BCH_WRITE_REF_DEBUG -- percpu_ref_kill(&c->writes); --#else -- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) -- bch2_write_ref_put(c, i); --#endif -+ enumerated_ref_stop_async(&c->writes); - - /* - * If we're not doing an emergency shutdown, we want to wait on -@@ -370,7 +392,7 @@ void bch2_fs_read_only(struct bch_fs *c) - !test_bit(BCH_FS_emergency_ro, &c->flags) && - test_bit(BCH_FS_started, &c->flags) && - test_bit(BCH_FS_clean_shutdown, &c->flags) && -- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { -+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { - BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); - BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); -@@ -381,6 +403,11 @@ void bch2_fs_read_only(struct bch_fs *c) - bch_verbose(c, "marking filesystem clean"); - bch2_fs_mark_clean(c); - } else { -+ /* Make sure error counts/counters are persisted */ -+ mutex_lock(&c->sb_lock); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ - bch_verbose(c, "done going read-only, filesystem not clean"); - } - } -@@ -411,41 +438,39 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) - return ret; - } - --bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) -+static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, -+ bool locked) - { - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - -- bch2_journal_halt_locked(&c->journal); -+ if (!locked) -+ bch2_journal_halt(&c->journal); -+ else -+ bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); -- - wake_up(&bch2_read_only_wait); -+ -+ if (ret) -+ prt_printf(out, "emergency read only at seq %llu\n", -+ journal_cur_seq(&c->journal)); -+ - return ret; - } - --static int bch2_fs_read_write_late(struct bch_fs *c) -+bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) - { -- int ret; -+ return __bch2_fs_emergency_read_only2(c, out, false); -+} - -- /* -- * Data move operations can't run until after check_snapshots has -- * completed, and bch2_snapshot_is_ancestor() is available. -- * -- * Ideally we'd start copygc/rebalance earlier instead of waiting for -- * all of recovery/fsck to complete: -- */ -- ret = bch2_copygc_start(c); -- if (ret) { -- bch_err(c, "error starting copygc thread"); -- return ret; -- } -+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) -+{ -+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - -- ret = bch2_rebalance_start(c); -- if (ret) { -- bch_err(c, "error starting rebalance thread"); -- return ret; -- } -+ bch2_journal_halt_locked(&c->journal); -+ bch2_fs_read_only_async(c); - -- return 0; -+ wake_up(&bch2_read_only_wait); -+ return ret; - } - - static int __bch2_fs_read_write(struct bch_fs *c, bool early) -@@ -454,59 +479,79 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - -+ if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) -+ return -BCH_ERR_erofs_no_alloc_info; -+ - if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { - bch_err(c, "cannot go rw, unfixed btree errors"); - return -BCH_ERR_erofs_unfixed_errors; - } - -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { -+ bch_err(c, "cannot go rw, filesystem is an unresized image file"); -+ return -BCH_ERR_erofs_filesystem_full; -+ } -+ - if (test_bit(BCH_FS_rw, &c->flags)) - return 0; - - bch_info(c, "going read-write"); - -- ret = bch2_sb_members_v2_init(c); -+ ret = bch2_fs_init_rw(c); - if (ret) - goto err; - -- ret = bch2_fs_mark_dirty(c); -+ ret = bch2_sb_members_v2_init(c); - if (ret) - goto err; - - clear_bit(BCH_FS_clean_shutdown, &c->flags); - -+ rcu_read_lock(); -+ for_each_online_member_rcu(c, ca) -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) { -+ bch2_dev_allocator_add(c, ca); -+ enumerated_ref_start(&ca->io_ref[WRITE]); -+ } -+ rcu_read_unlock(); -+ -+ bch2_recalc_capacity(c); -+ - /* - * First journal write must be a flush write: after a clean shutdown we - * don't read the journal, so the first journal write may end up - * overwriting whatever was there previously, and there must always be - * at least one non-flush write in the journal or recovery will fail: - */ -+ spin_lock(&c->journal.lock); - set_bit(JOURNAL_need_flush_write, &c->journal.flags); - set_bit(JOURNAL_running, &c->journal.flags); -+ bch2_journal_space_available(&c->journal); -+ spin_unlock(&c->journal.lock); - -- for_each_rw_member(c, ca) -- bch2_dev_allocator_add(c, ca); -- bch2_recalc_capacity(c); -+ ret = bch2_fs_mark_dirty(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_journal_reclaim_start(&c->journal); -+ if (ret) -+ goto err; - - set_bit(BCH_FS_rw, &c->flags); - set_bit(BCH_FS_was_rw, &c->flags); - --#ifndef BCH_WRITE_REF_DEBUG -- percpu_ref_reinit(&c->writes); --#else -- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { -- BUG_ON(atomic_long_read(&c->writes[i])); -- atomic_long_inc(&c->writes[i]); -- } --#endif -+ enumerated_ref_start(&c->writes); - -- ret = bch2_journal_reclaim_start(&c->journal); -- if (ret) -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err_msg(c, ret, "error starting copygc thread"); - goto err; -+ } - -- if (!early) { -- ret = bch2_fs_read_write_late(c); -- if (ret) -- goto err; -+ ret = bch2_rebalance_start(c); -+ if (ret) { -+ bch_err_msg(c, ret, "error starting rebalance thread"); -+ goto err; - } - - bch2_do_discards(c); -@@ -531,14 +576,19 @@ int bch2_fs_read_write(struct bch_fs *c) - if (c->opts.nochanges) - return -BCH_ERR_erofs_nochanges; - -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) -+ return -BCH_ERR_erofs_no_alloc_info; -+ - return __bch2_fs_read_write(c, false); - } - - int bch2_fs_read_write_early(struct bch_fs *c) - { -- lockdep_assert_held(&c->state_lock); -+ down_write(&c->state_lock); -+ int ret = __bch2_fs_read_write(c, true); -+ up_write(&c->state_lock); - -- return __bch2_fs_read_write(c, true); -+ return ret; - } - - /* Filesystem startup/shutdown: */ -@@ -548,37 +598,44 @@ static void __bch2_fs_free(struct bch_fs *c) - for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_exit(&c->times[i]); - -+#ifdef CONFIG_UNICODE -+ utf8_unload(c->cf_encoding); -+#endif -+ - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - bch2_free_pending_node_rewrites(c); -- bch2_fs_accounting_exit(c); -- bch2_fs_sb_errors_exit(c); -- bch2_fs_counters_exit(c); -+ bch2_free_fsck_errs(c); -+ bch2_fs_vfs_exit(c); - bch2_fs_snapshots_exit(c); -+ bch2_fs_sb_errors_exit(c); -+ bch2_fs_replicas_exit(c); -+ bch2_fs_rebalance_exit(c); - bch2_fs_quota_exit(c); -+ bch2_fs_nocow_locking_exit(c); -+ bch2_fs_journal_exit(&c->journal); - bch2_fs_fs_io_direct_exit(c); - bch2_fs_fs_io_buffered_exit(c); - bch2_fs_fsio_exit(c); -- bch2_fs_vfs_exit(c); -- bch2_fs_ec_exit(c); -- bch2_fs_encryption_exit(c); -- bch2_fs_nocow_locking_exit(c); - bch2_fs_io_write_exit(c); - bch2_fs_io_read_exit(c); -+ bch2_fs_encryption_exit(c); -+ bch2_fs_ec_exit(c); -+ bch2_fs_counters_exit(c); -+ bch2_fs_compress_exit(c); -+ bch2_io_clock_exit(&c->io_clock[WRITE]); -+ bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_buckets_waiting_for_journal_exit(c); -- bch2_fs_btree_interior_update_exit(c); -+ bch2_fs_btree_write_buffer_exit(c); - bch2_fs_btree_key_cache_exit(&c->btree_key_cache); -- bch2_fs_btree_cache_exit(c); - bch2_fs_btree_iter_exit(c); -- bch2_fs_replicas_exit(c); -- bch2_fs_journal_exit(&c->journal); -- bch2_io_clock_exit(&c->io_clock[WRITE]); -- bch2_io_clock_exit(&c->io_clock[READ]); -- bch2_fs_compress_exit(c); -- bch2_fs_btree_gc_exit(c); -+ bch2_fs_btree_interior_update_exit(c); -+ bch2_fs_btree_cache_exit(c); -+ bch2_fs_accounting_exit(c); -+ bch2_fs_async_obj_exit(c); - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); -+ - BUG_ON(atomic_read(&c->journal_keys.ref)); -- bch2_fs_btree_write_buffer_exit(c); - percpu_free_rwsem(&c->mark_lock); - if (c->online_reserved) { - u64 v = percpu_u64_get(c->online_reserved); -@@ -586,6 +643,7 @@ static void __bch2_fs_free(struct bch_fs *c) - free_percpu(c->online_reserved); - } - -+ darray_exit(&c->incompat_versions_requested); - darray_exit(&c->btree_roots_extra); - free_percpu(c->pcpu); - free_percpu(c->usage); -@@ -593,9 +651,7 @@ static void __bch2_fs_free(struct bch_fs *c) - mempool_exit(&c->btree_bounce_pool); - bioset_exit(&c->btree_bio); - mempool_exit(&c->fill_iter); --#ifndef BCH_WRITE_REF_DEBUG -- percpu_ref_exit(&c->writes); --#endif -+ enumerated_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->disk_groups, 1)); - kfree(c->journal_seq_blacklist_table); - -@@ -607,8 +663,8 @@ static void __bch2_fs_free(struct bch_fs *c) - destroy_workqueue(c->btree_read_complete_wq); - if (c->copygc_wq) - destroy_workqueue(c->copygc_wq); -- if (c->btree_io_complete_wq) -- destroy_workqueue(c->btree_io_complete_wq); -+ if (c->btree_write_complete_wq) -+ destroy_workqueue(c->btree_write_complete_wq); - if (c->btree_update_wq) - destroy_workqueue(c->btree_update_wq); - -@@ -634,6 +690,12 @@ void __bch2_fs_stop(struct bch_fs *c) - bch2_fs_read_only(c); - up_write(&c->state_lock); - -+ for (unsigned i = 0; i < c->sb.nr_devices; i++) { -+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); -+ if (ca) -+ bch2_dev_io_ref_stop(ca, READ); -+ } -+ - for_each_member_device(c, ca) - bch2_dev_unlink(ca); - -@@ -662,8 +724,6 @@ void __bch2_fs_stop(struct bch_fs *c) - - void bch2_fs_free(struct bch_fs *c) - { -- unsigned i; -- - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); -@@ -671,11 +731,12 @@ void bch2_fs_free(struct bch_fs *c) - closure_sync(&c->cl); - closure_debug_destroy(&c->cl); - -- for (i = 0; i < c->sb.nr_devices; i++) { -+ for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); - - if (ca) { - EBUG_ON(atomic_long_read(&ca->ref) != 1); -+ bch2_dev_io_ref_stop(ca, READ); - bch2_free_super(&ca->disk_sb); - bch2_dev_free(ca); - } -@@ -698,9 +759,10 @@ static int bch2_fs_online(struct bch_fs *c) - - lockdep_assert_held(&bch_fs_list_lock); - -- if (__bch2_uuid_to_fs(c->sb.uuid)) { -+ if (c->sb.multi_device && -+ __bch2_uuid_to_fs(c->sb.uuid)) { - bch_err(c, "filesystem UUID already open"); -- return -EINVAL; -+ return -BCH_ERR_filesystem_uuid_already_open; - } - - ret = bch2_fs_chardev_init(c); -@@ -711,14 +773,16 @@ static int bch2_fs_online(struct bch_fs *c) - - bch2_fs_debug_init(c); - -- ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: -+ ret = (c->sb.multi_device -+ ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) -+ : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: - kobject_add(&c->internal, &c->kobj, "internal") ?: - kobject_add(&c->opts_dir, &c->kobj, "options") ?: - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: - #endif - kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: -- bch2_opts_create_sysfs_files(&c->opts_dir); -+ bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); - if (ret) { - bch_err(c, "error creating sysfs objects"); - return ret; -@@ -742,7 +806,37 @@ static int bch2_fs_online(struct bch_fs *c) - return ret; - } - --static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) -+static int bch2_fs_init_rw(struct bch_fs *c) -+{ -+ if (test_bit(BCH_FS_rw_init_done, &c->flags)) -+ return 0; -+ -+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs", -+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || -+ !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", -+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || -+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", -+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", -+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || -+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", -+ WQ_FREEZABLE, 0))) -+ return -BCH_ERR_ENOMEM_fs_other_alloc; -+ -+ int ret = bch2_fs_btree_interior_update_init(c) ?: -+ bch2_fs_btree_write_buffer_init(c) ?: -+ bch2_fs_fs_io_buffered_init(c) ?: -+ bch2_fs_io_write_init(c) ?: -+ bch2_fs_journal_init(&c->journal); -+ if (ret) -+ return ret; -+ -+ set_bit(BCH_FS_rw_init_done, &c->flags); -+ return 0; -+} -+ -+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, -+ bch_sb_handles *sbs) - { - struct bch_fs *c; - struct printbuf name = PRINTBUF; -@@ -755,7 +849,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - goto out; - } - -- c->stdio = (void *)(unsigned long) opts.stdio; -+ c->stdio = (void *)(unsigned long) opts->stdio; - - __module_get(THIS_MODULE); - -@@ -779,24 +873,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - - refcount_set(&c->ro_ref, 1); - init_waitqueue_head(&c->ro_ref_wait); -- spin_lock_init(&c->recovery_pass_lock); -- sema_init(&c->online_fsck_mutex, 1); - - for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); - -- bch2_fs_copygc_init(c); -- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); -- bch2_fs_btree_iter_init_early(c); -- bch2_fs_btree_interior_update_init_early(c); -- bch2_fs_journal_keys_init(c); - bch2_fs_allocator_background_init(c); - bch2_fs_allocator_foreground_init(c); -- bch2_fs_rebalance_init(c); -- bch2_fs_quota_init(c); -+ bch2_fs_btree_cache_init_early(&c->btree_cache); -+ bch2_fs_btree_gc_init_early(c); -+ bch2_fs_btree_interior_update_init_early(c); -+ bch2_fs_btree_iter_init_early(c); -+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); -+ bch2_fs_btree_write_buffer_init_early(c); -+ bch2_fs_copygc_init(c); - bch2_fs_ec_init_early(c); -+ bch2_fs_journal_init_early(&c->journal); -+ bch2_fs_journal_keys_init(c); - bch2_fs_move_init(c); -+ bch2_fs_nocow_locking_init_early(c); -+ bch2_fs_quota_init(c); -+ bch2_fs_recovery_passes_init(c); - bch2_fs_sb_errors_init_early(c); -+ bch2_fs_snapshots_init_early(c); -+ bch2_fs_subvolumes_init_early(c); - - INIT_LIST_HEAD(&c->list); - -@@ -822,8 +921,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - -- bch2_fs_btree_cache_init_early(&c->btree_cache); -- - mutex_init(&c->sectors_available_lock); - - ret = percpu_init_rwsem(&c->mark_lock); -@@ -837,14 +934,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - if (ret) - goto err; - -- pr_uuid(&name, c->sb.user_uuid.b); -- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; -- if (ret) -- goto err; -- -- strscpy(c->name, name.buf, sizeof(c->name)); -- printbuf_exit(&name); -- - /* Compat: */ - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) -@@ -859,7 +948,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - if (ret) - goto err; - -- bch2_opts_apply(&c->opts, opts); -+ bch2_opts_apply(&c->opts, *opts); - - c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; - if (c->opts.inodes_use_key_cache) -@@ -875,26 +964,26 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - goto err; - } - -+ if (c->sb.multi_device) -+ pr_uuid(&name, c->sb.user_uuid.b); -+ else -+ prt_bdevname(&name, sbs->data[0].bdev); -+ -+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; -+ if (ret) -+ goto err; -+ -+ strscpy(c->name, name.buf, sizeof(c->name)); -+ printbuf_exit(&name); -+ - iter_size = sizeof(struct sort_iter) + - (btree_blocks(c) + 1) * 2 * - sizeof(struct sort_iter_set); - -- if (!(c->btree_update_wq = alloc_workqueue("bcachefs", -- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || -- !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", -- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || -- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", -- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -- !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", -+ if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || -- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", -- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || -- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", -- WQ_FREEZABLE, 0)) || --#ifndef BCH_WRITE_REF_DEBUG -- percpu_ref_init(&c->writes, bch2_writes_disabled, -- PERCPU_REF_INIT_DEAD, GFP_KERNEL) || --#endif -+ enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, -+ bch2_writes_disabled) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_bio, 1, - max(offsetof(struct btree_read_bio, bio), -@@ -910,32 +999,50 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) - goto err; - } - -- ret = bch2_fs_counters_init(c) ?: -- bch2_fs_sb_errors_init(c) ?: -- bch2_io_clock_init(&c->io_clock[READ]) ?: -- bch2_io_clock_init(&c->io_clock[WRITE]) ?: -- bch2_fs_journal_init(&c->journal) ?: -- bch2_fs_btree_iter_init(c) ?: -+ ret = -+ bch2_fs_async_obj_init(c) ?: - bch2_fs_btree_cache_init(c) ?: -+ bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: -- bch2_fs_btree_interior_update_init(c) ?: -- bch2_fs_btree_gc_init(c) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: -- bch2_fs_btree_write_buffer_init(c) ?: -- bch2_fs_subvolumes_init(c) ?: -- bch2_fs_io_read_init(c) ?: -- bch2_fs_io_write_init(c) ?: -- bch2_fs_nocow_locking_init(c) ?: -- bch2_fs_encryption_init(c) ?: -+ bch2_io_clock_init(&c->io_clock[READ]) ?: -+ bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_compress_init(c) ?: -+ bch2_fs_counters_init(c) ?: - bch2_fs_ec_init(c) ?: -- bch2_fs_vfs_init(c) ?: -+ bch2_fs_encryption_init(c) ?: - bch2_fs_fsio_init(c) ?: -- bch2_fs_fs_io_buffered_init(c) ?: -- bch2_fs_fs_io_direct_init(c); -+ bch2_fs_fs_io_direct_init(c) ?: -+ bch2_fs_io_read_init(c) ?: -+ bch2_fs_rebalance_init(c) ?: -+ bch2_fs_sb_errors_init(c) ?: -+ bch2_fs_vfs_init(c); - if (ret) - goto err; - -+#ifdef CONFIG_UNICODE -+ /* Default encoding until we can potentially have more as an option. */ -+ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); -+ if (IS_ERR(c->cf_encoding)) { -+ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", -+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), -+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), -+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); -+ ret = -EINVAL; -+ goto err; -+ } -+ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", -+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), -+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), -+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); -+#else -+ if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { -+ printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); -+ ret = -EINVAL; -+ goto err; -+ } -+#endif -+ - for (i = 0; i < c->sb.nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; -@@ -975,12 +1082,6 @@ static void print_mount_opts(struct bch_fs *c) - prt_str(&p, "starting version "); - bch2_version_to_text(&p, c->sb.version); - -- if (c->opts.read_only) { -- prt_str(&p, " opts="); -- first = false; -- prt_printf(&p, "ro"); -- } -- - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); -@@ -996,45 +1097,102 @@ static void print_mount_opts(struct bch_fs *c) - bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); - } - -+ if (c->sb.version_incompat_allowed != c->sb.version) { -+ prt_printf(&p, "\n allowing incompatible features above "); -+ bch2_version_to_text(&p, c->sb.version_incompat_allowed); -+ } -+ -+ if (c->opts.verbose) { -+ prt_printf(&p, "\n features: "); -+ prt_bitflags(&p, bch2_sb_features, c->sb.features); -+ } -+ - bch_info(c, "%s", p.buf); - printbuf_exit(&p); - } - -+static bool bch2_fs_may_start(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned flags = 0; -+ -+ switch (c->opts.degraded) { -+ case BCH_DEGRADED_very: -+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; -+ break; -+ case BCH_DEGRADED_yes: -+ flags |= BCH_FORCE_IF_DEGRADED; -+ break; -+ default: -+ mutex_lock(&c->sb_lock); -+ for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ if (!bch2_member_exists(c->disk_sb.sb, i)) -+ continue; -+ -+ ca = bch2_dev_locked(c, i); -+ -+ if (!bch2_dev_is_online(ca) && -+ (ca->mi.state == BCH_MEMBER_STATE_rw || -+ ca->mi.state == BCH_MEMBER_STATE_ro)) { -+ mutex_unlock(&c->sb_lock); -+ return false; -+ } -+ } -+ mutex_unlock(&c->sb_lock); -+ break; -+ } -+ -+ return bch2_have_enough_devs(c, c->online_devs, flags, true); -+} -+ - int bch2_fs_start(struct bch_fs *c) - { - time64_t now = ktime_get_real_seconds(); -- int ret; -+ int ret = 0; - - print_mount_opts(c); - -+ if (!bch2_fs_may_start(c)) -+ return -BCH_ERR_insufficient_devices_to_start; -+ - down_write(&c->state_lock); -+ mutex_lock(&c->sb_lock); - - BUG_ON(test_bit(BCH_FS_started, &c->flags)); - -- mutex_lock(&c->sb_lock); -+ if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, -+ sizeof(struct bch_sb_field_ext) / sizeof(u64))) { -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+ ret = -BCH_ERR_ENOSPC_sb; -+ goto err; -+ } - - ret = bch2_sb_members_v2_init(c); - if (ret) { - mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); - goto err; - } - -- for_each_online_member(c, ca) -- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); -+ rcu_read_lock(); -+ for_each_online_member_rcu(c, ca) -+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = -+ cpu_to_le64(now); -+ rcu_read_unlock(); - -- struct bch_sb_field_ext *ext = -- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); -+ /* -+ * Dno't write superblock yet: recovery might have to downgrade -+ */ - mutex_unlock(&c->sb_lock); - -- if (!ext) { -- bch_err(c, "insufficient space in superblock for sb_field_ext"); -- ret = -BCH_ERR_ENOSPC_sb; -- goto err; -- } -- -- for_each_rw_member(c, ca) -- bch2_dev_allocator_add(c, ca); -+ rcu_read_lock(); -+ for_each_online_member_rcu(c, ca) -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) -+ bch2_dev_allocator_add(c, ca); -+ rcu_read_unlock(); - bch2_recalc_capacity(c); -+ up_write(&c->state_lock); - - c->recovery_task = current; - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) -@@ -1045,35 +1203,30 @@ int bch2_fs_start(struct bch_fs *c) - if (ret) - goto err; - -- ret = bch2_opts_check_may_set(c); -+ ret = bch2_opts_hooks_pre_set(c); - if (ret) - goto err; - - if (bch2_fs_init_fault("fs_start")) { -- bch_err(c, "fs_start fault injected"); -- ret = -EINVAL; -+ ret = -BCH_ERR_injected_fs_start; - goto err; - } - - set_bit(BCH_FS_started, &c->flags); -+ wake_up(&c->ro_ref_wait); - -- if (c->opts.read_only) { -+ down_write(&c->state_lock); -+ if (c->opts.read_only) - bch2_fs_read_only(c); -- } else { -- ret = !test_bit(BCH_FS_rw, &c->flags) -- ? bch2_fs_read_write(c) -- : bch2_fs_read_write_late(c); -- if (ret) -- goto err; -- } -+ else if (!test_bit(BCH_FS_rw, &c->flags)) -+ ret = bch2_fs_read_write(c); -+ up_write(&c->state_lock); - -- ret = 0; - err: - if (ret) - bch_err_msg(c, ret, "starting filesystem"); - else - bch_verbose(c, "done starting filesystem"); -- up_write(&c->state_lock); - return ret; - } - -@@ -1182,6 +1335,18 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, - - /* Device startup/shutdown: */ - -+static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) -+{ -+ if (rw == READ) -+ clear_bit(ca->dev_idx, ca->fs->online_devs.d); -+ -+ if (!enumerated_ref_is_zero(&ca->io_ref[rw])) -+ enumerated_ref_stop(&ca->io_ref[rw], -+ rw == READ -+ ? bch2_dev_read_refs -+ : bch2_dev_write_refs); -+} -+ - static void bch2_dev_release(struct kobject *kobj) - { - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -@@ -1191,6 +1356,9 @@ static void bch2_dev_release(struct kobject *kobj) - - static void bch2_dev_free(struct bch_dev *ca) - { -+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); -+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); -+ - cancel_work_sync(&ca->io_error_work); - - bch2_dev_unlink(ca); -@@ -1198,6 +1366,9 @@ static void bch2_dev_free(struct bch_dev *ca) - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - -+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); -+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); -+ - bch2_free_super(&ca->disk_sb); - bch2_dev_allocator_background_exit(ca); - bch2_dev_journal_exit(ca); -@@ -1209,7 +1380,8 @@ static void bch2_dev_free(struct bch_dev *ca) - bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); - bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - -- percpu_ref_exit(&ca->io_ref); -+ enumerated_ref_exit(&ca->io_ref[WRITE]); -+ enumerated_ref_exit(&ca->io_ref[READ]); - #ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_exit(&ca->ref); - #endif -@@ -1221,14 +1393,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) - - lockdep_assert_held(&c->state_lock); - -- if (percpu_ref_is_zero(&ca->io_ref)) -+ if (enumerated_ref_is_zero(&ca->io_ref[READ])) - return; - - __bch2_dev_read_only(c, ca); - -- reinit_completion(&ca->io_ref_completion); -- percpu_ref_kill(&ca->io_ref); -- wait_for_completion(&ca->io_ref_completion); -+ bch2_dev_io_ref_stop(ca, READ); - - bch2_dev_unlink(ca); - -@@ -1245,13 +1415,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) - } - #endif - --static void bch2_dev_io_ref_complete(struct percpu_ref *ref) --{ -- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); -- -- complete(&ca->io_ref_completion); --} -- - static void bch2_dev_unlink(struct bch_dev *ca) - { - struct kobject *b; -@@ -1280,8 +1443,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) - return 0; - - if (!ca->kobj.state_in_sysfs) { -- ret = kobject_add(&ca->kobj, &c->kobj, -- "dev-%u", ca->dev_idx); -+ ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: -+ bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); - if (ret) - return ret; - } -@@ -1313,7 +1476,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - - kobject_init(&ca->kobj, &bch2_dev_ktype); - init_completion(&ca->ref_completion); -- init_completion(&ca->io_ref_completion); - - INIT_WORK(&ca->io_error_work, bch2_io_error_work); - -@@ -1337,10 +1499,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - atomic_long_set(&ca->ref, 1); - #endif - -+ mutex_init(&ca->bucket_backpointer_mismatch.lock); -+ mutex_init(&ca->bucket_backpointer_empty.lock); -+ - bch2_dev_allocator_background_init(ca); - -- if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, -- PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || -+ enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || - !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || - bch2_dev_buckets_alloc(c, ca) || - !(ca->io_done = alloc_percpu(*ca->io_done))) -@@ -1357,7 +1522,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, - { - ca->dev_idx = dev_idx; - __set_bit(ca->dev_idx, ca->self.d); -- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); -+ -+ if (!ca->name[0]) -+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); - - ca->fs = c; - rcu_assign_pointer(c->devs[ca->dev_idx], ca); -@@ -1402,19 +1569,32 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) - return -BCH_ERR_device_size_too_small; - } - -- BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); -+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); -+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - - ret = bch2_dev_journal_init(ca, sb->sb); - if (ret) - return ret; - -+ struct printbuf name = PRINTBUF; -+ prt_bdevname(&name, sb->bdev); -+ strscpy(ca->name, name.buf, sizeof(ca->name)); -+ printbuf_exit(&name); -+ - /* Commit: */ - ca->disk_sb = *sb; - memset(sb, 0, sizeof(*sb)); - -+ /* -+ * Stash pointer to the filesystem for blk_holder_ops - note that once -+ * attached to a filesystem, we will always close the block device -+ * before tearing down the filesystem object. -+ */ -+ ca->disk_sb.holder->c = ca->fs; -+ - ca->dev = ca->disk_sb.bdev->bd_dev; - -- percpu_ref_reinit(&ca->io_ref); -+ enumerated_ref_start(&ca->io_ref[READ]); - - return 0; - } -@@ -1438,18 +1618,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) - if (ret) - return ret; - -- bch2_dev_sysfs_online(c, ca); -- -- struct printbuf name = PRINTBUF; -- prt_bdevname(&name, ca->disk_sb.bdev); -- -- if (c->sb.nr_devices == 1) -- strscpy(c->name, name.buf, sizeof(c->name)); -- strscpy(ca->name, name.buf, sizeof(ca->name)); -+ set_bit(ca->dev_idx, c->online_devs.d); - -- printbuf_exit(&name); -+ bch2_dev_sysfs_online(c, ca); - -- rebalance_wakeup(c); -+ bch2_rebalance_wakeup(c); - return 0; - } - -@@ -1499,7 +1672,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, - return true; - - /* do we have enough devices to read from? */ -- new_online_devs = bch2_online_devs(c); -+ new_online_devs = c->online_devs; - __clear_bit(ca->dev_idx, new_online_devs.d); - - return bch2_have_enough_devs(c, new_online_devs, flags, false); -@@ -1508,42 +1681,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, - } - } - --static bool bch2_fs_may_start(struct bch_fs *c) --{ -- struct bch_dev *ca; -- unsigned i, flags = 0; -- -- if (c->opts.very_degraded) -- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; -- -- if (c->opts.degraded) -- flags |= BCH_FORCE_IF_DEGRADED; -- -- if (!c->opts.degraded && -- !c->opts.very_degraded) { -- mutex_lock(&c->sb_lock); -- -- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -- if (!bch2_member_exists(c->disk_sb.sb, i)) -- continue; -- -- ca = bch2_dev_locked(c, i); -- -- if (!bch2_dev_is_online(ca) && -- (ca->mi.state == BCH_MEMBER_STATE_rw || -- ca->mi.state == BCH_MEMBER_STATE_ro)) { -- mutex_unlock(&c->sb_lock); -- return false; -- } -- } -- mutex_unlock(&c->sb_lock); -- } -- -- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); --} -- - static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) - { -+ bch2_dev_io_ref_stop(ca, WRITE); -+ - /* - * The allocator thread itself allocates btree nodes, so stop it first: - */ -@@ -1560,6 +1701,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) - - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); -+ -+ if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) -+ enumerated_ref_start(&ca->io_ref[WRITE]); -+ - bch2_dev_do_discards(ca); - } - -@@ -1589,7 +1734,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - if (new_state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - -- rebalance_wakeup(c); -+ bch2_rebalance_wakeup(c); - - return ret; - } -@@ -1612,6 +1757,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) - { - struct bch_member *m; - unsigned dev_idx = ca->dev_idx, data; -+ bool fast_device_removal = !bch2_request_incompat_feature(c, -+ bcachefs_metadata_version_fast_device_removal); - int ret; - - down_write(&c->state_lock); -@@ -1630,11 +1777,25 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) - - __bch2_dev_read_only(c, ca); - -- ret = bch2_dev_data_drop(c, ca->dev_idx, flags); -- bch_err_msg(ca, ret, "bch2_dev_data_drop()"); -+ ret = fast_device_removal -+ ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) -+ : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: -+ bch2_dev_remove_stripes(c, ca->dev_idx, flags)); - if (ret) - goto err; - -+ /* Check if device still has data before blowing away alloc info */ -+ struct bch_dev_usage usage = bch2_dev_usage_read(ca); -+ for (unsigned i = 0; i < BCH_DATA_NR; i++) -+ if (!data_type_is_empty(i) && -+ !data_type_is_hidden(i) && -+ usage.buckets[i]) { -+ bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", -+ __bch2_data_types[i], usage.buckets[i]); -+ ret = -EBUSY; -+ goto err; -+ } -+ - ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); - if (ret) -@@ -1698,7 +1859,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) - */ - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); -- memset(&m->uuid, 0, sizeof(m->uuid)); -+ -+ if (fast_device_removal) -+ m->uuid = BCH_SB_MEMBER_DELETED_UUID; -+ else -+ memset(&m->uuid, 0, sizeof(m->uuid)); - - bch2_write_super(c); - -@@ -1706,8 +1871,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) - up_write(&c->state_lock); - return 0; - err: -- if (ca->mi.state == BCH_MEMBER_STATE_rw && -- !percpu_ref_is_zero(&ca->io_ref)) -+ if (test_bit(BCH_FS_rw, &c->flags) && -+ ca->mi.state == BCH_MEMBER_STATE_rw && -+ !enumerated_ref_is_zero(&ca->io_ref[READ])) - __bch2_dev_read_write(c, ca); - up_write(&c->state_lock); - return ret; -@@ -1717,11 +1883,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) - int bch2_dev_add(struct bch_fs *c, const char *path) - { - struct bch_opts opts = bch2_opts_empty(); -- struct bch_sb_handle sb; -+ struct bch_sb_handle sb = {}; - struct bch_dev *ca = NULL; - struct printbuf errbuf = PRINTBUF; - struct printbuf label = PRINTBUF; -- int ret; -+ int ret = 0; - - ret = bch2_read_super(path, &opts, &sb); - bch_err_msg(c, ret, "reading super"); -@@ -1738,6 +1904,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - } - } - -+ if (list_empty(&c->list)) { -+ mutex_lock(&bch_fs_list_lock); -+ if (__bch2_uuid_to_fs(c->sb.uuid)) -+ ret = -BCH_ERR_filesystem_uuid_already_open; -+ else -+ list_add(&c->list, &bch_fs_list); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (ret) { -+ bch_err(c, "filesystem UUID already open"); -+ goto err; -+ } -+ } -+ - ret = bch2_dev_may_add(sb.sb, c); - if (ret) - goto err; -@@ -1754,6 +1934,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); -+ SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); - - ret = bch2_sb_from_fs(c, ca); - bch_err_msg(c, ret, "setting up new superblock"); -@@ -1769,6 +1950,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - goto err_unlock; - } - unsigned dev_idx = ret; -+ ret = 0; - - /* success: */ - -@@ -1788,27 +1970,29 @@ int bch2_dev_add(struct bch_fs *c, const char *path) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - -- ret = bch2_dev_usage_init(ca, false); -- if (ret) -- goto err_late; -+ if (test_bit(BCH_FS_started, &c->flags)) { -+ ret = bch2_dev_usage_init(ca, false); -+ if (ret) -+ goto err_late; - -- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); -- bch_err_msg(ca, ret, "marking new superblock"); -- if (ret) -- goto err_late; -+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); -+ bch_err_msg(ca, ret, "marking new superblock"); -+ if (ret) -+ goto err_late; - -- ret = bch2_fs_freespace_init(c); -- bch_err_msg(ca, ret, "initializing free space"); -- if (ret) -- goto err_late; -+ ret = bch2_fs_freespace_init(c); -+ bch_err_msg(ca, ret, "initializing free space"); -+ if (ret) -+ goto err_late; - -- if (ca->mi.state == BCH_MEMBER_STATE_rw) -- __bch2_dev_read_write(c, ca); -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) -+ __bch2_dev_read_write(c, ca); - -- ret = bch2_dev_journal_alloc(ca, false); -- bch_err_msg(c, ret, "allocating journal"); -- if (ret) -- goto err_late; -+ ret = bch2_dev_journal_alloc(ca, false); -+ bch_err_msg(c, ret, "allocating journal"); -+ if (ret) -+ goto err_late; -+ } - - up_write(&c->state_lock); - out: -@@ -1919,6 +2103,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) - return 0; - } - -+static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) -+{ -+ struct bch_fs *c = ca->fs; -+ u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; -+ -+ return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, -+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type, -+ .dev = ca->dev_idx, -+ .data_type = BCH_DATA_free)) ?: -+ bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); -+} -+ - int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - { - struct bch_member *m; -@@ -1966,16 +2162,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_dev_data_type, -- .dev_data_type.dev = ca->dev_idx, -- .dev_data_type.data_type = BCH_DATA_free, -- }; -- u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; -- -- ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, -- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: -- bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); -+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); - if (ret) - goto err; - } -@@ -1986,6 +2173,49 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) - return ret; - } - -+int bch2_fs_resize_on_mount(struct bch_fs *c) -+{ -+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { -+ u64 old_nbuckets = ca->mi.nbuckets; -+ u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), -+ ca->mi.bucket_size); -+ -+ if (ca->mi.resize_on_mount && -+ new_nbuckets > ca->mi.nbuckets) { -+ bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); -+ int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); -+ bch_err_fn(ca, ret); -+ if (ret) { -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_fs_resize_on_mount); -+ up_write(&c->state_lock); -+ return ret; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ struct bch_member *m = -+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); -+ m->nbuckets = cpu_to_le64(new_nbuckets); -+ SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); -+ -+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (ca->mi.freespace_initialized) { -+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); -+ if (ret) { -+ enumerated_ref_put(&ca->io_ref[READ], -+ BCH_DEV_READ_REF_fs_resize_on_mount); -+ up_write(&c->state_lock); -+ return ret; -+ } -+ } -+ } -+ } -+ return 0; -+} -+ - /* return with ref on ca->ref: */ - struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) - { -@@ -1998,6 +2228,114 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) - return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); - } - -+/* blk_holder_ops: */ -+ -+static struct bch_fs *bdev_get_fs(struct block_device *bdev) -+ __releases(&bdev->bd_holder_lock) -+{ -+ struct bch_sb_handle_holder *holder = bdev->bd_holder; -+ struct bch_fs *c = holder->c; -+ -+ if (c && !bch2_ro_ref_tryget(c)) -+ c = NULL; -+ -+ mutex_unlock(&bdev->bd_holder_lock); -+ -+ if (c) -+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); -+ return c; -+} -+ -+/* returns with ref on ca->ref */ -+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) -+{ -+ for_each_member_device(c, ca) -+ if (ca->disk_sb.bdev == bdev) -+ return ca; -+ return NULL; -+} -+ -+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) -+{ -+ struct bch_fs *c = bdev_get_fs(bdev); -+ if (!c) -+ return; -+ -+ struct super_block *sb = c->vfs_sb; -+ if (sb) { -+ /* -+ * Not necessary, c->ro_ref guards against the filesystem being -+ * unmounted - we only take this to avoid a warning in -+ * sync_filesystem: -+ */ -+ down_read(&sb->s_umount); -+ } -+ -+ down_write(&c->state_lock); -+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev); -+ if (!ca) -+ goto unlock; -+ -+ bool dev = bch2_dev_state_allowed(c, ca, -+ BCH_MEMBER_STATE_failed, -+ BCH_FORCE_IF_DEGRADED); -+ -+ if (!dev && sb) { -+ if (!surprise) -+ sync_filesystem(sb); -+ shrink_dcache_sb(sb); -+ evict_inodes(sb); -+ } -+ -+ struct printbuf buf = PRINTBUF; -+ __bch2_log_msg_start(ca->name, &buf); -+ -+ prt_printf(&buf, "offline from block layer"); -+ -+ if (dev) { -+ __bch2_dev_offline(c, ca); -+ } else { -+ bch2_journal_flush(&c->journal); -+ bch2_fs_emergency_read_only2(c, &buf); -+ } -+ -+ bch2_print_str(c, KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ -+ bch2_dev_put(ca); -+unlock: -+ if (sb) -+ up_read(&sb->s_umount); -+ up_write(&c->state_lock); -+ bch2_ro_ref_put(c); -+} -+ -+static void bch2_fs_bdev_sync(struct block_device *bdev) -+{ -+ struct bch_fs *c = bdev_get_fs(bdev); -+ if (!c) -+ return; -+ -+ struct super_block *sb = c->vfs_sb; -+ if (sb) { -+ /* -+ * Not necessary, c->ro_ref guards against the filesystem being -+ * unmounted - we only take this to avoid a warning in -+ * sync_filesystem: -+ */ -+ down_read(&sb->s_umount); -+ sync_filesystem(sb); -+ up_read(&sb->s_umount); -+ } -+ -+ bch2_ro_ref_put(c); -+} -+ -+const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -+ .mark_dead = bch2_fs_bdev_mark_dead, -+ .sync = bch2_fs_bdev_sync, -+}; -+ - /* Filesystem open: */ - - static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) -@@ -2006,10 +2344,10 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) - cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); - } - --struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, -- struct bch_opts opts) -+struct bch_fs *bch2_fs_open(darray_const_str *devices, -+ struct bch_opts *opts) - { -- DARRAY(struct bch_sb_handle) sbs = { 0 }; -+ bch_sb_handles sbs = {}; - struct bch_fs *c = NULL; - struct bch_sb_handle *best = NULL; - struct printbuf errbuf = PRINTBUF; -@@ -2018,26 +2356,26 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - if (!try_module_get(THIS_MODULE)) - return ERR_PTR(-ENODEV); - -- if (!nr_devices) { -+ if (!devices->nr) { - ret = -EINVAL; - goto err; - } - -- ret = darray_make_room(&sbs, nr_devices); -+ ret = darray_make_room(&sbs, devices->nr); - if (ret) - goto err; - -- for (unsigned i = 0; i < nr_devices; i++) { -+ darray_for_each(*devices, i) { - struct bch_sb_handle sb = { NULL }; - -- ret = bch2_read_super(devices[i], &opts, &sb); -+ ret = bch2_read_super(*i, opts, &sb); - if (ret) - goto err; - - BUG_ON(darray_push(&sbs, sb)); - } - -- if (opts.nochanges && !opts.read_only) { -+ if (opts->nochanges && !opts->read_only) { - ret = -BCH_ERR_erofs_nochanges; - goto err_print; - } -@@ -2047,7 +2385,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - best = sb; - - darray_for_each_reverse(sbs, sb) { -- ret = bch2_dev_in_fs(best, sb, &opts); -+ ret = bch2_dev_in_fs(best, sb, opts); - - if (ret == -BCH_ERR_device_has_been_removed || - ret == -BCH_ERR_device_splitbrain) { -@@ -2062,7 +2400,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - goto err_print; - } - -- c = bch2_fs_alloc(best->sb, opts); -+ c = bch2_fs_alloc(best->sb, opts, &sbs); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; -@@ -2077,11 +2415,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - } - up_write(&c->state_lock); - -- if (!bch2_fs_may_start(c)) { -- ret = -BCH_ERR_insufficient_devices_to_start; -- goto err_print; -- } -- - if (!c->opts.nostart) { - ret = bch2_fs_start(c); - if (ret) -@@ -2096,7 +2429,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - return c; - err_print: - pr_err("bch_fs_open err opening %s: %s", -- devices[0], bch2_err_str(ret)); -+ devices->data[0], bch2_err_str(ret)); - err: - if (!IS_ERR_OR_NULL(c)) - bch2_fs_stop(c); -@@ -2133,16 +2466,52 @@ static int __init bcachefs_init(void) - return -ENOMEM; - } - --#define BCH_DEBUG_PARAM(name, description) \ -- bool bch2_##name; \ -- module_param_named(name, bch2_##name, bool, 0644); \ -+#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); -+BCH_DEBUG_PARAMS_ALL() -+#undef BCH_DEBUG_PARAM -+ -+static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) -+{ -+ /* Match bool exactly, by re-using it. */ -+ struct static_key *key = kp->arg; -+ struct kernel_param boolkp = *kp; -+ bool v; -+ int ret; -+ -+ boolkp.arg = &v; -+ -+ ret = param_set_bool(val, &boolkp); -+ if (ret) -+ return ret; -+ if (v) -+ static_key_enable(key); -+ else -+ static_key_disable(key); -+ return 0; -+} -+ -+static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) -+{ -+ struct static_key *key = kp->arg; -+ return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'N' : 'Y'); -+} -+ -+static const struct kernel_param_ops bch2_param_ops_static_key_t = { -+ .flags = KERNEL_PARAM_OPS_FL_NOARG, -+ .set = bch2_param_set_static_key_t, -+ .get = bch2_param_get_static_key_t, -+}; -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ -+ __MODULE_PARM_TYPE(name, "static_key_t"); \ - MODULE_PARM_DESC(name, description); - BCH_DEBUG_PARAMS() - #undef BCH_DEBUG_PARAM - - __maybe_unused - static unsigned bch2_metadata_version = bcachefs_metadata_version_current; --module_param_named(version, bch2_metadata_version, uint, 0400); -+module_param_named(version, bch2_metadata_version, uint, 0444); - - module_exit(bcachefs_exit); - module_init(bcachefs_init); -diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h -index 04f8287eff5c..dc52f06cb2b9 100644 ---- a/fs/bcachefs/super.h -+++ b/fs/bcachefs/super.h -@@ -9,6 +9,9 @@ - #include - - extern const char * const bch2_fs_flag_strs[]; -+extern const char * const bch2_write_refs[]; -+extern const char * const bch2_dev_read_refs[]; -+extern const char * const bch2_dev_write_refs[]; - - struct bch_fs *bch2_dev_to_fs(dev_t); - struct bch_fs *bch2_uuid_to_fs(__uuid_t); -@@ -29,17 +32,23 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); - struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); - - bool bch2_fs_emergency_read_only(struct bch_fs *); -+bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); -+ - bool bch2_fs_emergency_read_only_locked(struct bch_fs *); - void bch2_fs_read_only(struct bch_fs *); - - int bch2_fs_read_write(struct bch_fs *); - int bch2_fs_read_write_early(struct bch_fs *); - -+int bch2_fs_resize_on_mount(struct bch_fs *); -+ - void __bch2_fs_stop(struct bch_fs *); - void bch2_fs_free(struct bch_fs *); - void bch2_fs_stop(struct bch_fs *); - - int bch2_fs_start(struct bch_fs *); --struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -+struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); -+ -+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; - - #endif /* _BCACHEFS_SUPER_H */ -diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h -index 368a63d938cf..3a899f799d1d 100644 ---- a/fs/bcachefs/super_types.h -+++ b/fs/bcachefs/super_types.h -@@ -2,13 +2,19 @@ - #ifndef _BCACHEFS_SUPER_TYPES_H - #define _BCACHEFS_SUPER_TYPES_H - -+struct bch_fs; -+ -+struct bch_sb_handle_holder { -+ struct bch_fs *c; -+}; -+ - struct bch_sb_handle { - struct bch_sb *sb; - struct file *s_bdev_file; - struct block_device *bdev; - char *sb_name; - struct bio *bio; -- void *holder; -+ struct bch_sb_handle_holder *holder; - size_t buffer_size; - blk_mode_t mode; - unsigned have_layout:1; -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -index a7eb1f511484..0101eb025117 100644 ---- a/fs/bcachefs/sysfs.c -+++ b/fs/bcachefs/sysfs.c -@@ -25,6 +25,7 @@ - #include "disk_accounting.h" - #include "disk_groups.h" - #include "ec.h" -+#include "enumerated_ref.h" - #include "inode.h" - #include "journal.h" - #include "journal_reclaim.h" -@@ -34,12 +35,14 @@ - #include "nocow_locking.h" - #include "opts.h" - #include "rebalance.h" -+#include "recovery_passes.h" - #include "replicas.h" - #include "super-io.h" - #include "tests.h" - - #include - #include -+#include - #include - - #include "util.h" -@@ -145,16 +148,18 @@ write_attribute(trigger_journal_flush); - write_attribute(trigger_journal_writes); - write_attribute(trigger_btree_cache_shrink); - write_attribute(trigger_btree_key_cache_shrink); -+write_attribute(trigger_btree_updates); - write_attribute(trigger_freelist_wakeup); -+write_attribute(trigger_recalc_capacity); -+write_attribute(trigger_delete_dead_snapshots); - read_attribute(gc_gens_pos); -+__sysfs_attribute(read_fua_test, 0400); - - read_attribute(uuid); - read_attribute(minor); - read_attribute(flags); --read_attribute(bucket_size); - read_attribute(first_bucket); - read_attribute(nbuckets); --rw_attribute(durability); - read_attribute(io_done); - read_attribute(io_errors); - write_attribute(io_errors_reset); -@@ -173,31 +178,13 @@ read_attribute(journal_debug); - read_attribute(btree_cache); - read_attribute(btree_key_cache); - read_attribute(btree_reserve_cache); --read_attribute(stripes_heap); - read_attribute(open_buckets); - read_attribute(open_buckets_partial); --read_attribute(write_points); - read_attribute(nocow_lock_table); - --#ifdef BCH_WRITE_REF_DEBUG -+read_attribute(read_refs); - read_attribute(write_refs); - --static const char * const bch2_write_refs[] = { --#define x(n) #n, -- BCH_WRITE_REFS() --#undef x -- NULL --}; -- --static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) --{ -- bch2_printbuf_tabstop_push(out, 24); -- -- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) -- prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); --} --#endif -- - read_attribute(internal_uuid); - read_attribute(disk_groups); - -@@ -209,14 +196,14 @@ read_attribute(usage_base); - BCH_PERSISTENT_COUNTERS() - #undef x - --rw_attribute(discard); --read_attribute(state); - rw_attribute(label); - - read_attribute(copy_gc_wait); - - sysfs_pd_controller_attribute(rebalance); - read_attribute(rebalance_status); -+read_attribute(snapshot_delete_status); -+read_attribute(recovery_status); - - read_attribute(new_stripes); - -@@ -262,10 +249,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c - prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); - - for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { -- struct disk_accounting_pos a = { -- .type = BCH_DISK_ACCOUNTING_compression, -- .compression.type = i, -- }; -+ struct disk_accounting_pos a; -+ disk_accounting_key_init(a, compression, .type = i); - struct bpos p = disk_accounting_pos_to_bpos(&a); - u64 v[3]; - bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); -@@ -315,6 +300,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) - prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); - } - -+static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) -+{ -+ struct bch_fs *c = ca->fs; -+ struct bio *bio = NULL; -+ void *buf = NULL; -+ unsigned bs = c->opts.block_size, iters; -+ u64 end, test_duration = NSEC_PER_SEC * 2; -+ struct bch2_time_stats stats_nofua, stats_fua, stats_random; -+ int ret = 0; -+ -+ bch2_time_stats_init_no_pcpu(&stats_nofua); -+ bch2_time_stats_init_no_pcpu(&stats_fua); -+ bch2_time_stats_init_no_pcpu(&stats_random); -+ -+ if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) { -+ prt_str(out, "offline\n"); -+ return 0; -+ } -+ -+ struct block_device *bdev = ca->disk_sb.bdev; -+ -+ bio = bio_kmalloc(1, GFP_KERNEL); -+ if (!bio) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ buf = kmalloc(bs, GFP_KERNEL); -+ if (!buf) -+ goto err; -+ -+ end = ktime_get_ns() + test_duration; -+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { -+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); -+ bch2_bio_map(bio, buf, bs); -+ -+ u64 submit_time = ktime_get_ns(); -+ ret = submit_bio_wait(bio); -+ bch2_time_stats_update(&stats_nofua, submit_time); -+ -+ if (ret) -+ goto err; -+ } -+ -+ end = ktime_get_ns() + test_duration; -+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { -+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); -+ bch2_bio_map(bio, buf, bs); -+ -+ u64 submit_time = ktime_get_ns(); -+ ret = submit_bio_wait(bio); -+ bch2_time_stats_update(&stats_fua, submit_time); -+ -+ if (ret) -+ goto err; -+ } -+ -+ u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); -+ -+ end = ktime_get_ns() + test_duration; -+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { -+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); -+ bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; -+ bch2_bio_map(bio, buf, bs); -+ -+ u64 submit_time = ktime_get_ns(); -+ ret = submit_bio_wait(bio); -+ bch2_time_stats_update(&stats_random, submit_time); -+ -+ if (ret) -+ goto err; -+ } -+ -+ u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); -+ u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); -+ u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); -+ -+ u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); -+ u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); -+ u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); -+ -+ printbuf_tabstop_push(out, 8); -+ printbuf_tabstop_push(out, 12); -+ printbuf_tabstop_push(out, 12); -+ prt_printf(out, "This test must be run on an idle drive for accurate results\n"); -+ prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); -+ prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev))); -+ prt_newline(out); -+ prt_printf(out, "ns:\tlatency\rstddev\r\n"); -+ prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); -+ prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); -+ prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); -+ -+ bool read_cache = ns_nofua * 2 < ns_rand; -+ bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; -+ -+ if (!read_cache) -+ prt_str(out, "reads don't appear to be cached - safe\n"); -+ else if (!fua_cached) -+ prt_str(out, "fua reads don't appear to be cached - safe\n"); -+ else -+ prt_str(out, "fua reads appear to be cached - unsafe\n"); -+err: -+ kfree(buf); -+ kfree(bio); -+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test); -+ bch_err_fn(c, ret); -+ return ret; -+} -+ - SHOW(bch2_fs) - { - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -@@ -341,6 +436,12 @@ SHOW(bch2_fs) - if (attr == &sysfs_rebalance_status) - bch2_rebalance_status_to_text(out, c); - -+ if (attr == &sysfs_snapshot_delete_status) -+ bch2_snapshot_delete_status_to_text(out, c); -+ -+ if (attr == &sysfs_recovery_status) -+ bch2_recovery_pass_status_to_text(out, c); -+ - /* Debugging: */ - - if (attr == &sysfs_journal_debug) -@@ -355,18 +456,12 @@ SHOW(bch2_fs) - if (attr == &sysfs_btree_reserve_cache) - bch2_btree_reserve_cache_to_text(out, c); - -- if (attr == &sysfs_stripes_heap) -- bch2_stripes_heap_to_text(out, c); -- - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, NULL); - - if (attr == &sysfs_open_buckets_partial) - bch2_open_buckets_partial_to_text(out, c); - -- if (attr == &sysfs_write_points) -- bch2_write_points_to_text(out, c); -- - if (attr == &sysfs_compression_stats) - bch2_compression_stats_to_text(out, c); - -@@ -382,10 +477,8 @@ SHOW(bch2_fs) - if (attr == &sysfs_moving_ctxts) - bch2_fs_moving_ctxts_to_text(out, c); - --#ifdef BCH_WRITE_REF_DEBUG - if (attr == &sysfs_write_refs) -- bch2_write_refs_to_text(out, c); --#endif -+ enumerated_ref_to_text(out, &c->writes, bch2_write_refs); - - if (attr == &sysfs_nocow_lock_table) - bch2_nocow_locks_to_text(out, &c->nocow_locks); -@@ -415,7 +508,10 @@ STORE(bch2_fs) - - /* Debugging: */ - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) -+ if (attr == &sysfs_trigger_btree_updates) -+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -+ -+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) - return -EROFS; - - if (attr == &sysfs_trigger_btree_cache_shrink) { -@@ -455,6 +551,15 @@ STORE(bch2_fs) - if (attr == &sysfs_trigger_freelist_wakeup) - closure_wake_up(&c->freelist_wait); - -+ if (attr == &sysfs_trigger_recalc_capacity) { -+ down_read(&c->state_lock); -+ bch2_recalc_capacity(c); -+ up_read(&c->state_lock); -+ } -+ -+ if (attr == &sysfs_trigger_delete_dead_snapshots) -+ __bch2_delete_dead_snapshots(c); -+ - #ifdef CONFIG_BCACHEFS_TESTS - if (attr == &sysfs_perf_test) { - char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; -@@ -475,7 +580,7 @@ STORE(bch2_fs) - size = ret; - } - #endif -- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return size; - } - SYSFS_OPS(bch2_fs); -@@ -486,6 +591,8 @@ struct attribute *bch2_fs_files[] = { - &sysfs_btree_write_stats, - - &sysfs_rebalance_status, -+ &sysfs_snapshot_delete_status, -+ &sysfs_recovery_status, - - &sysfs_compression_stats, - -@@ -566,13 +673,9 @@ struct attribute *bch2_fs_internal_files[] = { - &sysfs_btree_key_cache, - &sysfs_btree_reserve_cache, - &sysfs_new_stripes, -- &sysfs_stripes_heap, - &sysfs_open_buckets, - &sysfs_open_buckets_partial, -- &sysfs_write_points, --#ifdef BCH_WRITE_REF_DEBUG - &sysfs_write_refs, --#endif - &sysfs_nocow_lock_table, - &sysfs_io_timers_read, - &sysfs_io_timers_write, -@@ -584,7 +687,10 @@ struct attribute *bch2_fs_internal_files[] = { - &sysfs_trigger_journal_writes, - &sysfs_trigger_btree_cache_shrink, - &sysfs_trigger_btree_key_cache_shrink, -+ &sysfs_trigger_btree_updates, - &sysfs_trigger_freelist_wakeup, -+ &sysfs_trigger_recalc_capacity, -+ &sysfs_trigger_delete_dead_snapshots, - - &sysfs_gc_gens_pos, - -@@ -604,87 +710,115 @@ struct attribute *bch2_fs_internal_files[] = { - - /* options */ - --SHOW(bch2_fs_opts_dir) -+static ssize_t sysfs_opt_show(struct bch_fs *c, -+ struct bch_dev *ca, -+ enum bch_opt_id id, -+ struct printbuf *out) - { -- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -- const struct bch_option *opt = container_of(attr, struct bch_option, attr); -- int id = opt - bch2_opt_table; -- u64 v = bch2_opt_get_by_id(&c->opts, id); -+ const struct bch_option *opt = bch2_opt_table + id; -+ u64 v; -+ -+ if (opt->flags & OPT_FS) { -+ v = bch2_opt_get_by_id(&c->opts, id); -+ } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { -+ v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); -+ } else { -+ return -EINVAL; -+ } - - bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); - prt_char(out, '\n'); -- - return 0; - } - --STORE(bch2_fs_opts_dir) -+static ssize_t sysfs_opt_store(struct bch_fs *c, -+ struct bch_dev *ca, -+ enum bch_opt_id id, -+ const char *buf, size_t size) - { -- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -- const struct bch_option *opt = container_of(attr, struct bch_option, attr); -- int ret, id = opt - bch2_opt_table; -- char *tmp; -- u64 v; -+ const struct bch_option *opt = bch2_opt_table + id; -+ int ret = 0; - - /* - * We don't need to take c->writes for correctness, but it eliminates an - * unsightly error message in the dmesg log when we're RO: - */ -- if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) -+ if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) - return -EROFS; - -- tmp = kstrdup(buf, GFP_KERNEL); -+ char *tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto err; - } - -- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); -+ u64 v; -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: -+ bch2_opt_hook_pre_set(c, ca, id, v); - kfree(tmp); - - if (ret < 0) - goto err; - -- ret = bch2_opt_check_may_set(c, id, v); -- if (ret < 0) -- goto err; -- -- bch2_opt_set_sb(c, NULL, opt, v); -- bch2_opt_set_by_id(&c->opts, id, v); -- -- if (v && -- (id == Opt_background_target || -- id == Opt_background_compression || -- (id == Opt_compression && !c->opts.background_compression))) -- bch2_set_rebalance_needs_scan(c, 0); -+ bool is_sb = opt->get_sb || opt->get_member; -+ bool changed = false; -+ -+ if (is_sb) { -+ changed = bch2_opt_set_sb(c, ca, opt, v); -+ } else if (!ca) { -+ changed = bch2_opt_get_by_id(&c->opts, id) != v; -+ } else { -+ /* device options that aren't superblock options aren't -+ * supported */ -+ BUG(); -+ } - -- if (v && id == Opt_rebalance_enabled) -- rebalance_wakeup(c); -+ if (!ca) -+ bch2_opt_set_by_id(&c->opts, id, v); - -- if (v && id == Opt_copygc_enabled && -- c->copygc_thread) -- wake_up_process(c->copygc_thread); -+ if (changed) -+ bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); - - ret = size; - err: -- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); -+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return ret; - } -+ -+SHOW(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ int id = bch2_opt_lookup(attr->name); -+ if (id < 0) -+ return 0; -+ -+ return sysfs_opt_show(c, NULL, id, out); -+} -+ -+STORE(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ int id = bch2_opt_lookup(attr->name); -+ if (id < 0) -+ return 0; -+ -+ return sysfs_opt_store(c, NULL, id, buf, size); -+} - SYSFS_OPS(bch2_fs_opts_dir); - - struct attribute *bch2_fs_opts_dir_files[] = { NULL }; - --int bch2_opts_create_sysfs_files(struct kobject *kobj) -+int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) - { -- const struct bch_option *i; -- int ret; -- -- for (i = bch2_opt_table; -+ for (const struct bch_option *i = bch2_opt_table; - i < bch2_opt_table + bch2_opts_nr; - i++) { -- if (!(i->flags & OPT_FS)) -+ if (i->flags & OPT_HIDDEN) -+ continue; -+ if (!(i->flags & type)) - continue; - -- ret = sysfs_create_file(kobj, &i->attr); -+ int ret = sysfs_create_file(kobj, &i->attr); - if (ret) - return ret; - } -@@ -755,11 +889,8 @@ SHOW(bch2_dev) - - sysfs_printf(uuid, "%pU\n", ca->uuid.b); - -- sysfs_print(bucket_size, bucket_bytes(ca)); - sysfs_print(first_bucket, ca->mi.first_bucket); - sysfs_print(nbuckets, ca->mi.nbuckets); -- sysfs_print(durability, ca->mi.durability); -- sysfs_print(discard, ca->mi.discard); - - if (attr == &sysfs_label) { - if (ca->mi.group) -@@ -772,11 +903,6 @@ SHOW(bch2_dev) - prt_char(out, '\n'); - } - -- if (attr == &sysfs_state) { -- prt_string_option(out, bch2_member_states, ca->mi.state); -- prt_char(out, '\n'); -- } -- - if (attr == &sysfs_io_done) - dev_io_done_to_text(out, ca); - -@@ -802,6 +928,19 @@ SHOW(bch2_dev) - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, ca); - -+ if (attr == &sysfs_read_fua_test) -+ return bch2_read_fua_test(out, ca); -+ -+ int opt_id = bch2_opt_lookup(attr->name); -+ if (opt_id >= 0) -+ return sysfs_opt_show(c, ca, opt_id, out); -+ -+ if (attr == &sysfs_read_refs) -+ enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); -+ -+ if (attr == &sysfs_write_refs) -+ enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); -+ - return 0; - } - -@@ -810,18 +949,6 @@ STORE(bch2_dev) - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - -- if (attr == &sysfs_discard) { -- bool v = strtoul_or_return(buf); -- -- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); -- } -- -- if (attr == &sysfs_durability) { -- u64 v = strtoul_or_return(buf); -- -- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); -- } -- - if (attr == &sysfs_label) { - char *tmp; - int ret; -@@ -839,20 +966,20 @@ STORE(bch2_dev) - if (attr == &sysfs_io_errors_reset) - bch2_dev_errors_reset(ca); - -+ int opt_id = bch2_opt_lookup(attr->name); -+ if (opt_id >= 0) -+ return sysfs_opt_store(c, ca, opt_id, buf, size); -+ - return size; - } - SYSFS_OPS(bch2_dev); - - struct attribute *bch2_dev_files[] = { - &sysfs_uuid, -- &sysfs_bucket_size, - &sysfs_first_bucket, - &sysfs_nbuckets, -- &sysfs_durability, - - /* settings: */ -- &sysfs_discard, -- &sysfs_state, - &sysfs_label, - - &sysfs_has_data, -@@ -866,9 +993,14 @@ struct attribute *bch2_dev_files[] = { - &sysfs_io_latency_stats_write, - &sysfs_congested, - -+ &sysfs_read_fua_test, -+ - /* debug: */ - &sysfs_alloc_debug, - &sysfs_open_buckets, -+ -+ &sysfs_read_refs, -+ &sysfs_write_refs, - NULL - }; - -diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h -index 222cd5062702..303e0433c702 100644 ---- a/fs/bcachefs/sysfs.h -+++ b/fs/bcachefs/sysfs.h -@@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; - extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; - extern const struct sysfs_ops bch2_dev_sysfs_ops; - --int bch2_opts_create_sysfs_files(struct kobject *); -+int bch2_opts_create_sysfs_files(struct kobject *, unsigned); - - #else - -@@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; - static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; - static const struct sysfs_ops bch2_dev_sysfs_ops; - --static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } -+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -+{ return 0; } - - #endif /* NO_BCACHEFS_SYSFS */ - -diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c -index 6c6469814637..782a05fe7656 100644 ---- a/fs/bcachefs/tests.c -+++ b/fs/bcachefs/tests.c -@@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr) - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, -- bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) -@@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr) - - pr_info("deleting once"); - ret = commit_do(trans, NULL, NULL, 0, -- bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (first)"); - if (ret) -@@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr) - - pr_info("deleting twice"); - ret = commit_do(trans, NULL, NULL, 0, -- bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (second)"); - if (ret) -@@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, -- bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) -@@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) - bch2_journal_flush_all_pins(&c->journal); - - ret = commit_do(trans, NULL, NULL, 0, -- bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error"); - if (ret) -@@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) - */ - static int test_peek_end(struct bch_fs *c, u64 nr) - { -+ delete_test_keys(c); -+ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; -@@ -349,10 +351,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); -@@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) - - static int test_peek_end_extents(struct bch_fs *c, u64 nr) - { -+ delete_test_keys(c); -+ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; -@@ -369,10 +373,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); -@@ -488,7 +492,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) - trans = bch2_trans_get(c); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, snapid_lo), 0); -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - - BUG_ON(k.k->p.snapshot != U32_MAX); - -@@ -602,9 +606,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr) - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { -- bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); -+ bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); - -- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); -+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - ret = bkey_err(k); - if (ret) - break; -@@ -623,9 +627,9 @@ static int rand_mixed_trans(struct btree_trans *trans, - struct bkey_s_c k; - int ret; - -- bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); -+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX)); - -- k = bch2_btree_iter_peek(iter); -+ k = bch2_btree_iter_peek(trans, iter); - ret = bkey_err(k); - bch_err_msg(trans->c, ret, "lookup error"); - if (ret) -@@ -672,7 +676,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_intent); -- k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); -+ k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); - ret = bkey_err(k); - if (ret) - goto err; -diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c -index dea73bc1cb51..314a24d15d4e 100644 ---- a/fs/bcachefs/thread_with_file.c -+++ b/fs/bcachefs/thread_with_file.c -@@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki - struct stdio_buf *buf = &stdio->output; - unsigned long flags; - ssize_t ret; -- - again: -+ if (stdio->done) -+ return -EPIPE; -+ - spin_lock_irqsave(&buf->lock, flags); - ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); - spin_unlock_irqrestore(&buf->lock, flags); -diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h -index f4d484d44f63..254b8493ec4b 100644 ---- a/fs/bcachefs/thread_with_file_types.h -+++ b/fs/bcachefs/thread_with_file_types.h -@@ -2,7 +2,7 @@ - #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H - #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H - --#include "darray.h" -+#include - - struct stdio_buf { - spinlock_t lock; -diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c -index 3fe82757f93a..2c34fe4be912 100644 ---- a/fs/bcachefs/time_stats.c -+++ b/fs/bcachefs/time_stats.c -@@ -10,6 +10,9 @@ - #include "eytzinger.h" - #include "time_stats.h" - -+/* disable automatic switching to percpu mode */ -+#define TIME_STATS_NONPCPU ((unsigned long) 1) -+ - static const struct time_unit time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, -@@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) - { - unsigned long flags; - -- if (!stats->buffer) { -+ if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) { - spin_lock_irqsave(&stats->lock, flags); - time_stats_update_one(stats, start, end); - -- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && -+ if (!stats->buffer && -+ mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct time_stat_buffer, -@@ -157,7 +161,7 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) - unsigned offset = offsetof(struct bch2_time_stats, min_duration); - memset((void *) stats + offset, 0, sizeof(*stats) - offset); - -- if (stats->buffer) { -+ if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) { - int cpu; - for_each_possible_cpu(cpu) - per_cpu_ptr(stats->buffer, cpu)->nr = 0; -@@ -167,7 +171,9 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) - - void bch2_time_stats_exit(struct bch2_time_stats *stats) - { -- free_percpu(stats->buffer); -+ if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) -+ free_percpu(stats->buffer); -+ stats->buffer = NULL; - } - - void bch2_time_stats_init(struct bch2_time_stats *stats) -@@ -177,3 +183,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); - } -+ -+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats) -+{ -+ bch2_time_stats_init(stats); -+ stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU; -+} -diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h -index dc6493f7bbab..eddb0985bab4 100644 ---- a/fs/bcachefs/time_stats.h -+++ b/fs/bcachefs/time_stats.h -@@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) - void bch2_time_stats_reset(struct bch2_time_stats *); - void bch2_time_stats_exit(struct bch2_time_stats *); - void bch2_time_stats_init(struct bch2_time_stats *); -+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *); - - static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) - { -diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h -index c1b51009edf6..8cb5b40704fd 100644 ---- a/fs/bcachefs/trace.h -+++ b/fs/bcachefs/trace.h -@@ -295,12 +295,12 @@ TRACE_EVENT(write_super, - - /* io.c: */ - --DEFINE_EVENT(bio, read_promote, -+DEFINE_EVENT(bio, io_read_promote, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --TRACE_EVENT(read_nopromote, -+TRACE_EVENT(io_read_nopromote, - TP_PROTO(struct bch_fs *c, int ret), - TP_ARGS(c, ret), - -@@ -319,26 +319,55 @@ TRACE_EVENT(read_nopromote, - __entry->ret) - ); - --DEFINE_EVENT(bio, read_bounce, -+DEFINE_EVENT(bio, io_read_bounce, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --DEFINE_EVENT(bio, read_split, -+DEFINE_EVENT(bio, io_read_split, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --DEFINE_EVENT(bio, read_retry, -+DEFINE_EVENT(bio, io_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --DEFINE_EVENT(bio, read_reuse_race, -+DEFINE_EVENT(bio, io_read_reuse_race, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - -+DEFINE_EVENT(bio, io_read_fail_and_poison, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* ec.c */ -+ -+TRACE_EVENT(stripe_create, -+ TP_PROTO(struct bch_fs *c, u64 idx, int ret), -+ TP_ARGS(c, idx, ret), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, idx ) -+ __field(int, ret ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->idx = idx; -+ __entry->ret = ret; -+ ), -+ -+ TP_printk("%d,%d idx %llu ret %i", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->idx, -+ __entry->ret) -+); -+ - /* Journal */ - - DEFINE_EVENT(bch_fs, journal_full, -@@ -797,53 +826,37 @@ TRACE_EVENT(bucket_invalidate, - - /* Moving IO */ - --TRACE_EVENT(bucket_evacuate, -- TP_PROTO(struct bch_fs *c, struct bpos *bucket), -- TP_ARGS(c, bucket), -- -- TP_STRUCT__entry( -- __field(dev_t, dev ) -- __field(u32, dev_idx ) -- __field(u64, bucket ) -- ), -- -- TP_fast_assign( -- __entry->dev = c->dev; -- __entry->dev_idx = bucket->inode; -- __entry->bucket = bucket->offset; -- ), -- -- TP_printk("%d:%d %u:%llu", -- MAJOR(__entry->dev), MINOR(__entry->dev), -- __entry->dev_idx, __entry->bucket) -+DEFINE_EVENT(fs_str, io_move, -+ TP_PROTO(struct bch_fs *c, const char *str), -+ TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent, -+DEFINE_EVENT(fs_str, io_move_read, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_read, -+DEFINE_EVENT(fs_str, io_move_write, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_write, -+DEFINE_EVENT(fs_str, io_move_finish, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_finish, -+DEFINE_EVENT(fs_str, io_move_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_fail, -+DEFINE_EVENT(fs_str, io_move_write_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_start_fail, -+DEFINE_EVENT(fs_str, io_move_start_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); -@@ -881,37 +894,6 @@ TRACE_EVENT(move_data, - __entry->sectors_raced) - ); - --TRACE_EVENT(evacuate_bucket, -- TP_PROTO(struct bch_fs *c, struct bpos *bucket, -- unsigned sectors, unsigned bucket_size, -- int ret), -- TP_ARGS(c, bucket, sectors, bucket_size, ret), -- -- TP_STRUCT__entry( -- __field(dev_t, dev ) -- __field(u64, member ) -- __field(u64, bucket ) -- __field(u32, sectors ) -- __field(u32, bucket_size ) -- __field(int, ret ) -- ), -- -- TP_fast_assign( -- __entry->dev = c->dev; -- __entry->member = bucket->inode; -- __entry->bucket = bucket->offset; -- __entry->sectors = sectors; -- __entry->bucket_size = bucket_size; -- __entry->ret = ret; -- ), -- -- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", -- MAJOR(__entry->dev), MINOR(__entry->dev), -- __entry->member, __entry->bucket, -- __entry->sectors, __entry->bucket_size, -- __entry->ret) --); -- - TRACE_EVENT(copygc, - TP_PROTO(struct bch_fs *c, - u64 buckets, -@@ -1145,51 +1127,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_ARGS(trans, caller_ip, path) - ); - --TRACE_EVENT(trans_restart_upgrade, -- TP_PROTO(struct btree_trans *trans, -- unsigned long caller_ip, -- struct btree_path *path, -- unsigned old_locks_want, -- unsigned new_locks_want, -- struct get_locks_fail *f), -- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), -- -- TP_STRUCT__entry( -- __array(char, trans_fn, 32 ) -- __field(unsigned long, caller_ip ) -- __field(u8, btree_id ) -- __field(u8, old_locks_want ) -- __field(u8, new_locks_want ) -- __field(u8, level ) -- __field(u32, path_seq ) -- __field(u32, node_seq ) -- TRACE_BPOS_entries(pos) -- ), -- -- TP_fast_assign( -- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -- __entry->caller_ip = caller_ip; -- __entry->btree_id = path->btree_id; -- __entry->old_locks_want = old_locks_want; -- __entry->new_locks_want = new_locks_want; -- __entry->level = f->l; -- __entry->path_seq = path->l[f->l].lock_seq; -- __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; -- TRACE_BPOS_assign(pos, path->pos) -- ), -- -- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", -- __entry->trans_fn, -- (void *) __entry->caller_ip, -- bch2_btree_id_str(__entry->btree_id), -- __entry->pos_inode, -- __entry->pos_offset, -- __entry->pos_snapshot, -- __entry->old_locks_want, -- __entry->new_locks_want, -- __entry->level, -- __entry->path_seq, -- __entry->node_seq) -+DEFINE_EVENT(fs_str, trans_restart_upgrade, -+ TP_PROTO(struct bch_fs *c, const char *str), -+ TP_ARGS(c, str) - ); - - DEFINE_EVENT(trans_str, trans_restart_relock, -@@ -1491,6 +1431,11 @@ DEFINE_EVENT(fs_str, data_update, - TP_ARGS(c, str) - ); - -+DEFINE_EVENT(fs_str, io_move_created_rebalance, -+ TP_PROTO(struct bch_fs *c, const char *str), -+ TP_ARGS(c, str) -+); -+ - TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -index da2cd11b3025..dc3817f545fa 100644 ---- a/fs/bcachefs/util.c -+++ b/fs/bcachefs/util.c -@@ -252,8 +252,18 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) - bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); - } - --static void __bch2_print_string_as_lines(const char *prefix, const char *lines, -- bool nonblocking) -+static bool string_is_spaces(const char *str) -+{ -+ while (*str) { -+ if (*str != ' ') -+ return false; -+ str++; -+ } -+ return true; -+} -+ -+void bch2_print_string_as_lines(const char *prefix, const char *lines, -+ bool nonblocking) - { - bool locked = false; - const char *p; -@@ -270,8 +280,11 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, - locked = console_trylock(); - } - -- while (1) { -+ while (*lines) { - p = strchrnul(lines, '\n'); -+ if (!*p && string_is_spaces(lines)) -+ break; -+ - printk("%s%.*s\n", prefix, (int) (p - lines), lines); - if (!*p) - break; -@@ -281,16 +294,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, - console_unlock(); - } - --void bch2_print_string_as_lines(const char *prefix, const char *lines) --{ -- return __bch2_print_string_as_lines(prefix, lines, false); --} -- --void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) --{ -- return __bch2_print_string_as_lines(prefix, lines, true); --} -- - int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, - gfp_t gfp) - { -@@ -473,10 +476,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats - u64 last_q = 0; - - prt_printf(out, "quantiles (%s):\t", u->name); -- eytzinger0_for_each(i, NR_QUANTILES) { -- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; -+ eytzinger0_for_each(j, NR_QUANTILES) { -+ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - -- u64 q = max(quantiles->entries[i].m, last_q); -+ u64 q = max(quantiles->entries[j].m, last_q); - prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); -@@ -704,12 +707,43 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) - } - } - -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_corrupt_bio(struct bio *bio) -+{ -+ struct bvec_iter iter; -+ struct bio_vec bv; -+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); -+ -+ bio_for_each_segment(bv, bio, iter) { -+ unsigned u64s = bv.bv_len / sizeof(u64); -+ -+ if (offset < u64s) { -+ u64 *segment = bvec_kmap_local(&bv); -+ segment[offset] = get_random_u64(); -+ kunmap_local(segment); -+ return; -+ } -+ offset -= u64s; -+ } -+} -+#endif -+ -+void bch2_bio_to_text(struct printbuf *out, struct bio *bio) -+{ -+ prt_printf(out, "bi_remaining:\t%u\n", -+ atomic_read(&bio->__bi_remaining)); -+ prt_printf(out, "bi_end_io:\t%ps\n", -+ bio->bi_end_io); -+ prt_printf(out, "bi_status:\t%u\n", -+ bio->bi_status); -+} -+ - #if 0 - void eytzinger1_test(void) - { -- unsigned inorder, eytz, size; -+ unsigned inorder, size; - -- pr_info("1 based eytzinger test:"); -+ pr_info("1 based eytzinger test:\n"); - - for (size = 2; - size < 65536; -@@ -717,13 +751,7 @@ void eytzinger1_test(void) - unsigned extra = eytzinger1_extra(size); - - if (!(size % 4096)) -- pr_info("tree size %u", size); -- -- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); -- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); -- -- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); -- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); -+ pr_info("tree size %u\n", size); - - inorder = 1; - eytzinger1_for_each(eytz, size) { -@@ -734,15 +762,16 @@ void eytzinger1_test(void) - - inorder++; - } -+ BUG_ON(inorder - 1 != size); - } - } - - void eytzinger0_test(void) - { - -- unsigned inorder, eytz, size; -+ unsigned inorder, size; - -- pr_info("0 based eytzinger test:"); -+ pr_info("0 based eytzinger test:\n"); - - for (size = 1; - size < 65536; -@@ -750,13 +779,7 @@ void eytzinger0_test(void) - unsigned extra = eytzinger0_extra(size); - - if (!(size % 4096)) -- pr_info("tree size %u", size); -- -- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); -- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); -- -- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); -- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); -+ pr_info("tree size %u\n", size); - - inorder = 0; - eytzinger0_for_each(eytz, size) { -@@ -767,54 +790,191 @@ void eytzinger0_test(void) - - inorder++; - } -+ BUG_ON(inorder != size); -+ -+ inorder = size - 1; -+ eytzinger0_for_each_prev(eytz, size) { -+ BUG_ON(eytz != eytzinger0_first(size) && -+ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); -+ -+ inorder--; -+ } -+ BUG_ON(inorder != -1); - } - } - --static inline int cmp_u16(const void *_l, const void *_r, size_t size) -+static inline int cmp_u16(const void *_l, const void *_r) - { - const u16 *l = _l, *r = _r; - -- return (*l > *r) - (*r - *l); -+ return (*l > *r) - (*r > *l); - } - --static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) - { -- int i, c1 = -1, c2 = -1; -- ssize_t r; -+ int r, s; -+ bool bad; - - r = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); -- if (r >= 0) -- c1 = test_array[r]; -- -- for (i = 0; i < nr; i++) -- if (test_array[i] <= search && test_array[i] > c2) -- c2 = test_array[i]; -- -- if (c1 != c2) { -- eytzinger0_for_each(i, nr) -- pr_info("[%3u] = %12u", i, test_array[i]); -- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", -- i, r, c1, c2); -+ if (r >= 0) { -+ if (test_array[r] > search) { -+ bad = true; -+ } else { -+ s = eytzinger0_next(r, nr); -+ bad = s >= 0 && test_array[s] <= search; -+ } -+ } else { -+ s = eytzinger0_last(nr); -+ bad = s >= 0 && test_array[s] <= search; -+ } -+ -+ if (bad) { -+ s = -1; -+ eytzinger0_for_each_prev(j, nr) { -+ if (test_array[j] <= search) { -+ s = j; -+ break; -+ } -+ } -+ -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find_le(%12u) = %3i should be %3i\n", -+ search, r, s); -+ BUG(); -+ } -+} -+ -+static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) -+{ -+ int r, s; -+ bool bad; -+ -+ r = eytzinger0_find_gt(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) { -+ if (test_array[r] <= search) { -+ bad = true; -+ } else { -+ s = eytzinger0_prev(r, nr); -+ bad = s >= 0 && test_array[s] > search; -+ } -+ } else { -+ s = eytzinger0_first(nr); -+ bad = s >= 0 && test_array[s] > search; -+ } -+ -+ if (bad) { -+ s = -1; -+ eytzinger0_for_each(j, nr) { -+ if (test_array[j] > search) { -+ s = j; -+ break; -+ } -+ } -+ -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find_gt(%12u) = %3i should be %3i\n", -+ search, r, s); -+ BUG(); - } - } - -+static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) -+{ -+ int r, s; -+ bool bad; -+ -+ r = eytzinger0_find_ge(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) { -+ if (test_array[r] < search) { -+ bad = true; -+ } else { -+ s = eytzinger0_prev(r, nr); -+ bad = s >= 0 && test_array[s] >= search; -+ } -+ } else { -+ s = eytzinger0_first(nr); -+ bad = s >= 0 && test_array[s] >= search; -+ } -+ -+ if (bad) { -+ s = -1; -+ eytzinger0_for_each(j, nr) { -+ if (test_array[j] >= search) { -+ s = j; -+ break; -+ } -+ } -+ -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find_ge(%12u) = %3i should be %3i\n", -+ search, r, s); -+ BUG(); -+ } -+} -+ -+static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) -+{ -+ unsigned r; -+ int s; -+ bool bad; -+ -+ r = eytzinger0_find(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ -+ if (r < nr) { -+ bad = test_array[r] != search; -+ } else { -+ s = eytzinger0_find_le(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ bad = s >= 0 && test_array[s] == search; -+ } -+ -+ if (bad) { -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find(%12u) = %3i is incorrect\n", -+ search, r); -+ BUG(); -+ } -+} -+ -+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+{ -+ eytzinger0_find_test_le(test_array, nr, search); -+ eytzinger0_find_test_gt(test_array, nr, search); -+ eytzinger0_find_test_ge(test_array, nr, search); -+ eytzinger0_find_test_eq(test_array, nr, search); -+} -+ - void eytzinger0_find_test(void) - { - unsigned i, nr, allocated = 1 << 12; - u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); - - for (nr = 1; nr < allocated; nr++) { -- pr_info("testing %u elems", nr); -+ u16 prev = 0; -+ -+ pr_info("testing %u elems\n", nr); - - get_random_bytes(test_array, nr * sizeof(test_array[0])); - eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); - - /* verify array is sorted correctly: */ -- eytzinger0_for_each(i, nr) -- BUG_ON(i != eytzinger0_last(nr) && -- test_array[i] > test_array[eytzinger0_next(i, nr)]); -+ eytzinger0_for_each(j, nr) { -+ BUG_ON(test_array[j] < prev); -+ prev = test_array[j]; -+ } - - for (i = 0; i < U16_MAX; i += 1 << 12) - eytzinger0_find_test_val(test_array, nr, i); -@@ -856,14 +1016,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) - return ret; - } - --void bch2_darray_str_exit(darray_str *d) -+void bch2_darray_str_exit(darray_const_str *d) - { - darray_for_each(*d, i) - kfree(*i); - darray_exit(d); - } - --int bch2_split_devs(const char *_dev_name, darray_str *ret) -+int bch2_split_devs(const char *_dev_name, darray_const_str *ret) - { - darray_init(ret); - -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h -index f4a4783219d9..ed5bee5e63de 100644 ---- a/fs/bcachefs/util.h -+++ b/fs/bcachefs/util.h -@@ -5,23 +5,24 @@ - #include - #include - #include -+#include - #include - #include - #include - #include --#include - #include - #include - #include - #include -+#include - #include -+#include - #include - #include - #include - - #include "mean_and_variance.h" - --#include "darray.h" - #include "time_stats.h" - - struct closure; -@@ -55,15 +56,16 @@ static inline size_t buf_pages(void *p, size_t len) - PAGE_SIZE); - } - --static inline void *bch2_kvmalloc(size_t n, gfp_t flags) -+static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) - { - void *p = unlikely(n >= INT_MAX) -- ? vmalloc(n) -- : kvmalloc(n, flags & ~__GFP_ZERO); -+ ? vmalloc_noprof(n) -+ : kvmalloc_noprof(n, flags & ~__GFP_ZERO); - if (p && (flags & __GFP_ZERO)) - memset(p, 0, n); - return p; - } -+#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) - - #define init_heap(heap, _size, gfp) \ - ({ \ -@@ -94,6 +96,7 @@ do { \ - #define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) - - #define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) -+#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n) - #define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) - - #define prt_newline(_out) bch2_prt_newline(_out) -@@ -210,8 +213,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); - void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); - void bch2_prt_u64_base2(struct printbuf *, u64); - --void bch2_print_string_as_lines(const char *prefix, const char *lines); --void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); -+void bch2_print_string_as_lines(const char *, const char *, bool); - - typedef DARRAY(unsigned long) bch_stacktrace; - int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); -@@ -406,6 +408,20 @@ u64 bch2_get_random_u64_below(u64); - void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); - void memcpy_from_bio(void *, struct bio *, struct bvec_iter); - -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_corrupt_bio(struct bio *); -+ -+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) -+{ -+ if (ratio && !get_random_u32_below(ratio)) -+ bch2_corrupt_bio(bio); -+} -+#else -+#define bch2_maybe_corrupt_bio(...) do {} while (0) -+#endif -+ -+void bch2_bio_to_text(struct printbuf *, struct bio *); -+ - static inline void memcpy_u64s_small(void *dst, const void *src, - unsigned u64s) - { -@@ -419,7 +435,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, - static inline void __memcpy_u64s(void *dst, const void *src, - unsigned u64s) - { --#ifdef CONFIG_X86_64 -+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("rep ; movsq" -@@ -496,7 +512,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, - u64 *dst = (u64 *) _dst + u64s - 1; - u64 *src = (u64 *) _src + u64s - 1; - --#ifdef CONFIG_X86_64 -+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("std ;\n" -@@ -536,30 +552,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) - memset(s + bytes, c, rem); - } - --/* just the memmove, doesn't update @_nr */ --#define __array_insert_item(_array, _nr, _pos) \ -- memmove(&(_array)[(_pos) + 1], \ -- &(_array)[(_pos)], \ -- sizeof((_array)[0]) * ((_nr) - (_pos))) -- --#define array_insert_item(_array, _nr, _pos, _new_item) \ --do { \ -- __array_insert_item(_array, _nr, _pos); \ -- (_nr)++; \ -- (_array)[(_pos)] = (_new_item); \ --} while (0) -- --#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ --do { \ -- (_nr) -= (_nr_to_remove); \ -- memmove(&(_array)[(_pos)], \ -- &(_array)[(_pos) + (_nr_to_remove)], \ -- sizeof((_array)[0]) * ((_nr) - (_pos))); \ --} while (0) -- --#define array_remove_item(_array, _nr, _pos) \ -- array_remove_items(_array, _nr, _pos, 1) -- - static inline void __move_gap(void *array, size_t element_size, - size_t nr, size_t size, - size_t old_gap, size_t new_gap) -@@ -675,8 +667,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) - return l.len == r.len && !memcmp(l.name, r.name, l.len); - } - --void bch2_darray_str_exit(darray_str *); --int bch2_split_devs(const char *, darray_str *); -+void bch2_darray_str_exit(darray_const_str *); -+int bch2_split_devs(const char *, darray_const_str *); - - #ifdef __KERNEL__ - -@@ -726,4 +718,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len) - *--dst = *src++; - } - -+#define set_flags(_map, _in, _out) \ -+do { \ -+ unsigned _i; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & (1 << _i)) \ -+ (_out) |= _map[_i]; \ -+ else \ -+ (_out) &= ~_map[_i]; \ -+} while (0) -+ -+#define map_flags(_map, _in) \ -+({ \ -+ unsigned _out = 0; \ -+ \ -+ set_flags(_map, _in, _out); \ -+ _out; \ -+}) -+ -+#define map_flags_rev(_map, _in) \ -+({ \ -+ unsigned _i, _out = 0; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & _map[_i]) { \ -+ (_out) |= 1 << _i; \ -+ (_in) &= ~_map[_i]; \ -+ } \ -+ (_out); \ -+}) -+ -+#define map_defined(_map) \ -+({ \ -+ unsigned _in = ~0; \ -+ \ -+ map_flags_rev(_map, _in); \ -+}) -+ - #endif /* _BCACHEFS_UTIL_H */ -diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c -index aed7c6984173..627f153798c6 100644 ---- a/fs/bcachefs/xattr.c -+++ b/fs/bcachefs/xattr.c -@@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) - struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); - - return bch2_xattr_hash(info, -- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); -+ &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); - } - - static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -@@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) - - return l.v->x_type != r->type || - l.v->x_name_len != r->name.len || -- memcmp(l.v->x_name, r->name.name, r->name.len); -+ memcmp(l.v->x_name_and_value, r->name.name, r->name.len); - } - - static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -@@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) - - return l.v->x_type != r.v->x_type || - l.v->x_name_len != r.v->x_name_len || -- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); -+ memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); - } - - const struct bch_hash_desc bch2_xattr_hash_desc = { -@@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - c, xattr_invalid_type, - "invalid type (%u)", xattr.v->x_type); - -- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), -+ bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), - c, xattr_name_invalid_chars, - "xattr name has invalid characters"); - fsck_err: -@@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, - unsigned name_len = xattr.v->x_name_len; - unsigned val_len = le16_to_cpu(xattr.v->x_val_len); - unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - -- offsetof(struct bch_xattr, x_name); -+ offsetof(struct bch_xattr, x_name_and_value); - - val_len = min_t(int, val_len, max_name_val_bytes - name_len); - name_len = min(name_len, max_name_val_bytes); - - prt_printf(out, "%.*s:%.*s", -- name_len, xattr.v->x_name, -+ name_len, xattr.v->x_name_and_value, - val_len, (char *) xattr_val(xattr.v)); - - if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || -@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, - int type, int flags) - { - struct bch_fs *c = trans->c; -- struct btree_iter inode_iter = { NULL }; -+ struct btree_iter inode_iter = {}; - int ret; - - ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: -@@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, - if (ret) - return ret; - -+ /* -+ * Besides the ctime update, extents, dirents and xattrs updates require -+ * that an inode update also happens - to ensure that if a key exists in -+ * one of those btrees with a given snapshot ID an inode is also present -+ */ - inode_u->bi_ctime = bch2_current_time(c); - - ret = bch2_inode_write(trans, &inode_iter, inode_u); -@@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, - xattr->v.x_type = type; - xattr->v.x_name_len = namelen; - xattr->v.x_val_len = cpu_to_le16(size); -- memcpy(xattr->v.x_name, name, namelen); -+ memcpy(xattr->v.x_name_and_value, name, namelen); - memcpy(xattr_val(&xattr->v), value, size); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -@@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry, - if (!prefix) - return 0; - -- return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); -+ return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); - } - - static int bch2_xattr_list_bcachefs(struct bch_fs *c, -@@ -473,6 +478,12 @@ static int inode_opt_set_fn(struct btree_trans *trans, - { - struct inode_opt_set *s = p; - -+ if (s->id == Inode_opt_casefold) { -+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v); -+ if (ret) -+ return ret; -+ } -+ - if (s->defined) - bi->bi_fields_set |= 1U << s->id; - else -@@ -523,7 +534,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, - if (ret < 0) - goto err_class_exit; - -- ret = bch2_opt_check_may_set(c, opt_id, v); -+ ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); - if (ret < 0) - goto err_class_exit; - -diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h -index 132fbbd15a66..1139bf345f70 100644 ---- a/fs/bcachefs/xattr.h -+++ b/fs/bcachefs/xattr.h -@@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - - static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) - { -- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + -+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + - name_len + val_len, sizeof(u64)); - } - - #define xattr_val(_xattr) \ -- ((void *) (_xattr)->x_name + (_xattr)->x_name_len) -+ ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) - - struct xattr_search_key { - u8 type; -diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h -index c7916011ef34..4121b78d9a92 100644 ---- a/fs/bcachefs/xattr_format.h -+++ b/fs/bcachefs/xattr_format.h -@@ -13,7 +13,13 @@ struct bch_xattr { - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; -- __u8 x_name[] __counted_by(x_name_len); -+ /* -+ * x_name contains the name and value counted by -+ * x_name_len + x_val_len. The introduction of -+ * __counted_by(x_name_len) previously caused a false positive -+ * detection of an out of bounds write. -+ */ -+ __u8 x_name_and_value[]; - } __packed __aligned(8); - - #endif /* _BCACHEFS_XATTR_FORMAT_H */ -diff --git a/fs/dcache.c b/fs/dcache.c -index e3634916ffb9..db7029e2673f 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -32,6 +32,9 @@ - #include - #include - #include -+#include -+#include -+#include - #include "internal.h" - #include "mount.h" - -@@ -3169,6 +3172,266 @@ ino_t d_parent_ino(struct dentry *dentry) - } - EXPORT_SYMBOL(d_parent_ino); - -+static struct rhashtable no_casefold_dentries; -+ -+enum no_casefold_dentry_ref { -+ ref_casefold_disable, -+ ref_casefold_enable, -+}; -+ -+struct no_casefold_dentry { -+ struct rhash_head hash; -+ struct dentry *dentry; -+ unsigned long ref[2]; -+}; -+ -+static const struct rhashtable_params no_casefold_dentries_params = { -+ .head_offset = offsetof(struct no_casefold_dentry, hash), -+ .key_offset = offsetof(struct no_casefold_dentry, dentry), -+ .key_len = sizeof(struct dentry *), -+ .automatic_shrinking = true, -+}; -+ -+static int no_casefold_dentry_get(struct dentry *dentry, -+ enum no_casefold_dentry_ref ref) -+{ -+ struct no_casefold_dentry *n = -+ rhashtable_lookup_fast(&no_casefold_dentries, -+ &dentry, -+ no_casefold_dentries_params); -+ if (n) { -+ if (n->ref[!ref]) -+ return -EINVAL; -+ -+ n->ref[ref]++; -+ return 0; -+ } -+ -+ n = kzalloc(sizeof(*n), GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ n->dentry = dget(dentry); -+ n->ref[ref]++; -+ -+ int ret = rhashtable_lookup_insert_fast(&no_casefold_dentries, -+ &n->hash, no_casefold_dentries_params); -+ if (WARN_ON(ret)) { -+ kfree(n); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void no_casefold_dentry_put(struct dentry *dentry, -+ enum no_casefold_dentry_ref ref) -+{ -+ struct no_casefold_dentry *n = -+ rhashtable_lookup_fast(&no_casefold_dentries, -+ &dentry, -+ no_casefold_dentries_params); -+ if (WARN_ON(!n)) -+ return; -+ -+ if (--n->ref[ref]) -+ return; -+ -+ dput(n->dentry); -+ int ret = rhashtable_remove_fast(&no_casefold_dentries, -+ &n->hash, no_casefold_dentries_params); -+ WARN_ON(ret); -+} -+ -+/** -+ * d_casefold_disabled_put - drop a "casefold disabled" ref -+ * -+ * Only for overlayfs. -+ */ -+void d_casefold_disabled_put(struct dentry *dentry) -+{ -+ struct super_block *sb = dentry->d_sb; -+ -+ if (!(sb->s_flags & SB_CASEFOLD)) -+ return; -+ -+ guard(mutex)(&sb->s_casefold_enable_lock); -+ no_casefold_dentry_put(dentry, ref_casefold_disable); -+} -+EXPORT_SYMBOL_GPL(d_casefold_disabled_put); -+ -+/** -+ * d_casefold_disabled_get - attempt to disable casefold on a tree -+ * -+ * Only for overlayfs. -+ * -+ * Returns -EINVAL if casefolding is in use on any subdirectory; this must be -+ * tracked by the filesystem. -+ * -+ * On success, returns with a reference held that must be released by -+ * d_casefold_disabled_put(); this ref blocks casefold from being enabled -+ * by d_casefold_enable(). -+ */ -+int d_casefold_disabled_get(struct dentry *dentry) -+{ -+ struct super_block *sb = dentry->d_sb; -+ -+ if (!(sb->s_flags & SB_CASEFOLD)) -+ return 0; -+ -+ guard(mutex)(&sb->s_casefold_enable_lock); -+ -+ if (!(dentry->d_inode->i_flags & S_NO_CASEFOLD)) -+ return -EINVAL; -+ -+ return no_casefold_dentry_get(dentry, ref_casefold_disable); -+} -+EXPORT_SYMBOL_GPL(d_casefold_disabled_get); -+ -+/* Crabwalk: releases @dentry after getting ref on parent */ -+static struct dentry *dget_parent_this_sb(struct dentry *dentry) -+{ -+ struct dentry *parent = dentry != dentry->d_sb->s_root -+ ? dget_parent(dentry) -+ : NULL; -+ dput(dentry); -+ return parent; -+} -+ -+/** -+ * d_casefold_enable - check if casefolding may be enabled on a dentry -+ * -+ * @dentry: dentry to enable casefolding on -+ * @e: state object, released by d_casefold_enable_commit() -+ * @rename: Are we in the rename path? -+ * If so, we expect s_vfs_rename_mutex to be held, if not (called -+ * from setflags), we aquire it if necessary, and release in -+ * commit. -+ * -+ * The rename mutex is required because we're operating on a whole path, -+ * potentially up to the filesystem root, and we need it to be stable until -+ * commit (i.e. we don't want to be renamed into a tree overlayfs is exporting -+ * after we've returned success). -+ * -+ * For rename, this should only be called for cross-directory rename. -+ * S_NO_CASEFOLD doesn't need to change on rename within a directory, and -+ * s_vfs_rename_mutex won't be held on non cross-directory rename. -+ * -+ * Returns -EINVAL if casefolding has been disabled on any parent directory (by -+ * overlayfs). -+ * -+ * On success, the d_casefold_enable object must be committed with -+ * d_casefold_enable_commit(), after the filesystem has updated its internal -+ * state. -+ * -+ * Commit will clear S_NO_CASEFOLD on all inodes up to the filesystem root, -+ * informing overlayfs that this tree has casefolding enabled somewhere in it. -+ */ -+int d_casefold_enable(struct dentry *dentry, struct d_casefold_enable *e, -+ bool rename) -+{ -+ int ret = 0; -+ -+ memset(e, sizeof(*e), 0); -+ e->sb = dentry->d_sb; -+ -+ if (!(e->sb->s_flags & SB_CASEFOLD)) -+ return 0; -+ -+ if (!S_ISDIR(dentry->d_inode->i_mode)) -+ return 0; -+ -+ /* -+ * On rename, we're passed the dentry being renamed (the filesystem is -+ * not passed the dentry of the directory we're renaming to), but it's -+ * the parent that may need to have S_NO_CASEFOLD cleared: -+ */ -+ dentry = rename -+ ? dget_parent(dentry) -+ : dget(dentry); -+ -+ if (!(dentry->d_inode->i_flags & S_NO_CASEFOLD)) { -+ dput(dentry); -+ return 0; -+ } -+ -+ if (rename) { -+ lockdep_assert_held(&e->sb->s_vfs_rename_mutex); -+ } else { -+ mutex_lock(&e->sb->s_vfs_rename_mutex); -+ e->rename_mutex_held = true; -+ } -+ -+ guard(mutex)(&e->sb->s_casefold_enable_lock); -+ -+ for (struct dentry *i = dentry; i; i = dget_parent_this_sb(i)) { -+ if (!(i->d_inode->i_flags & S_NO_CASEFOLD)) { -+ dput(i); -+ break; -+ } -+ -+ ret = darray_push(&e->refs, i); -+ if (ret) { -+ dput(i); -+ goto err; -+ } -+ -+ ret = no_casefold_dentry_get(i, ref_casefold_enable); -+ if (ret) { -+ dput(i); -+ --e->refs.nr; -+ goto err; -+ } -+ } -+ -+ return 0; -+err: -+ darray_for_each(e->refs, i) -+ no_casefold_dentry_put(*i, ref_casefold_enable); -+ darray_exit(&e->refs); -+ -+ if (e->rename_mutex_held) -+ mutex_unlock(&e->sb->s_vfs_rename_mutex); -+ e->rename_mutex_held = false; -+ return ret; -+} -+EXPORT_SYMBOL_GPL(d_casefold_enable); -+ -+/** -+ * d_casefold_enable_commit - finish operation started by d_casefold_enable() -+ * -+ * @e: state object, started by d_casefold_enable_commit() -+ * @ret: Success or failure of the operation, from the filesystem -+ * -+ * On success (@ret == 0), clear S_NO_CASEFOLD on all inodes up to the -+ * filesystem root that have it set, which d_casefold_enable() previously took -+ * references to. -+ */ -+void d_casefold_enable_commit(struct d_casefold_enable *e, int ret) -+{ -+ if (e->refs.nr) { -+ guard(mutex)(&e->sb->s_casefold_enable_lock); -+ -+ darray_for_each(e->refs, i) { -+ if (!ret) { -+ struct inode *inode = (*i)->d_inode; -+ -+ spin_lock(&inode->i_lock); -+ inode->i_flags &= ~S_NO_CASEFOLD; -+ spin_unlock(&inode->i_lock); -+ } -+ -+ no_casefold_dentry_put(*i, ref_casefold_enable); -+ } -+ darray_exit(&e->refs); -+ } -+ -+ if (e->rename_mutex_held) -+ mutex_unlock(&e->sb->s_vfs_rename_mutex); -+ e->rename_mutex_held = false; -+} -+EXPORT_SYMBOL_GPL(d_casefold_enable_commit); -+ - static __initdata unsigned long dhash_entries; - static int __init set_dhash_entries(char *str) - { -@@ -3214,6 +3477,10 @@ static void __init dcache_init(void) - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, - d_shortname.string); - -+ int ret = rhashtable_init(&no_casefold_dentries, &no_casefold_dentries_params); -+ if (ret) -+ panic("error initializing no_casefold_dentries: %s\n", errname(ret)); -+ - /* Hash may have been set up in dcache_init_early */ - if (!hashdist) - return; -diff --git a/fs/libfs.c b/fs/libfs.c -index dc042a975a56..fa73064c311c 100644 ---- a/fs/libfs.c -+++ b/fs/libfs.c -@@ -1952,6 +1952,7 @@ void generic_set_sb_d_ops(struct super_block *sb) - { - #if IS_ENABLED(CONFIG_UNICODE) - if (sb->s_encoding) { -+ sb->s_flags |= SB_CASEFOLD; - sb->s_d_op = &generic_ci_dentry_ops; - return; - } -diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c -index 1115c22deca0..acebffc61c44 100644 ---- a/fs/overlayfs/params.c -+++ b/fs/overlayfs/params.c -@@ -285,7 +285,8 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, - * with overlayfs. Check explicitly to prevent post-mount - * failures. - */ -- if (sb_has_encoding(path->mnt->mnt_sb)) -+ if ((path->mnt->mnt_sb->s_flags & SB_CASEFOLD) && -+ !(path->dentry->d_inode->i_flags & S_NO_CASEFOLD)) - return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name); - - if (ovl_dentry_weird(path->dentry)) -@@ -409,20 +410,32 @@ static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name, - if (!name) - return -ENOMEM; - -+ if (layer != Opt_workdir && -+ layer != Opt_upperdir) { -+ err = d_casefold_disabled_get(layer_path->dentry); -+ if (err) -+ return err; -+ } -+ - upper = is_upper_layer(layer); - err = ovl_mount_dir_check(fc, layer_path, layer, name, upper); - if (err) -- return err; -+ goto err_put; - - if (!upper) { - err = ovl_ctx_realloc_lower(fc); - if (err) -- return err; -+ goto err_put; - } - - /* Store the user provided path string in ctx to show in mountinfo */ - ovl_add_layer(fc, layer, layer_path, &name); - return err; -+err_put: -+ if (layer != Opt_workdir && -+ layer != Opt_upperdir) -+ d_casefold_disabled_put(layer_path->dentry); -+ return err; - } - - static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, -@@ -473,6 +486,7 @@ static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx) - ctx->lowerdir_all = NULL; - - for (size_t nr = 0; nr < ctx->nr; nr++, l++) { -+ d_casefold_disabled_put(l->path.dentry); - path_put(&l->path); - kfree(l->name); - l->name = NULL; -diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c -index 0819c739cc2f..c515f260032c 100644 ---- a/fs/overlayfs/util.c -+++ b/fs/overlayfs/util.c -@@ -205,10 +205,21 @@ bool ovl_dentry_weird(struct dentry *dentry) - if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry)) - return true; - -- return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | -- DCACHE_MANAGE_TRANSIT | -- DCACHE_OP_HASH | -- DCACHE_OP_COMPARE); -+ if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | -+ DCACHE_MANAGE_TRANSIT)) -+ return true; -+ -+ /* -+ * The filesystem might support casefolding, but we've already checked -+ * that casefolding isn't present on this tree: we only need to check -+ * for non-casefolding hash/compare ops -+ */ -+ if (!(dentry->d_sb->s_flags & SB_CASEFOLD) && -+ (dentry->d_flags & (DCACHE_OP_HASH | -+ DCACHE_OP_COMPARE))) -+ return true; -+ -+ return false; - } - - enum ovl_path_type ovl_path_type(struct dentry *dentry) -diff --git a/fs/super.c b/fs/super.c -index 5a7db4a556e3..d84c50c400ec 100644 ---- a/fs/super.c -+++ b/fs/super.c -@@ -368,6 +368,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, - atomic_set(&s->s_active, 1); - mutex_init(&s->s_vfs_rename_mutex); - lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); -+ mutex_init(&s->s_casefold_enable_lock); - init_rwsem(&s->s_dquot.dqio_sem); - s->s_maxbytes = MAX_NON_LFS; - s->s_op = &default_op; -diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c -index 0055066fb1d9..62d04f4843cf 100644 ---- a/fs/xfs/xfs_super.c -+++ b/fs/xfs/xfs_super.c -@@ -2122,7 +2122,8 @@ static struct file_system_type xfs_fs_type = { - .init_fs_context = xfs_init_fs_context, - .parameters = xfs_fs_parameters, - .kill_sb = xfs_kill_sb, -- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, -+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | -+ FS_LBS, - }; - MODULE_ALIAS_FS("xfs"); - -diff --git a/fs/bcachefs/darray.h b/include/linux/darray.h -similarity index 64% -rename from fs/bcachefs/darray.h -rename to include/linux/darray.h -index c6151495985f..7a0c0159b319 100644 ---- a/fs/bcachefs/darray.h -+++ b/include/linux/darray.h -@@ -1,34 +1,26 @@ - /* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _BCACHEFS_DARRAY_H --#define _BCACHEFS_DARRAY_H -+/* -+ * (C) 2022-2024 Kent Overstreet -+ */ -+#ifndef _LINUX_DARRAY_H -+#define _LINUX_DARRAY_H - - /* -- * Dynamic arrays: -+ * Dynamic arrays - * - * Inspired by CCAN's darray - */ - -+#include - #include - --#define DARRAY_PREALLOCATED(_type, _nr) \ --struct { \ -- size_t nr, size; \ -- _type *data; \ -- _type preallocated[_nr]; \ --} -- --#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) -- --typedef DARRAY(char) darray_char; --typedef DARRAY(char *) darray_str; -- --int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); -+int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t); - - #define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) - - #define __darray_resize(_d, _element_size, _new_size, _gfp) \ - (unlikely((_new_size) > (_d)->size) \ -- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ -+ ? __darray_resize_slowpath((_d), (_element_size), (_new_size), (_gfp))\ - : 0) - - #define darray_resize_gfp(_d, _new_size, _gfp) \ -@@ -63,6 +55,28 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); - #define darray_first(_d) ((_d).data[0]) - #define darray_last(_d) ((_d).data[(_d).nr - 1]) - -+/* Insert/remove items into the middle of a darray: */ -+ -+#define array_insert_item(_array, _nr, _pos, _new_item) \ -+do { \ -+ memmove(&(_array)[(_pos) + 1], \ -+ &(_array)[(_pos)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))); \ -+ (_nr)++; \ -+ (_array)[(_pos)] = (_new_item); \ -+} while (0) -+ -+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -+do { \ -+ (_nr) -= (_nr_to_remove); \ -+ memmove(&(_array)[(_pos)], \ -+ &(_array)[(_pos) + (_nr_to_remove)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))); \ -+} while (0) -+ -+#define array_remove_item(_array, _nr, _pos) \ -+ array_remove_items(_array, _nr, _pos, 1) -+ - #define darray_insert_item(_d, pos, _item) \ - ({ \ - size_t _pos = (pos); \ -@@ -73,10 +87,15 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); - _ret; \ - }) - -+#define darray_remove_items(_d, _pos, _nr_to_remove) \ -+ array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove) -+ - #define darray_remove_item(_d, _pos) \ -- array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) -+ darray_remove_items(_d, _pos, 1) -+ -+/* Iteration: */ - --#define __darray_for_each(_d, _i) \ -+#define __darray_for_each(_d, _i) \ - for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) - - #define darray_for_each(_d, _i) \ -@@ -100,4 +119,4 @@ do { \ - darray_init(_d); \ - } while (0) - --#endif /* _BCACHEFS_DARRAY_H */ -+#endif /* _LINUX_DARRAY_H */ -diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h -new file mode 100644 -index 000000000000..c55484487905 ---- /dev/null -+++ b/include/linux/darray_types.h -@@ -0,0 +1,33 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * (C) 2022-2024 Kent Overstreet -+ */ -+#ifndef _LINUX_DARRAY_TYPES_H -+#define _LINUX_DARRAY_TYPES_H -+ -+#include -+ -+#define DARRAY_PREALLOCATED(_type, _nr) \ -+struct { \ -+ size_t nr, size; \ -+ _type *data; \ -+ _type preallocated[_nr]; \ -+} -+ -+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) -+ -+typedef DARRAY(char) darray_char; -+typedef DARRAY(char *) darray_str; -+typedef DARRAY(const char *) darray_const_str; -+ -+typedef DARRAY(u8) darray_u8; -+typedef DARRAY(u16) darray_u16; -+typedef DARRAY(u32) darray_u32; -+typedef DARRAY(u64) darray_u64; -+ -+typedef DARRAY(s8) darray_s8; -+typedef DARRAY(s16) darray_s16; -+typedef DARRAY(s32) darray_s32; -+typedef DARRAY(s64) darray_s64; -+ -+#endif /* _LINUX_DARRAY_TYPES_H */ -diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index 4afb60365675..be0f42911a02 100644 ---- a/include/linux/dcache.h -+++ b/include/linux/dcache.h -@@ -3,6 +3,7 @@ - #define __LINUX_DCACHE_H - - #include -+#include - #include - #include - #include -@@ -616,4 +617,15 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry) - return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); - } - -+void d_casefold_disabled_put(struct dentry *dentry); -+int d_casefold_disabled_get(struct dentry *dentry); -+ -+struct d_casefold_enable { -+ DARRAY(struct dentry *) refs; -+ struct super_block *sb; -+ bool rename_mutex_held; -+}; -+int d_casefold_enable(struct dentry *dentry, struct d_casefold_enable *e, bool); -+void d_casefold_enable_commit(struct d_casefold_enable *e, int ret); -+ - #endif /* __LINUX_DCACHE_H */ -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 2788df98080f..63c9a1a8a24b 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -47,6 +47,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1238,6 +1239,7 @@ extern int send_sigurg(struct file *file); - #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ - #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ - #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ -+#define SB_CASEFOLD BIT(8) /* Superblock supports casefolding */ - #define SB_NOATIME BIT(10) /* Do not update access times. */ - #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ - #define SB_SILENT BIT(15) -@@ -1398,6 +1400,7 @@ struct super_block { - * even looking at it. You had been warned. - */ - struct mutex s_vfs_rename_mutex; /* Kludge */ -+ struct mutex s_casefold_enable_lock; - - /* - * Filesystem subtype. If non-empty the filesystem type field -@@ -1441,6 +1444,7 @@ struct super_block { - - struct mutex s_sync_lock; /* sync serialisation lock */ - -+ - /* - * Indicates how deep in a filesystem stack this SB is - */ -@@ -2346,6 +2350,7 @@ struct super_operations { - #define S_CASEFOLD (1 << 15) /* Casefolded file */ - #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ - #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ -+#define S_NO_CASEFOLD (1 << 18) /* Directory, and all descendents, are not casefolded */ - - /* - * Note that nosuid etc flags are inode-specific: setting some file-system -@@ -2616,6 +2621,7 @@ struct file_system_type { - #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ - #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ - #define FS_MGTIME 64 /* FS uses multigrain timestamps */ -+#define FS_LBS 128 /* FS supports LBS */ - #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ - int (*init_fs_context)(struct fs_context *); - const struct fs_parameter_spec *parameters; -diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h -index fe41da005970..1cba369e1821 100644 ---- a/include/linux/seq_buf.h -+++ b/include/linux/seq_buf.h -@@ -173,4 +173,8 @@ seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); - - void seq_buf_do_printk(struct seq_buf *s, const char *lvl); - -+enum string_size_units; -+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, -+ const enum string_size_units units); -+ - #endif /* _LINUX_SEQ_BUF_H */ -diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h -index 1a00be90d93a..106622ddac77 100644 ---- a/include/linux/shrinker.h -+++ b/include/linux/shrinker.h -@@ -24,6 +24,8 @@ struct shrinker_info { - struct shrinker_info_unit *unit[]; - }; - -+struct seq_buf; -+ - /* - * This struct is used to pass information from page reclaim to the shrinkers. - * We consolidate the values for easier extension later. -@@ -80,10 +82,12 @@ struct shrink_control { - * @flags determine the shrinker abilities, like numa awareness - */ - struct shrinker { -+ const char *name; - unsigned long (*count_objects)(struct shrinker *, - struct shrink_control *sc); - unsigned long (*scan_objects)(struct shrinker *, - struct shrink_control *sc); -+ void (*to_text)(struct seq_buf *, struct shrinker *); - - long batch; /* reclaim batch size, 0 = default */ - int seeks; /* seeks to recreate an obj */ -@@ -110,11 +114,16 @@ struct shrinker { - #endif - #ifdef CONFIG_SHRINKER_DEBUG - int debugfs_id; -- const char *name; - struct dentry *debugfs_entry; - #endif - /* objs pending delete, per node */ - atomic_long_t *nr_deferred; -+ -+ atomic_long_t objects_requested_to_free; -+ atomic_long_t objects_freed; -+ unsigned long last_freed; /* timestamp, in jiffies */ -+ unsigned long last_scanned; /* timestamp, in jiffies */ -+ atomic64_t ns_run; - }; - #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ - -@@ -135,6 +144,8 @@ __printf(2, 3) - struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); - void shrinker_register(struct shrinker *shrinker); - void shrinker_free(struct shrinker *shrinker); -+void shrinker_to_text(struct seq_buf *, struct shrinker *); -+void shrinkers_to_text(struct seq_buf *); - - static inline bool shrinker_try_get(struct shrinker *shrinker) - { -diff --git a/include/linux/sort.h b/include/linux/sort.h -index e163287ac6c1..8e5603b10941 100644 ---- a/include/linux/sort.h -+++ b/include/linux/sort.h -@@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func); - -+/* Versions that periodically call cond_resched(): */ -+ -+void sort_r_nonatomic(void *base, size_t num, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv); -+ -+void sort_nonatomic(void *base, size_t num, size_t size, -+ cmp_func_t cmp_func, -+ swap_func_t swap_func); -+ - #endif -diff --git a/lib/Makefile b/lib/Makefile -index 4f3d00a2fd65..ccc50cdc4926 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -56,7 +56,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ - bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o base64.o \ - once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ -- generic-radix-tree.o bitmap-str.o -+ generic-radix-tree.o bitmap-str.o darray.o - obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o - obj-y += string_helpers.o - obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o -diff --git a/fs/bcachefs/darray.c b/lib/darray.c -similarity index 75% -rename from fs/bcachefs/darray.c -rename to lib/darray.c -index e86d36d23e9e..1d3820a43e14 100644 ---- a/fs/bcachefs/darray.c -+++ b/lib/darray.c -@@ -1,11 +1,15 @@ - // SPDX-License-Identifier: GPL-2.0 -+/* -+ * (C) 2022-2024 Kent Overstreet -+ */ - -+#include -+#include - #include - #include - #include --#include "darray.h" - --int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) -+int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) - { - if (new_size > d->size) { - new_size = roundup_pow_of_two(new_size); -@@ -36,3 +40,4 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ - - return 0; - } -+EXPORT_SYMBOL_GPL(__darray_resize_slowpath); -diff --git a/lib/seq_buf.c b/lib/seq_buf.c -index f3f3436d60a9..3c41ca83a0c3 100644 ---- a/lib/seq_buf.c -+++ b/lib/seq_buf.c -@@ -436,3 +436,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, - } - return 0; - } -+ -+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, const enum string_size_units units) -+{ -+ char *buf; -+ size_t size = seq_buf_get_buf(s, &buf); -+ int wrote = string_get_size(v, 1, units, buf, size); -+ -+ seq_buf_commit(s, wrote); -+} -+EXPORT_SYMBOL(seq_buf_human_readable_u64); -diff --git a/lib/sort.c b/lib/sort.c -index 8e73dc55476b..52363995ccc5 100644 ---- a/lib/sort.c -+++ b/lib/sort.c -@@ -186,36 +186,13 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) - return i / 2; - } - --/** -- * sort_r - sort an array of elements -- * @base: pointer to data to sort -- * @num: number of elements -- * @size: size of each element -- * @cmp_func: pointer to comparison function -- * @swap_func: pointer to swap function or NULL -- * @priv: third argument passed to comparison function -- * -- * This function does a heapsort on the given array. You may provide -- * a swap_func function if you need to do something more than a memory -- * copy (e.g. fix up pointers or auxiliary data), but the built-in swap -- * avoids a slow retpoline and so is significantly faster. -- * -- * The comparison function must adhere to specific mathematical -- * properties to ensure correct and stable sorting: -- * - Antisymmetry: cmp_func(a, b) must return the opposite sign of -- * cmp_func(b, a). -- * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then -- * cmp_func(a, c) <= 0. -- * -- * Sorting time is O(n log n) both on average and worst-case. While -- * quicksort is slightly faster on average, it suffers from exploitable -- * O(n*n) worst-case behavior and extra memory requirements that make -- * it less suitable for kernel use. -- */ --void sort_r(void *base, size_t num, size_t size, -- cmp_r_func_t cmp_func, -- swap_r_func_t swap_func, -- const void *priv) -+#include -+ -+static void __sort_r(void *base, size_t num, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv, -+ bool may_schedule) - { - /* pre-scale counters for performance */ - size_t n = num * size, a = (num/2) * size; -@@ -286,6 +263,9 @@ void sort_r(void *base, size_t num, size_t size, - b = parent(b, lsbit, size); - do_swap(base + b, base + c, size, swap_func, priv); - } -+ -+ if (may_schedule) -+ cond_resched(); - } - - n -= size; -@@ -293,8 +273,63 @@ void sort_r(void *base, size_t num, size_t size, - if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) - do_swap(base, base + size, size, swap_func, priv); - } -+ -+/** -+ * sort_r - sort an array of elements -+ * @base: pointer to data to sort -+ * @num: number of elements -+ * @size: size of each element -+ * @cmp_func: pointer to comparison function -+ * @swap_func: pointer to swap function or NULL -+ * @priv: third argument passed to comparison function -+ * -+ * This function does a heapsort on the given array. You may provide -+ * a swap_func function if you need to do something more than a memory -+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap -+ * avoids a slow retpoline and so is significantly faster. -+ * -+ * The comparison function must adhere to specific mathematical -+ * properties to ensure correct and stable sorting: -+ * - Antisymmetry: cmp_func(a, b) must return the opposite sign of -+ * cmp_func(b, a). -+ * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then -+ * cmp_func(a, c) <= 0. -+ * -+ * Sorting time is O(n log n) both on average and worst-case. While -+ * quicksort is slightly faster on average, it suffers from exploitable -+ * O(n*n) worst-case behavior and extra memory requirements that make -+ * it less suitable for kernel use. -+ */ -+void sort_r(void *base, size_t num, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv) -+{ -+ __sort_r(base, num, size, cmp_func, swap_func, priv, false); -+} - EXPORT_SYMBOL(sort_r); - -+/** -+ * sort_r_nonatomic - sort an array of elements, with cond_resched -+ * @base: pointer to data to sort -+ * @num: number of elements -+ * @size: size of each element -+ * @cmp_func: pointer to comparison function -+ * @swap_func: pointer to swap function or NULL -+ * @priv: third argument passed to comparison function -+ * -+ * Same as sort_r, but preferred for larger arrays as it does a periodic -+ * cond_resched(). -+ */ -+void sort_r_nonatomic(void *base, size_t num, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv) -+{ -+ __sort_r(base, num, size, cmp_func, swap_func, priv, true); -+} -+EXPORT_SYMBOL(sort_r_nonatomic); -+ - void sort(void *base, size_t num, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -@@ -304,6 +339,19 @@ void sort(void *base, size_t num, size_t size, - .swap = swap_func, - }; - -- return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -+ return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false); - } - EXPORT_SYMBOL(sort); -+ -+void sort_nonatomic(void *base, size_t num, size_t size, -+ cmp_func_t cmp_func, -+ swap_func_t swap_func) -+{ -+ struct wrapper w = { -+ .cmp = cmp_func, -+ .swap = swap_func, -+ }; -+ -+ return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true); -+} -+EXPORT_SYMBOL(sort_nonatomic); -diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 1cf121ad7085..43914d472059 100644 ---- a/mm/oom_kill.c -+++ b/mm/oom_kill.c -@@ -169,27 +169,6 @@ static bool oom_unkillable_task(struct task_struct *p) - return false; - } - --/* -- * Check whether unreclaimable slab amount is greater than -- * all user memory(LRU pages). -- * dump_unreclaimable_slab() could help in the case that -- * oom due to too much unreclaimable slab used by kernel. --*/ --static bool should_dump_unreclaim_slab(void) --{ -- unsigned long nr_lru; -- -- nr_lru = global_node_page_state(NR_ACTIVE_ANON) + -- global_node_page_state(NR_INACTIVE_ANON) + -- global_node_page_state(NR_ACTIVE_FILE) + -- global_node_page_state(NR_INACTIVE_FILE) + -- global_node_page_state(NR_ISOLATED_ANON) + -- global_node_page_state(NR_ISOLATED_FILE) + -- global_node_page_state(NR_UNEVICTABLE); -- -- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); --} -- - /** - * oom_badness - heuristic function to determine which candidate task to kill - * @p: task struct of which task we should calculate -@@ -469,8 +448,6 @@ static void dump_header(struct oom_control *oc) - mem_cgroup_print_oom_meminfo(oc->memcg); - else { - __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); -- if (should_dump_unreclaim_slab()) -- dump_unreclaimable_slab(); - } - if (sysctl_oom_dump_tasks) - dump_tasks(oc); -diff --git a/mm/show_mem.c b/mm/show_mem.c -index 43afb56abbd3..982a64a86880 100644 ---- a/mm/show_mem.c -+++ b/mm/show_mem.c -@@ -7,15 +7,18 @@ - - #include - #include -+#include - #include - #include - #include - #include - #include -+#include - #include - #include - - #include "internal.h" -+#include "slab.h" - #include "swap.h" - - atomic_long_t _totalram_pages __read_mostly; -@@ -396,10 +399,31 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z - show_swap_cache_info(); - } - -+static void print_string_as_lines(const char *prefix, const char *lines) -+{ -+ if (!lines) { -+ printk("%s (null)\n", prefix); -+ return; -+ } -+ -+ bool locked = console_trylock(); -+ -+ while (1) { -+ const char *p = strchrnul(lines, '\n'); -+ printk("%s%.*s\n", prefix, (int) (p - lines), lines); -+ if (!*p) -+ break; -+ lines = p + 1; -+ } -+ if (locked) -+ console_unlock(); -+} -+ - void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) - { - unsigned long total = 0, reserved = 0, highmem = 0; - struct zone *zone; -+ char *buf; - - printk("Mem-Info:\n"); - show_free_areas(filter, nodemask, max_zone_idx); -@@ -451,4 +475,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) - } - } - #endif -+ -+ const unsigned buf_size = 8192; -+ buf = kmalloc(buf_size, GFP_ATOMIC); -+ if (buf) { -+ struct seq_buf s; -+ -+ printk("Unreclaimable slab info:\n"); -+ seq_buf_init(&s, buf, buf_size); -+ dump_unreclaimable_slab(&s); -+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s)); -+ -+ static unsigned long shrinkers_last_print; -+ -+ /* Ratelimit to at most once every 30 seconds */ -+ if (!shrinkers_last_print || -+ time_after(jiffies, shrinkers_last_print + HZ * 30)) { -+ shrinkers_last_print = jiffies; -+ -+ printk("Shrinkers:\n"); -+ seq_buf_init(&s, buf, buf_size); -+ shrinkers_to_text(&s); -+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s)); -+ } -+ -+ kfree(buf); -+ } - } -diff --git a/mm/shrinker.c b/mm/shrinker.c -index 4a93fd433689..c56c1f824f79 100644 ---- a/mm/shrinker.c -+++ b/mm/shrinker.c -@@ -1,8 +1,9 @@ - // SPDX-License-Identifier: GPL-2.0 - #include -+#include - #include -+#include - #include --#include - #include - - #include "internal.h" -@@ -411,6 +412,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - - trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); -+ u64 start_time = ktime_get_ns(); - - /* - * Normally, we should not scan less than batch_size objects in one -@@ -461,6 +463,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - */ - new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - -+ unsigned long now = jiffies; -+ if (freed) { -+ atomic_long_add(freed, &shrinker->objects_freed); -+ shrinker->last_freed = now; -+ } -+ shrinker->last_scanned = now; -+ atomic_long_add(scanned, &shrinker->objects_requested_to_free); -+ -+ atomic64_add(ktime_get_ns() - start_time, &shrinker->ns_run); -+ -+ - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); - return freed; - } -@@ -809,3 +822,83 @@ void shrinker_free(struct shrinker *shrinker) - call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); - } - EXPORT_SYMBOL_GPL(shrinker_free); -+ -+void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker) -+{ -+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; -+ unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed); -+ -+ seq_buf_puts(out, shrinker->name); -+ seq_buf_putc(out, '\n'); -+ -+ seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc)); -+ seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free)); -+ seq_buf_printf(out, "objects freed: %lu\n", nr_freed); -+ seq_buf_printf(out, "last scanned: %li sec ago\n", (jiffies - shrinker->last_scanned) / HZ); -+ seq_buf_printf(out, "last freed: %li sec ago\n", (jiffies - shrinker->last_freed) / HZ); -+ seq_buf_printf(out, "ns per object freed: %llu\n", nr_freed -+ ? div64_ul(atomic64_read(&shrinker->ns_run), nr_freed) -+ : 0); -+ -+ if (shrinker->to_text) { -+ shrinker->to_text(out, shrinker); -+ seq_buf_puts(out, "\n"); -+ } -+} -+ -+/** -+ * shrinkers_to_text - Report on shrinkers with highest usage -+ * -+ * This reports on the top 10 shrinkers, by object counts, in sorted order: -+ * intended to be used for OOM reporting. -+ */ -+void shrinkers_to_text(struct seq_buf *out) -+{ -+ struct shrinker *shrinker; -+ struct shrinker_by_mem { -+ struct shrinker *shrinker; -+ unsigned long mem; -+ } shrinkers_by_mem[4]; -+ int i, nr = 0; -+ -+ if (!mutex_trylock(&shrinker_mutex)) { -+ seq_buf_puts(out, "(couldn't take shrinker lock)"); -+ return; -+ } -+ -+ list_for_each_entry(shrinker, &shrinker_list, list) { -+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; -+ unsigned long mem = shrinker->count_objects(shrinker, &sc); -+ -+ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) -+ continue; -+ -+ for (i = 0; i < nr; i++) -+ if (mem < shrinkers_by_mem[i].mem) -+ break; -+ -+ if (nr < ARRAY_SIZE(shrinkers_by_mem)) { -+ memmove(&shrinkers_by_mem[i + 1], -+ &shrinkers_by_mem[i], -+ sizeof(shrinkers_by_mem[0]) * (nr - i)); -+ nr++; -+ } else if (i) { -+ i--; -+ memmove(&shrinkers_by_mem[0], -+ &shrinkers_by_mem[1], -+ sizeof(shrinkers_by_mem[0]) * i); -+ } else { -+ continue; -+ } -+ -+ shrinkers_by_mem[i] = (struct shrinker_by_mem) { -+ .shrinker = shrinker, -+ .mem = mem, -+ }; -+ } -+ -+ for (i = nr - 1; i >= 0; --i) -+ shrinker_to_text(out, shrinkers_by_mem[i].shrinker); -+ -+ mutex_unlock(&shrinker_mutex); -+} -diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c -index 794bd433cce0..c3f2d22ae0d9 100644 ---- a/mm/shrinker_debug.c -+++ b/mm/shrinker_debug.c -@@ -2,6 +2,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -159,6 +160,21 @@ static const struct file_operations shrinker_debugfs_scan_fops = { - .write = shrinker_debugfs_scan_write, - }; - -+static int shrinker_debugfs_report_show(struct seq_file *m, void *v) -+{ -+ struct shrinker *shrinker = m->private; -+ char *bufp; -+ size_t buflen = seq_get_buf(m, &bufp); -+ struct seq_buf out; -+ -+ seq_buf_init(&out, bufp, buflen); -+ shrinker_to_text(&out, shrinker); -+ seq_commit(m, seq_buf_used(&out)); -+ -+ return 0; -+} -+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_report); -+ - int shrinker_debugfs_add(struct shrinker *shrinker) - { - struct dentry *entry; -@@ -190,6 +206,8 @@ int shrinker_debugfs_add(struct shrinker *shrinker) - &shrinker_debugfs_count_fops); - debugfs_create_file("scan", 0220, entry, shrinker, - &shrinker_debugfs_scan_fops); -+ debugfs_create_file("report", 0440, entry, shrinker, -+ &shrinker_debugfs_report_fops); - return 0; - } - -diff --git a/mm/slab.h b/mm/slab.h -index e9fd9bf0bfa6..1baf8771089b 100644 ---- a/mm/slab.h -+++ b/mm/slab.h -@@ -631,10 +631,12 @@ static inline size_t slab_ksize(const struct kmem_cache *s) - return s->size; - } - -+struct seq_buf; -+ - #ifdef CONFIG_SLUB_DEBUG --void dump_unreclaimable_slab(void); -+void dump_unreclaimable_slab(struct seq_buf *); - #else --static inline void dump_unreclaimable_slab(void) -+static inline void dump_unreclaimable_slab(struct seq_buf *out) - { - } - #endif -diff --git a/mm/slab_common.c b/mm/slab_common.c -index 4c9f0a87f733..1f24fcc2bc7f 100644 ---- a/mm/slab_common.c -+++ b/mm/slab_common.c -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -1134,10 +1135,15 @@ static int slab_show(struct seq_file *m, void *p) - return 0; - } - --void dump_unreclaimable_slab(void) -+void dump_unreclaimable_slab(struct seq_buf *out) - { - struct kmem_cache *s; - struct slabinfo sinfo; -+ struct slab_by_mem { -+ struct kmem_cache *s; -+ size_t total, active; -+ } slabs_by_mem[10], n; -+ int i, nr = 0; - - /* - * Here acquiring slab_mutex is risky since we don't prefer to get -@@ -1147,24 +1153,52 @@ void dump_unreclaimable_slab(void) - * without acquiring the mutex. - */ - if (!mutex_trylock(&slab_mutex)) { -- pr_warn("excessive unreclaimable slab but cannot dump stats\n"); -+ seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n"); - return; - } - -- pr_info("Unreclaimable slab info:\n"); -- pr_info("Name Used Total\n"); -- - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & SLAB_RECLAIM_ACCOUNT) - continue; - - get_slabinfo(s, &sinfo); - -- if (sinfo.num_objs > 0) -- pr_info("%-17s %10luKB %10luKB\n", s->name, -- (sinfo.active_objs * s->size) / 1024, -- (sinfo.num_objs * s->size) / 1024); -+ if (!sinfo.num_objs) -+ continue; -+ -+ n.s = s; -+ n.total = sinfo.num_objs * s->size; -+ n.active = sinfo.active_objs * s->size; -+ -+ for (i = 0; i < nr; i++) -+ if (n.total < slabs_by_mem[i].total) -+ break; -+ -+ if (nr < ARRAY_SIZE(slabs_by_mem)) { -+ memmove(&slabs_by_mem[i + 1], -+ &slabs_by_mem[i], -+ sizeof(slabs_by_mem[0]) * (nr - i)); -+ nr++; -+ } else if (i) { -+ i--; -+ memmove(&slabs_by_mem[0], -+ &slabs_by_mem[1], -+ sizeof(slabs_by_mem[0]) * i); -+ } else { -+ continue; -+ } -+ -+ slabs_by_mem[i] = n; - } -+ -+ for (i = nr - 1; i >= 0; --i) { -+ seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); -+ seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2); -+ seq_buf_printf(out, " active: "); -+ seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2); -+ seq_buf_putc(out, '\n'); -+ } -+ - mutex_unlock(&slab_mutex); - } - --- -2.49.0 - diff --git a/sys-kernel/hardened-kernel/files/linux-6.14/1199_openpax-cherry-pick-updates-fb1be96.patch b/sys-kernel/hardened-kernel/files/linux-6.14/1199_openpax-cherry-pick-updates-fb1be96.patch deleted file mode 100644 index 6d2e2ec..0000000 --- a/sys-kernel/hardened-kernel/files/linux-6.14/1199_openpax-cherry-pick-updates-fb1be96.patch +++ /dev/null @@ -1,720 +0,0 @@ -From a80207aef480f66179564003807d7a4ecf5aef8e Mon Sep 17 00:00:00 2001 -From: Alexander Miroshnichenko -Date: Wed, 14 May 2025 19:33:06 +0300 -Subject: [PATCH] openpax: cherry-pick updates from master fb1be96e0a3e -Content-Type: text/plain; charset="utf-8" -Content-Transfer-Encoding: 8bit - -Signed-off-by: Alexander Miroshnichenko ---- - .../admin-guide/kernel-parameters.txt | 3 + - arch/x86/mm/fault.c | 218 ++++++++++++++++++ - fs/binfmt_elf.c | 88 ++++++- - fs/proc/array.c | 15 ++ - fs/xattr.c | 16 ++ - include/linux/init.h | 1 + - include/linux/mm_types.h | 11 + - include/linux/mman.h | 11 +- - include/linux/xattr.h | 4 + - include/uapi/linux/xattr.h | 5 + - init/main.c | 11 + - kernel/sysctl.c | 15 ++ - security/Kconfig | 1 + - security/Kconfig.openpax | 89 +++++++ - 14 files changed, 485 insertions(+), 3 deletions(-) - create mode 100644 security/Kconfig.openpax - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index bd53e2675c75..d46f21aa6a26 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4579,6 +4579,9 @@ - from the first 4GB of memory as the bootmem allocator - passes the memory pages to the buddy allocator. - -+ pax_softmode= -+ Enables OpenPaX soft mode if set to a non-zero value. -+ - pcbit= [HW,ISDN] - - pci=option[,option...] [PCI,EARLY] various PCI subsystem options. -diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c -index 296d294142c8..65665982e401 100644 ---- a/arch/x86/mm/fault.c -+++ b/arch/x86/mm/fault.c -@@ -1198,6 +1198,217 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, - } - NOKPROBE_SYMBOL(do_kern_addr_fault); - -+#ifdef CONFIG_OPENPAX_EMUTRAMP -+/* -+ * Determine if a fault is possibly caused by an emulatable stack or -+ * heap trampoline. We return false if trampoline emulation is not -+ * enabled. -+ */ -+static inline -+bool openpax_fault_is_trampoline(unsigned long error_code, -+ struct pt_regs *regs, -+ unsigned long address) -+{ -+ struct mm_struct *mm = current->mm; -+ unsigned long ip = regs->ip; -+ -+ if (!test_bit(PAXF_EMUTRAMP, &mm->pax_flags)) -+ return false; -+ -+ if (v8086_mode(regs)) -+ ip = ((regs->cs & 0xffff) << 4) + (ip & 0xffff); -+ -+ if (test_bit(PAXF_PAGEEXEC, &mm->pax_flags)) { -+ if ((__supported_pte_mask & _PAGE_NX) && (error_code & X86_PF_INSTR)) -+ return true; -+ if (!(error_code & (X86_PF_PROT | X86_PF_WRITE)) && ip == address) -+ return true; -+ return false; -+ } -+ -+ return false; -+} -+NOKPROBE_SYMBOL(openpax_fault_is_trampoline); -+ -+static inline -+bool openpax_emulate_trampoline_32(struct pt_regs *regs) -+{ -+ int err; -+ -+ /* libffi trampoline type 1, gcc trampoline type 2 */ -+ do { -+ unsigned char mov, jmp; -+ unsigned int addr1, addr2; -+ -+#ifdef CONFIG_X86_64 -+ if ((regs->ip + 9) >> 32) -+ break; -+#endif -+ -+ err = get_user(mov, (unsigned char __user *) regs->ip); -+ err |= get_user(addr1, (unsigned int __user *) (regs->ip + 1)); -+ err |= get_user(jmp, (unsigned char __user *) (regs->ip + 5)); -+ err |= get_user(addr2, (unsigned int __user *) (regs->ip + 6)); -+ -+ if (err) -+ break; -+ -+ if ((mov == 0xB8 || mov == 0xB9) && jmp == 0xE9) { -+ if (mov == 0xB8) -+ regs->ax = addr1; -+ else -+ regs->cx = addr1; -+ -+ regs->ip = (unsigned int)(regs->ip + addr2 + 10); -+ return true; -+ } -+ } while (0); -+ -+ /* older gcc trampoline type... */ -+ do { -+ unsigned char mov1, mov2; -+ unsigned short jmp; -+ unsigned int addr1, addr2; -+ -+#ifdef CONFIG_X86_64 -+ if ((regs->ip + 11) >> 32) -+ break; -+#endif -+ -+ err = get_user(mov1, (unsigned char __user *) regs->ip); -+ err |= get_user(addr1, (unsigned int __user *) (regs->ip + 1)); -+ err |= get_user(mov2, (unsigned char __user *) (regs->ip + 5)); -+ err |= get_user(addr2, (unsigned int __user *) (regs->ip + 6)); -+ err |= get_user(jmp, (unsigned short __user *) (regs->ip + 10)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xB9 && mov2 == 0xB8 && jmp == 0xE0FF) { -+ regs->cx = addr1; -+ regs->ax = addr2; -+ regs->ip = addr2; -+ return true; -+ } -+ } while (0); -+ -+ return false; -+} -+NOKPROBE_SYMBOL(openpax_emulate_trampoline_32); -+ -+#ifdef CONFIG_X86_64 -+static inline -+bool openpax_emulate_trampoline_64(struct pt_regs *regs) -+{ -+ int err; -+ -+ /* libffi trampoline type 1 */ -+ do { -+ unsigned short mov1, mov2, jmp1; -+ unsigned char stcclc, jmp2; -+ unsigned long addr1, addr2; -+ -+ err = get_user(mov1, (unsigned short __user *) regs->ip); -+ err |= get_user(addr1, (unsigned long __user *) (regs->ip + 2)); -+ err |= get_user(mov2, (unsigned short __user *) (regs->ip + 10)); -+ err |= get_user(addr2, (unsigned long __user *) (regs->ip + 12)); -+ err |= get_user(stcclc, (unsigned char __user *) (regs->ip + 20)); -+ err |= get_user(jmp1, (unsigned short __user *) (regs->ip + 21)); -+ err |= get_user(jmp2, (unsigned char __user *) (regs->ip + 23)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xBB49 && mov2 == 0xBA49 && (stcclc == 0xF8 || stcclc == 0xF9) && jmp1 == 0xFF49 && jmp2 == 0xE3) { -+ regs->r11 = addr1; -+ regs->r10 = addr2; -+ -+ if (stcclc == 0xF8) -+ regs->flags &= ~X86_EFLAGS_CF; -+ else -+ regs->flags |= X86_EFLAGS_CF; -+ -+ regs->ip = addr1; -+ return true; -+ } -+ } while (0); -+ -+ /* gcc trampoline type 1 */ -+ do { -+ unsigned short mov1, mov2, jmp1; -+ unsigned char jmp2; -+ unsigned int addr1; -+ unsigned long addr2; -+ -+ err = get_user(mov1, (unsigned short __user *) regs->ip); -+ err |= get_user(addr1, (unsigned int __user *) (regs->ip + 2)); -+ err |= get_user(mov2, (unsigned short __user *) (regs->ip + 6)); -+ err |= get_user(addr2, (unsigned long __user *) (regs->ip + 8)); -+ err |= get_user(jmp1, (unsigned short __user *) (regs->ip + 16)); -+ err |= get_user(jmp2, (unsigned char __user *) (regs->ip + 18)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xBB41 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) { -+ regs->r11 = addr1; -+ regs->r10 = addr2; -+ regs->ip = addr1; -+ return true; -+ } -+ } while (0); -+ -+ /* gcc trampoline type 2 */ -+ do { -+ unsigned short mov1, mov2, jmp1; -+ unsigned char jmp2; -+ unsigned long addr1, addr2; -+ -+ err = get_user(mov1, (unsigned short __user *) regs->ip); -+ err |= get_user(addr1, (unsigned long __user *) (regs->ip + 2)); -+ err |= get_user(mov2, (unsigned short __user *) (regs->ip + 10)); -+ err |= get_user(addr2, (unsigned long __user *) (regs->ip + 12)); -+ err |= get_user(jmp1, (unsigned short __user *) (regs->ip + 20)); -+ err |= get_user(jmp2, (unsigned char __user *) (regs->ip + 22)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xBB49 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) { -+ regs->r11 = addr1; -+ regs->r10 = addr2; -+ regs->ip = addr1; -+ return true; -+ } -+ } while (0); -+ -+ return false; -+} -+NOKPROBE_SYMBOL(openpax_emulate_trampoline_64); -+#endif -+ -+/* -+ * Emulate a trampoline. Returns false if emulation failed, meaning -+ * that the task should be killed. -+ */ -+static inline -+bool openpax_emulate_trampoline(struct pt_regs *regs) -+{ -+ if (v8086_mode(regs)) -+ return false; -+ -+ if (regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)) -+ return openpax_emulate_trampoline_32(regs); -+#ifdef CONFIG_X86_64 -+ else -+ return openpax_emulate_trampoline_64(regs); -+#endif -+ -+ return false; -+} -+NOKPROBE_SYMBOL(openpax_emulate_trampoline); -+#endif -+ - /* - * Handle faults in the user portion of the address space. Nothing in here - * should check X86_PF_USER without a specific justification: for almost -@@ -1322,6 +1533,13 @@ void do_user_addr_fault(struct pt_regs *regs, - } - #endif - -+#ifdef CONFIG_OPENPAX_EMUTRAMP -+ if (openpax_fault_is_trampoline(error_code, regs, address)) { -+ if (openpax_emulate_trampoline(regs)) -+ return; -+ } -+#endif -+ - if (!(flags & FAULT_FLAG_USER)) - goto lock_mmap; - -diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c -index 8054f44d39cf..00f436d6d0a8 100644 ---- a/fs/binfmt_elf.c -+++ b/fs/binfmt_elf.c -@@ -47,6 +47,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -822,6 +823,72 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, - return ret == -ENOENT ? 0 : ret; - } - -+#ifdef CONFIG_OPENPAX -+#ifdef CONFIG_OPENPAX_XATTR_PAX_FLAGS -+static int openpax_parse_xattr_flags(struct file * const file) -+{ -+ ssize_t xattr_size, i; -+ unsigned char xattr_value[sizeof("pemrs") - 1]; -+ -+ xattr_size = pax_getxattr(file, xattr_value, sizeof xattr_value); -+ if (xattr_size < 0 || xattr_size > sizeof xattr_value) -+ return -ENOENT; -+ -+ for (i = 0; i < xattr_size; i++) -+ switch (xattr_value[i]) { -+ default: -+ return -EINVAL; -+ -+#define parse_flag(option_disable, option_enable, flag) \ -+ case option_disable: \ -+ clear_bit(flag, ¤t->mm->pax_flags); \ -+ break; \ -+ case option_enable: \ -+ set_bit(flag, ¤t->mm->pax_flags); \ -+ break; -+ -+ parse_flag('p', 'P', PAXF_PAGEEXEC); -+ parse_flag('e', 'E', PAXF_EMUTRAMP); -+ parse_flag('m', 'M', PAXF_MPROTECT); -+ parse_flag('r', 'R', PAXF_RANDMMAP); -+ parse_flag('s', 'S', PAXF_SEGMEXEC); -+#undef parse_flag -+ } -+ -+ return 0; -+} -+#endif -+ -+static int openpax_set_flags(struct file * const file, const int snapshot_randomize_va_space) -+{ -+#ifdef CONFIG_OPENPAX_XATTR_PAX_FLAGS -+ int error; -+#endif -+ current->mm->pax_flags = 0; -+ -+ if (snapshot_randomize_va_space) { -+ set_bit(PAXF_RANDMMAP, ¤t->mm->pax_flags); -+ } -+ -+ if (!pax_softmode) { -+ set_bit(PAXF_PAGEEXEC, ¤t->mm->pax_flags); -+ set_bit(PAXF_MPROTECT, ¤t->mm->pax_flags); -+ } -+ -+#ifdef CONFIG_OPENPAX_EMUTRAMP_DEFAULT -+ set_bit(PAXF_EMUTRAMP, ¤t->mm->pax_flags); -+#endif -+ -+#ifdef CONFIG_OPENPAX_XATTR_PAX_FLAGS -+ error = openpax_parse_xattr_flags(file); -+ if (error != -ENOENT) -+ return error; -+#endif -+ -+ return 0; -+} -+#endif -+ - static int load_elf_binary(struct linux_binprm *bprm) - { - struct file *interpreter = NULL; /* to shut gcc up */ -@@ -1006,11 +1073,28 @@ static int load_elf_binary(struct linux_binprm *bprm) - /* Do this immediately, since STACK_TOP as used in setup_arg_pages - may depend on the personality. */ - SET_PERSONALITY2(*elf_ex, &arch_state); -+ -+ const int snapshot_randomize_va_space = READ_ONCE(randomize_va_space); -+ -+#ifdef CONFIG_OPENPAX -+ retval = openpax_set_flags(bprm->file, snapshot_randomize_va_space); -+ if (retval) -+ goto out_free_dentry; -+ -+ if (test_bit(PAXF_PAGEEXEC, ¤t->mm->pax_flags) || test_bit(PAXF_SEGMEXEC, ¤t->mm->pax_flags)) { -+ executable_stack = EXSTACK_DISABLE_X; -+ current->personality &= ~READ_IMPLIES_EXEC; -+ } else -+#endif -+ - if (elf_read_implies_exec(*elf_ex, executable_stack)) - current->personality |= READ_IMPLIES_EXEC; - -- const int snapshot_randomize_va_space = READ_ONCE(randomize_va_space); -- if (!(current->personality & ADDR_NO_RANDOMIZE) && snapshot_randomize_va_space) -+ if (!(current->personality & ADDR_NO_RANDOMIZE) && snapshot_randomize_va_space -+#ifdef CONFIG_OPENPAX -+ && test_bit(PAXF_RANDMMAP, ¤t->mm->pax_flags) -+#endif -+ ) - current->flags |= PF_RANDOMIZE; - - setup_new_exec(bprm); -diff --git a/fs/proc/array.c b/fs/proc/array.c -index d6a0369caa93..242c8a969400 100644 ---- a/fs/proc/array.c -+++ b/fs/proc/array.c -@@ -436,6 +436,18 @@ __weak void arch_proc_pid_thread_features(struct seq_file *m, - { - } - -+#ifdef CONFIG_OPENPAX -+static inline void task_pax(struct seq_file *m, struct mm_struct *mm) -+{ -+ seq_printf(m, "PaX:\t%c%c%c%c%c\n", -+ test_bit(PAXF_PAGEEXEC, &mm->pax_flags) ? 'P' : 'p', -+ test_bit(PAXF_EMUTRAMP, &mm->pax_flags) ? 'E' : 'e', -+ test_bit(PAXF_MPROTECT, &mm->pax_flags) ? 'M' : 'm', -+ test_bit(PAXF_RANDMMAP, &mm->pax_flags) ? 'R' : 'r', -+ test_bit(PAXF_SEGMEXEC, &mm->pax_flags) ? 'S' : 's'); -+} -+#endif -+ - int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) - { -@@ -452,6 +464,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - task_core_dumping(m, task); - task_thp_status(m, mm); - task_untag_mask(m, mm); -+#ifdef CONFIG_OPENPAX -+ task_pax(m, mm); -+#endif - mmput(mm); - } - task_sig(m, task); -diff --git a/fs/xattr.c b/fs/xattr.c -index fabb2a04501e..76c2b5f8d6e6 100644 ---- a/fs/xattr.c -+++ b/fs/xattr.c -@@ -424,6 +424,22 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, - } - EXPORT_SYMBOL(__vfs_getxattr); - -+#ifdef CONFIG_OPENPAX_XATTR_PAX_FLAGS -+ssize_t -+pax_getxattr(struct file *file, void *value, size_t size) -+{ -+ struct inode *inode = file->f_path.dentry->d_inode; -+ ssize_t error; -+ -+ error = inode_permission(file_mnt_idmap(file), inode, MAY_EXEC); -+ if (error) -+ return error; -+ -+ return __vfs_getxattr(file->f_path.dentry, inode, XATTR_NAME_USER_PAX_FLAGS, value, size); -+} -+EXPORT_SYMBOL(pax_getxattr); -+#endif -+ - ssize_t - vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, - const char *name, void *value, size_t size) -diff --git a/include/linux/init.h b/include/linux/init.h -index ee1309473bc6..4abbce4cf60b 100644 ---- a/include/linux/init.h -+++ b/include/linux/init.h -@@ -144,6 +144,7 @@ extern char __initdata boot_command_line[]; - extern char *saved_command_line; - extern unsigned int saved_command_line_len; - extern unsigned int reset_devices; -+extern int pax_softmode; - - /* used by init/main.c */ - void setup_arch(char **); -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 0234f14f2aa6..fd8bd5517e4d 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -973,6 +973,9 @@ struct mm_struct { - mm_context_t context; - - unsigned long flags; /* Must use atomic bitops to access */ -+#ifdef CONFIG_OPENPAX -+ unsigned long pax_flags; -+#endif - - #ifdef CONFIG_AIO - spinlock_t ioctx_lock; -@@ -1656,4 +1659,12 @@ static inline unsigned long mmf_init_flags(unsigned long flags) - return flags & MMF_INIT_MASK; - } - -+#ifdef CONFIG_OPENPAX -+#define PAXF_PAGEEXEC 1 -+#define PAXF_EMUTRAMP 2 -+#define PAXF_MPROTECT 3 -+#define PAXF_RANDMMAP 4 -+#define PAXF_SEGMEXEC 5 -+#endif -+ - #endif /* _LINUX_MM_TYPES_H */ -diff --git a/include/linux/mman.h b/include/linux/mman.h -index a842783ffa62..e108371ff12e 100644 ---- a/include/linux/mman.h -+++ b/include/linux/mman.h -@@ -197,12 +197,21 @@ static inline bool arch_memory_deny_write_exec_supported(void) - * we propose to set. - * - * Return: false if proposed change is OK, true if not ok and should be denied. -+ * -+ * Note: If OpenPaX is enabled, it will be assumed that we want to deny -+ * PROT_WRITE | PROT_EXEC by default, unless the MPROTECT feature bit is -+ * disabled on a binary. - */ - static inline bool map_deny_write_exec(unsigned long old, unsigned long new) - { - /* If MDWE is disabled, we have nothing to deny. */ -- if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) -+ if ( -+#ifdef CONFIG_OPENPAX_MPROTECT -+ !test_bit(PAXF_MPROTECT, ¤t->mm->pax_flags) && -+#endif -+ !test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) { - return false; -+ } - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) -diff --git a/include/linux/xattr.h b/include/linux/xattr.h -index 86b0d47984a1..c4ad3af7e1a2 100644 ---- a/include/linux/xattr.h -+++ b/include/linux/xattr.h -@@ -25,6 +25,7 @@ - - struct inode; - struct dentry; -+struct file; - - static inline bool is_posix_acl_xattr(const char *name) - { -@@ -75,6 +76,9 @@ struct xattr { - size_t value_len; - }; - -+#ifdef CONFIG_OPENPAX_XATTR_PAX_FLAGS -+ssize_t pax_getxattr(struct file *, void *, size_t); -+#endif - ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); - ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, - void *, size_t); -diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h -index 9854f9cff3c6..843787b91ef0 100644 ---- a/include/uapi/linux/xattr.h -+++ b/include/uapi/linux/xattr.h -@@ -88,5 +88,10 @@ struct xattr_args { - #define XATTR_POSIX_ACL_DEFAULT "posix_acl_default" - #define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT - -+/* User namespace */ -+#define XATTR_PAX_PREFIX "pax." -+#define XATTR_PAX_FLAGS_SUFFIX "flags" -+#define XATTR_NAME_USER_PAX_FLAGS XATTR_USER_PREFIX XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX -+#define XATTR_NAME_PAX_FLAGS XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX - - #endif /* _UAPI_LINUX_XATTR_H */ -diff --git a/init/main.c b/init/main.c -index 2a1757826397..4720dce1a3b9 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -188,6 +188,17 @@ static int __init set_reset_devices(char *str) - - __setup("reset_devices", set_reset_devices); - -+int pax_softmode; -+ -+#ifdef CONFIG_OPENPAX_SOFTMODE -+static int __init setup_pax_softmode(char *str) -+{ -+ get_option(&str, &pax_softmode); -+ return 1; -+} -+__setup("pax_softmode=", setup_pax_softmode); -+#endif -+ - static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; - const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; - static const char *panic_later, *panic_param; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 1d600ae89f15..44aff4b84516 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -1647,6 +1647,18 @@ int proc_do_static_key(const struct ctl_table *table, int write, - return ret; - } - -+#ifdef CONFIG_OPENPAX_SOFTMODE -+static const struct ctl_table pax_table[] = { -+ { -+ .procname = "softmode", -+ .data = &pax_softmode, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = proc_dointvec, -+ }, -+}; -+#endif -+ - static const struct ctl_table kern_table[] = { - { - .procname = "panic", -@@ -2279,6 +2291,9 @@ int __init sysctl_init_bases(void) - { - register_sysctl_init("kernel", kern_table); - register_sysctl_init("vm", vm_table); -+#ifdef CONFIG_OPENPAX_SOFTMODE -+ register_sysctl_init("kernel/pax", pax_table); -+#endif - - return 0; - } -diff --git a/security/Kconfig b/security/Kconfig -index adc4a853ce0d..e9cfe77f08e0 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -311,6 +311,7 @@ config LSM - If unsure, leave this as the default. - - source "security/Kconfig.hardening" -+source "security/Kconfig.openpax" - - endmenu - -diff --git a/security/Kconfig.openpax b/security/Kconfig.openpax -new file mode 100644 -index 000000000000..76ee145094d9 ---- /dev/null -+++ b/security/Kconfig.openpax -@@ -0,0 +1,89 @@ -+# -+# OpenPaX configuration -+# -+ -+menu "OpenPaX options" -+ -+config OPENPAX -+ bool "Enable OpenPaX features" -+ default y -+ help -+ This configuration setting enables OpenPaX features. -+ OpenPaX adds memory safety-related defenses to the kernel which -+ reduce the risks posed by exploitable memory safety bugs. -+ -+config OPENPAX_SOFTMODE -+ bool "Support PaX soft mode" -+ default y -+ help -+ Enabling this option will allow you to configure OpenPaX -+ features to run in soft mode. In this mode, OpenPaX features -+ will be disabled by default, only running on applications -+ which explicitly enable them. -+ -+ Soft mode can be enabled via the kernel.pax.softmode sysctl, -+ or the pax_softmode=1 kernel command-line option. -+ -+config OPENPAX_XATTR_PAX_FLAGS -+ bool "Use filesystem extended attributes to modify OpenPaX features" -+ depends on OPENPAX -+ default y -+ help -+ Enabling this option will allow you to control whether -+ OpenPaX features are enabled on a per-executable basis via -+ xattr attributes. -+ -+ For compatibility with the original PaX patch, the feature -+ flags are read from the user.pax.flags extended attribute. -+ -+ If you disable this feature, then all applications will run -+ with OpenPaX enabled by default. -+ -+config OPENPAX_MPROTECT -+ bool "Enforce W^X for memory mappings" -+ depends on OPENPAX -+ default y -+ help -+ Enabling this option prevents programs from making pages -+ executable when they are also writable. In addition, it -+ also denies transition of writable mappings to executable -+ mappings. -+ -+ This feature is known to break programs which depend on -+ just-in-time (JIT) compilation. It is advisable to enable -+ this feature system-wide, but mark programs which have -+ JIT compilation appropriately so the W^X enforcement is -+ disabled for them. -+ -+config OPENPAX_EMUTRAMP -+ bool "Emulate stack and heap trampolines" -+ depends on OPENPAX -+ default y -+ help -+ Enabling this option allows programs to depend on common -+ types of stack and heap trampolines (such as the ones -+ generated by GCC and libffi) to continue working despite -+ the stack and heap being non-executable memory. -+ -+ This option works by intercepting the page faults caused -+ by executing code in non-executable memory and emulating -+ the side effects that would have happened from executing -+ the trampoline. -+ -+ Most likely, you should say 'y' here. -+ -+config OPENPAX_EMUTRAMP_DEFAULT -+ bool "Enable trampoline emulation by default" -+ depends on OPENPAX_EMUTRAMP -+ default y -+ help -+ Enabling this option allows programs which require -+ trampolines to be emulated to continue working by default. -+ -+ Otherwise, the emulation flag must be enabled in a binary's -+ PaX marking, e.g. with paxmark -E . -+ -+ If you do not say 'y' here, you will have to manually mark -+ all programs which require trampoline emulation. -+ -+endmenu --- -2.49.0 - diff --git a/sys-kernel/hardened-kernel/files/linux-6.14.amd64.config b/sys-kernel/hardened-kernel/files/linux-6.15.amd64.config similarity index 98% rename from sys-kernel/hardened-kernel/files/linux-6.14.amd64.config rename to sys-kernel/hardened-kernel/files/linux-6.15.amd64.config index a76adbc..5d7f0e0 100644 --- a/sys-kernel/hardened-kernel/files/linux-6.14.amd64.config +++ b/sys-kernel/hardened-kernel/files/linux-6.15.amd64.config @@ -1,28 +1,26 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.14.6-hardened1 Kernel Configuration +# Linux/x86 6.15.8-hardened2 Kernel Configuration # -CONFIG_CC_VERSION_TEXT="gcc (Gentoo Hardened 14.2.1_p20241221 p7) 14.2.1 20241221" +CONFIG_CC_VERSION_TEXT="gcc (Gentoo Hardened 14.3.0 p8) 14.3.0" CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=140201 +CONFIG_GCC_VERSION=140300 CONFIG_CLANG_VERSION=0 CONFIG_AS_IS_GNU=y CONFIG_AS_VERSION=24400 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=24400 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=108501 -CONFIG_RUSTC_LLVM_VERSION=190107 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y -CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_CC_HAS_ASM_INLINE=y CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y CONFIG_LD_CAN_USE_KEEP_IN_OVERLAY=y -CONFIG_RUSTC_HAS_COERCE_POINTEE=y -CONFIG_PAHOLE_VERSION=129 +CONFIG_PAHOLE_VERSION=130 CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_TABLE_SORT=y CONFIG_THREAD_INFO_IN_TASK=y @@ -76,7 +74,6 @@ CONFIG_HARDIRQS_SW_RESEND=y CONFIG_IRQ_DOMAIN=y CONFIG_IRQ_DOMAIN_HIERARCHY=y CONFIG_GENERIC_MSI_IRQ=y -CONFIG_IRQ_MSI_IOMMU=y CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y CONFIG_GENERIC_IRQ_RESERVATION_MODE=y CONFIG_GENERIC_IRQ_STAT_SNAPSHOT=y @@ -222,7 +219,6 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y # CONFIG_CPUSETS_V1 is not set -CONFIG_PROC_PID_CPUSET=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y @@ -261,12 +257,12 @@ CONFIG_LD_ORPHAN_WARN_LEVEL="error" CONFIG_SYSCTL=y CONFIG_HAVE_UID16=y CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_SYSFS_SYSCALL=y CONFIG_HAVE_PCSPKR_PLATFORM=y # CONFIG_EXPERT is not set CONFIG_UID16=y CONFIG_MULTIUSER=y CONFIG_SGETMASK_SYSCALL=y -CONFIG_SYSFS_SYSCALL=y CONFIG_FHANDLE=y CONFIG_POSIX_TIMERS=y CONFIG_PRINTK=y @@ -290,8 +286,8 @@ CONFIG_CACHESTAT_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_SELFTEST is not set # CONFIG_KALLSYMS_ALL is not set -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS=y CONFIG_HAVE_PERF_EVENTS=y CONFIG_GUEST_PERF_EVENTS=y @@ -340,7 +336,6 @@ CONFIG_X86_64_SMP=y CONFIG_ARCH_SUPPORTS_UPROBES=y CONFIG_FIX_EARLYCON_MEM=y CONFIG_PGTABLE_LEVELS=4 -CONFIG_CC_HAS_SANE_STACKPROTECTOR=y # # Processor type and features @@ -350,6 +345,7 @@ CONFIG_X86_X2APIC=y CONFIG_X86_POSTED_MSI=y CONFIG_X86_MPPARSE=y CONFIG_X86_CPU_RESCTRL=y +CONFIG_RESCTRL_FS_PSEUDO_LOCK=y CONFIG_X86_FRED=y # CONFIG_X86_EXTENDED_PLATFORM is not set CONFIG_X86_INTEL_LPSS=y @@ -372,61 +368,17 @@ CONFIG_PARAVIRT_CLOCK=y # CONFIG_JAILHOUSE_GUEST is not set # CONFIG_ACRN_GUEST is not set # CONFIG_INTEL_TDX_GUEST is not set -# CONFIG_MK8 is not set -# CONFIG_MK8SSE3 is not set -# CONFIG_MK10 is not set -# CONFIG_MBARCELONA is not set -# CONFIG_MBOBCAT is not set -# CONFIG_MJAGUAR is not set -# CONFIG_MBULLDOZER is not set -# CONFIG_MPILEDRIVER is not set -# CONFIG_MSTEAMROLLER is not set -# CONFIG_MEXCAVATOR is not set -# CONFIG_MZEN is not set -# CONFIG_MZEN2 is not set -# CONFIG_MZEN3 is not set -# CONFIG_MZEN4 is not set -# CONFIG_MZEN5 is not set -# CONFIG_MPSC is not set -# CONFIG_MATOM is not set -# CONFIG_MCORE2 is not set -# CONFIG_MNEHALEM is not set -# CONFIG_MWESTMERE is not set -# CONFIG_MSILVERMONT is not set -# CONFIG_MGOLDMONT is not set -# CONFIG_MGOLDMONTPLUS is not set -# CONFIG_MSANDYBRIDGE is not set -# CONFIG_MIVYBRIDGE is not set -# CONFIG_MHASWELL is not set -# CONFIG_MBROADWELL is not set -# CONFIG_MSKYLAKE is not set -# CONFIG_MSKYLAKEX is not set -# CONFIG_MCANNONLAKE is not set -# CONFIG_MICELAKE_CLIENT is not set -# CONFIG_MICELAKE_SERVER is not set -# CONFIG_MCASCADELAKE is not set -# CONFIG_MCOOPERLAKE is not set -# CONFIG_MTIGERLAKE is not set -# CONFIG_MSAPPHIRERAPIDS is not set -# CONFIG_MROCKETLAKE is not set -# CONFIG_MALDERLAKE is not set -# CONFIG_MRAPTORLAKE is not set -# CONFIG_MMETEORLAKE is not set -# CONFIG_MEMERALDRAPIDS is not set -CONFIG_GENERIC_CPU=y -# CONFIG_MNATIVE_INTEL is not set -# CONFIG_MNATIVE_AMD is not set -CONFIG_X86_64_VERSION=2 CONFIG_X86_INTERNODE_CACHE_SHIFT=6 CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_TSC=y CONFIG_X86_HAVE_PAE=y -CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CX8=y CONFIG_X86_CMOV=y CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y CONFIG_IA32_FEAT_CTL=y CONFIG_X86_VMX_FEATURE_NAMES=y +CONFIG_BROADCAST_TLB_FLUSH=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_AMD=y CONFIG_CPU_SUP_HYGON=y @@ -540,8 +492,7 @@ CONFIG_HOTPLUG_CPU=y # CONFIG_LEGACY_VSYSCALL_XONLY is not set CONFIG_LEGACY_VSYSCALL_NONE=y CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="vdso32=0 page_poison=1 page_alloc.shuffle=1 slab_nomerge pti=on" -# CONFIG_CMDLINE_OVERRIDE is not set +CONFIG_CMDLINE="" # CONFIG_MODIFY_LDT_SYSCALL is not set # CONFIG_STRICT_SIGALTSTACK_SIZE is not set CONFIG_HAVE_LIVEPATCH=y @@ -583,6 +534,8 @@ CONFIG_MITIGATION_SPECTRE_V1=y CONFIG_MITIGATION_SPECTRE_V2=y CONFIG_MITIGATION_SRBDS=y CONFIG_MITIGATION_SSB=y +CONFIG_MITIGATION_ITS=y +CONFIG_MITIGATION_TSA=y CONFIG_ARCH_HAS_ADD_PAGES=y # @@ -709,6 +662,7 @@ CONFIG_X86_AMD_FREQ_SENSITIVITY=m # # shared options # +CONFIG_CPUFREQ_ARCH_CUR_FREQ=y # end of CPU Frequency scaling # @@ -770,6 +724,7 @@ CONFIG_HAVE_KVM_PM_NOTIFIER=y CONFIG_KVM_GENERIC_HARDWARE_ENABLING=y CONFIG_KVM_GENERIC_MMU_NOTIFIER=y CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG=y +CONFIG_KVM_MMU_LOCKLESS_AGING=y CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES=y CONFIG_KVM_PRIVATE_MEM=y CONFIG_KVM_GENERIC_PRIVATE_MEM=y @@ -787,6 +742,30 @@ CONFIG_KVM_SMM=y # CONFIG_KVM_XEN is not set CONFIG_KVM_EXTERNAL_WRITE_TRACKING=y CONFIG_KVM_MAX_NR_VCPUS=1024 +CONFIG_X86_REQUIRED_FEATURE_ALWAYS=y +CONFIG_X86_REQUIRED_FEATURE_NOPL=y +CONFIG_X86_REQUIRED_FEATURE_CX8=y +CONFIG_X86_REQUIRED_FEATURE_CMOV=y +CONFIG_X86_REQUIRED_FEATURE_CPUID=y +CONFIG_X86_REQUIRED_FEATURE_FPU=y +CONFIG_X86_REQUIRED_FEATURE_PAE=y +CONFIG_X86_REQUIRED_FEATURE_PSE=y +CONFIG_X86_REQUIRED_FEATURE_PGE=y +CONFIG_X86_REQUIRED_FEATURE_MSR=y +CONFIG_X86_REQUIRED_FEATURE_FXSR=y +CONFIG_X86_REQUIRED_FEATURE_XMM=y +CONFIG_X86_REQUIRED_FEATURE_XMM2=y +CONFIG_X86_REQUIRED_FEATURE_LM=y +CONFIG_X86_DISABLED_FEATURE_VME=y +CONFIG_X86_DISABLED_FEATURE_K6_MTRR=y +CONFIG_X86_DISABLED_FEATURE_CYRIX_ARR=y +CONFIG_X86_DISABLED_FEATURE_CENTAUR_MCR=y +CONFIG_X86_DISABLED_FEATURE_LA57=y +CONFIG_X86_DISABLED_FEATURE_LAM=y +CONFIG_X86_DISABLED_FEATURE_SGX=y +CONFIG_X86_DISABLED_FEATURE_XENPV=y +CONFIG_X86_DISABLED_FEATURE_TDX_GUEST=y +CONFIG_X86_DISABLED_FEATURE_USER_SHSTK=y CONFIG_AS_AVX512=y CONFIG_AS_SHA1_NI=y CONFIG_AS_SHA256_NI=y @@ -897,6 +876,7 @@ CONFIG_ARCH_WANT_PMD_MKWRITE=y CONFIG_HAVE_ARCH_SOFT_DIRTY=y CONFIG_HAVE_MOD_ARCH_SPECIFIC=y CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_ARCH_HAS_EXECMEM_ROX=y CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK=y CONFIG_SOFTIRQ_ON_OWN_STACK=y @@ -950,6 +930,7 @@ CONFIG_DYNAMIC_SIGFRAME=y CONFIG_ARCH_HAS_HW_PTE_YOUNG=y CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y CONFIG_ARCH_HAS_KERNEL_FPU_SUPPORT=y +CONFIG_ARCH_VMLINUX_NEEDS_RELOCS=y # # GCOV-based kernel profiling @@ -1112,12 +1093,8 @@ CONFIG_ZSWAP_SHRINKER_DEFAULT_ON=y # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y CONFIG_ZSWAP_COMPRESSOR_DEFAULT="zstd" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -# CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED is not set CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC=y CONFIG_ZSWAP_ZPOOL_DEFAULT="zsmalloc" -CONFIG_ZBUD=y -# CONFIG_Z3FOLD_DEPRECATED is not set CONFIG_ZSMALLOC=y # CONFIG_ZSMALLOC_STAT is not set CONFIG_ZSMALLOC_CHAIN_SIZE=8 @@ -1126,6 +1103,7 @@ CONFIG_ZSMALLOC_CHAIN_SIZE=8 # Slab allocator options # CONFIG_SLUB=y +CONFIG_KVFREE_RCU_BATCHED=y # CONFIG_SLAB_MERGE_DEFAULT is not set CONFIG_SLAB_FREELIST_RANDOM=y CONFIG_SLAB_FREELIST_HARDENED=y @@ -1142,8 +1120,10 @@ CONFIG_SPARSEMEM=y CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_SPARSEMEM_VMEMMAP_PREINIT=y CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=y CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=y +CONFIG_ARCH_WANT_HUGETLB_VMEMMAP_PREINIT=y CONFIG_HAVE_GUP_FAST=y CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_ISOLATION=y @@ -1182,12 +1162,15 @@ CONFIG_MEMORY_FAILURE=y # CONFIG_HWPOISON_INJECT is not set CONFIG_ARCH_WANT_GENERAL_HUGETLB=y CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_MM_ID=y CONFIG_TRANSPARENT_HUGEPAGE=y # CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y # CONFIG_TRANSPARENT_HUGEPAGE_NEVER is not set CONFIG_THP_SWAP=y CONFIG_READ_ONLY_THP_FOR_FS=y +# CONFIG_NO_PAGE_MAPCOUNT is not set +CONFIG_PAGE_MAPCOUNT=y CONFIG_PGTABLE_HAS_HUGE_LEAVES=y CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y @@ -1978,6 +1961,7 @@ CONFIG_PCI_REALLOC_ENABLE_AUTO=y CONFIG_PCI_STUB=m # CONFIG_PCI_PF_STUB is not set CONFIG_PCI_ATS=y +CONFIG_PCI_DOE=y CONFIG_PCI_LOCKLESS_CONFIG=y CONFIG_PCI_IOV=y CONFIG_PCI_PRI=y @@ -2034,6 +2018,8 @@ CONFIG_HOTPLUG_PCI_ACPI_IBM=m # CONFIG_PCI_SW_SWITCHTEC is not set # end of PCI switch controller drivers +CONFIG_PCI_PWRCTL=m +CONFIG_PCI_PWRCTL_SLOT=m # CONFIG_CXL_BUS is not set # CONFIG_PCCARD is not set # CONFIG_RAPIDIO is not set @@ -2154,6 +2140,7 @@ CONFIG_UEFI_CPER_X86=y # end of Tegra firmware driver # end of Firmware Drivers +CONFIG_FWCTL=m # CONFIG_GNSS is not set # CONFIG_MTD is not set # CONFIG_OF is not set @@ -2654,6 +2641,7 @@ CONFIG_BCM_NET_PHYLIB=m # # CONFIG_MCTP_SERIAL is not set # CONFIG_MCTP_TRANSPORT_I3C is not set +# CONFIG_MCTP_TRANSPORT_USB is not set # end of MCTP Device Drivers CONFIG_MDIO_DEVICE=m @@ -2772,6 +2760,7 @@ CONFIG_IWLWIFI=m CONFIG_IWLWIFI_LEDS=y CONFIG_IWLDVM=m CONFIG_IWLMVM=m +CONFIG_IWLMLD=m CONFIG_IWLWIFI_OPMODE_MODULAR=y # @@ -3029,6 +3018,7 @@ CONFIG_SERIAL_8250_DWLIB=y CONFIG_SERIAL_8250_LPSS=y CONFIG_SERIAL_8250_MID=y CONFIG_SERIAL_8250_PERICOM=y +# CONFIG_SERIAL_8250_NI is not set # # Non-8250 serial port support @@ -3392,6 +3382,7 @@ CONFIG_SENSORS_DRIVETEMP=m # CONFIG_SENSORS_G762 is not set # CONFIG_SENSORS_HIH6130 is not set # CONFIG_SENSORS_HS3001 is not set +# CONFIG_SENSORS_HTU31 is not set # CONFIG_SENSORS_I5500 is not set CONFIG_SENSORS_CORETEMP=m # CONFIG_SENSORS_ISL28022 is not set @@ -3592,6 +3583,7 @@ CONFIG_WATCHDOG_SYSFS=y # # CONFIG_SOFT_WATCHDOG is not set # CONFIG_LENOVO_SE10_WDT is not set +# CONFIG_LENOVO_SE30_WDT is not set # CONFIG_WDAT_WDT is not set # CONFIG_XILINX_WATCHDOG is not set # CONFIG_ZIIRAVE_WATCHDOG is not set @@ -3697,6 +3689,7 @@ CONFIG_MFD_INTEL_LPSS_PCI=m # CONFIG_MFD_MAX14577 is not set # CONFIG_MFD_MAX77541 is not set # CONFIG_MFD_MAX77693 is not set +# CONFIG_MFD_MAX77705 is not set # CONFIG_MFD_MAX77843 is not set # CONFIG_MFD_MAX8907 is not set # CONFIG_MFD_MAX8925 is not set @@ -3710,7 +3703,6 @@ CONFIG_MFD_INTEL_LPSS_PCI=m # CONFIG_EZX_PCAP is not set # CONFIG_MFD_VIPERBOARD is not set # CONFIG_MFD_RETU is not set -# CONFIG_MFD_PCF50633 is not set # CONFIG_MFD_SY7636A is not set # CONFIG_MFD_RDC321X is not set # CONFIG_MFD_RT4831 is not set @@ -4013,15 +4005,6 @@ CONFIG_DRM_GEM_SHMEM_HELPER=y CONFIG_DRM_SUBALLOC_HELPER=m CONFIG_DRM_SCHED=m -# -# I2C encoder or helper chips -# -# CONFIG_DRM_I2C_CH7006 is not set -# CONFIG_DRM_I2C_SIL164 is not set -# CONFIG_DRM_I2C_NXP_TDA998X is not set -# CONFIG_DRM_I2C_NXP_TDA9950 is not set -# end of I2C encoder or helper chips - # # ARM devices # @@ -4058,6 +4041,8 @@ CONFIG_NOUVEAU_DEBUG_DEFAULT=3 # CONFIG_NOUVEAU_DEBUG_PUSH is not set CONFIG_DRM_NOUVEAU_BACKLIGHT=y # CONFIG_DRM_NOUVEAU_GSP_DEFAULT is not set +# CONFIG_DRM_NOUVEAU_CH7006 is not set +# CONFIG_DRM_NOUVEAU_SIL164 is not set CONFIG_DRM_I915=m CONFIG_DRM_I915_FORCE_PROBE="" CONFIG_DRM_I915_CAPTURE_ERROR=y @@ -4103,11 +4088,13 @@ CONFIG_DRM_PANEL_BRIDGE=y # # Display Interface Bridges # +# CONFIG_DRM_I2C_NXP_TDA998X is not set # CONFIG_DRM_ANALOGIX_ANX78XX is not set # end of Display Interface Bridges # CONFIG_DRM_ETNAVIV is not set # CONFIG_DRM_HISI_HIBMC is not set +# CONFIG_DRM_APPLETBDRM is not set CONFIG_DRM_BOCHS=m # CONFIG_DRM_CIRRUS_QEMU is not set # CONFIG_DRM_GM12U320 is not set @@ -4251,6 +4238,7 @@ CONFIG_SOUND=y CONFIG_SND=m CONFIG_SND_TIMER=m CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y CONFIG_SND_HWDEP=m CONFIG_SND_SEQ_DEVICE=m CONFIG_SND_RAWMIDI=m @@ -4584,6 +4572,7 @@ CONFIG_SND_SOC_I2C_AND_SPI=m # CONFIG_SND_SOC_ALC5623 is not set # CONFIG_SND_SOC_AW8738 is not set # CONFIG_SND_SOC_AW88395 is not set +# CONFIG_SND_SOC_AW88166 is not set # CONFIG_SND_SOC_AW88261 is not set # CONFIG_SND_SOC_AW88081 is not set # CONFIG_SND_SOC_AW87390 is not set @@ -4775,6 +4764,8 @@ CONFIG_HID_A4TECH=m # CONFIG_HID_ACRUX is not set # CONFIG_HID_APPLE is not set # CONFIG_HID_APPLEIR is not set +# CONFIG_HID_APPLETB_BL is not set +# CONFIG_HID_APPLETB_KBD is not set # CONFIG_HID_ASUS is not set # CONFIG_HID_AUREAL is not set # CONFIG_HID_BELKIN is not set @@ -4823,13 +4814,6 @@ CONFIG_HID_XIAOMI=m # CONFIG_HID_LED is not set CONFIG_HID_LENOVO=m # CONFIG_HID_LETSKETCH is not set -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -# CONFIG_LOGITECH_FF is not set -# CONFIG_LOGIRUMBLEPAD2_FF is not set -# CONFIG_LOGIG940_FF is not set -# CONFIG_LOGIWHEELS_FF is not set # CONFIG_HID_MAGICMOUSE is not set # CONFIG_HID_MALTRON is not set # CONFIG_HID_MAYFLASH is not set @@ -5144,6 +5128,7 @@ CONFIG_UCSI_ACPI=m # CONFIG_TYPEC_MUX_INTEL_PMC is not set # CONFIG_TYPEC_MUX_IT5205 is not set # CONFIG_TYPEC_MUX_NB7VPQ904M is not set +# CONFIG_TYPEC_MUX_PS883X is not set # CONFIG_TYPEC_MUX_PTN36502 is not set # CONFIG_TYPEC_MUX_TUSB1046 is not set # CONFIG_TYPEC_MUX_WCD939X_USBSS is not set @@ -5265,7 +5250,7 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=m # CONFIG_LEDS_TRIGGER_INPUT_EVENTS is not set # -# Simple LED drivers +# Simatic LED drivers # # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set @@ -5276,6 +5261,9 @@ CONFIG_EDAC=y # CONFIG_EDAC_DEBUG is not set CONFIG_EDAC_DECODE_MCE=m # CONFIG_EDAC_GHES is not set +CONFIG_EDAC_SCRUB=y +CONFIG_EDAC_ECS=y +CONFIG_EDAC_MEM_REPAIR=y CONFIG_EDAC_AMD64=m # CONFIG_EDAC_E752X is not set # CONFIG_EDAC_I82975X is not set @@ -5463,7 +5451,6 @@ CONFIG_VFIO_VIRQFD=y # VFIO support for PCI devices # CONFIG_VFIO_PCI_CORE=m -CONFIG_VFIO_PCI_MMAP=y CONFIG_VFIO_PCI_INTX=y CONFIG_VFIO_PCI=m CONFIG_VFIO_PCI_VGA=y @@ -5564,6 +5551,7 @@ CONFIG_AMD_WBRF=y CONFIG_WIRELESS_HOTKEY=m # CONFIG_IBM_RTL is not set CONFIG_IDEAPAD_LAPTOP=m +CONFIG_LENOVO_WMI_HOTKEY_UTILITIES=m CONFIG_LENOVO_YMC=m CONFIG_SENSORS_HDAPS=m CONFIG_THINKPAD_ACPI=m @@ -5616,6 +5604,7 @@ CONFIG_INTEL_VSEC=m # CONFIG_MSI_LAPTOP is not set # CONFIG_MSI_WMI is not set # CONFIG_MSI_WMI_PLATFORM is not set +# CONFIG_SAMSUNG_GALAXYBOOK is not set # CONFIG_SAMSUNG_LAPTOP is not set # CONFIG_SAMSUNG_Q10 is not set # CONFIG_TOSHIBA_BT_RFKILL is not set @@ -5629,7 +5618,6 @@ CONFIG_INTEL_VSEC=m # CONFIG_SYSTEM76_ACPI is not set # CONFIG_TOPSTAR_LAPTOP is not set # CONFIG_SERIAL_MULTI_INSTANTIATE is not set -# CONFIG_MLX_PLATFORM is not set # CONFIG_INSPUR_PLATFORM_PROFILE is not set # CONFIG_LENOVO_WMI_CAMERA is not set CONFIG_FW_ATTR_CLASS=m @@ -6141,7 +6129,6 @@ CONFIG_PSTORE_BLK=y CONFIG_PSTORE_BLK_BLKDEV="" CONFIG_PSTORE_BLK_KMSG_SIZE=64 CONFIG_PSTORE_BLK_MAX_REASON=2 -# CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set # CONFIG_EROFS_FS is not set CONFIG_NETWORK_FILESYSTEMS=y @@ -6259,7 +6246,6 @@ CONFIG_NLS_UTF8=y CONFIG_NLS_UCS2_UTILS=m # CONFIG_DLM is not set CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set CONFIG_IO_WQ=y # end of File systems @@ -6269,6 +6255,7 @@ CONFIG_IO_WQ=y CONFIG_KEYS=y CONFIG_KEYS_REQUEST_CACHE=y CONFIG_PERSISTENT_KEYRINGS=y +# CONFIG_BIG_KEYS is not set # CONFIG_TRUSTED_KEYS is not set CONFIG_ENCRYPTED_KEYS=y # CONFIG_USER_DECRYPTED_DATA is not set @@ -6288,8 +6275,6 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PATH=y # CONFIG_INTEL_TXT is not set CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y # CONFIG_STATIC_USERMODEHELPER is not set CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y @@ -6355,6 +6340,14 @@ CONFIG_PAGE_SANITIZE_VERIFY=y CONFIG_SLAB_SANITIZE_VERIFY=y # end of Memory initialization +# +# Bounds checking +# +CONFIG_FORTIFY_SOURCE=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_DEFAULT_ON=y +# end of Bounds checking + # # Hardening of kernel data structures # @@ -6368,17 +6361,6 @@ CONFIG_RANDSTRUCT_FULL=y CONFIG_RANDSTRUCT=y CONFIG_GCC_PLUGIN_RANDSTRUCT=y # end of Kernel hardening options - -# -# OpenPaX options -# -CONFIG_OPENPAX=y -CONFIG_OPENPAX_SOFTMODE=y -CONFIG_OPENPAX_XATTR_PAX_FLAGS=y -CONFIG_OPENPAX_MPROTECT=y -CONFIG_OPENPAX_EMUTRAMP=y -# CONFIG_OPENPAX_EMUTRAMP_DEFAULT is not set -# end of OpenPaX options # end of Security options CONFIG_XOR_BLOCKS=m @@ -6410,6 +6392,7 @@ CONFIG_CRYPTO_AKCIPHER=y CONFIG_CRYPTO_KPP2=y CONFIG_CRYPTO_KPP=m CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_HKDF=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_USER is not set @@ -6419,6 +6402,7 @@ CONFIG_CRYPTO_NULL2=y CONFIG_CRYPTO_PCRYPT=y CONFIG_CRYPTO_CRYPTD=y CONFIG_CRYPTO_AUTHENC=y +# CONFIG_CRYPTO_KRB5ENC is not set # CONFIG_CRYPTO_TEST is not set CONFIG_CRYPTO_SIMD=y CONFIG_CRYPTO_ENGINE=m @@ -6518,8 +6502,6 @@ CONFIG_CRYPTO_XXHASH=y # CONFIG_CRYPTO_CRC32C=y CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRC64_ROCKSOFT=y # end of CRCs (cyclic redundancy checks) # @@ -6645,6 +6627,7 @@ CONFIG_SYSTEM_TRUSTED_KEYS="" # CONFIG_SYSTEM_BLACKLIST_KEYRING is not set # end of Certificates for signature checking +# CONFIG_CRYPTO_KRB5 is not set CONFIG_BINARY_PRINTF=y # @@ -6694,20 +6677,18 @@ CONFIG_CRYPTO_LIB_SHA1=y CONFIG_CRYPTO_LIB_SHA256=y # end of Crypto library routines -CONFIG_CRC_CCITT=y +CONFIG_CRC_CCITT=m CONFIG_CRC16=y CONFIG_CRC_T10DIF=y CONFIG_ARCH_HAS_CRC_T10DIF=y CONFIG_CRC_T10DIF_ARCH=y -CONFIG_CRC64_ROCKSOFT=y -CONFIG_CRC_ITU_T=y +CONFIG_CRC_ITU_T=m CONFIG_CRC32=y CONFIG_ARCH_HAS_CRC32=y CONFIG_CRC32_ARCH=y CONFIG_CRC64=y -# CONFIG_CRC4 is not set -CONFIG_CRC7=m -CONFIG_LIBCRC32C=y +CONFIG_ARCH_HAS_CRC64=y +CONFIG_CRC64_ARCH=y CONFIG_CRC8=m CONFIG_CRC_OPTIMIZATIONS=y CONFIG_XXHASH=y @@ -6781,6 +6762,7 @@ CONFIG_GENERIC_GETTIMEOFDAY=y CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT=y CONFIG_VDSO_GETRANDOM=y +CONFIG_GENERIC_VDSO_DATA_STORE=y CONFIG_FONT_SUPPORT=y # CONFIG_FONTS is not set CONFIG_FONT_8x8=y @@ -6841,6 +6823,7 @@ CONFIG_STRIP_ASM_SYMS=y CONFIG_SECTION_MISMATCH_WARN_ONLY=y # CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE is not set CONFIG_OBJTOOL=y +# CONFIG_OBJTOOL_WERROR is not set # CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set # end of Compile-time checks and compiler options @@ -6887,8 +6870,8 @@ CONFIG_PAGE_POISONING=y CONFIG_DEBUG_RODATA_TEST=y CONFIG_ARCH_HAS_DEBUG_WX=y CONFIG_DEBUG_WX=y -CONFIG_GENERIC_PTDUMP=y -CONFIG_PTDUMP_CORE=y +CONFIG_ARCH_HAS_PTDUMP=y +CONFIG_PTDUMP=y # CONFIG_PTDUMP_DEBUGFS is not set CONFIG_HAVE_DEBUG_KMEMLEAK=y # CONFIG_DEBUG_KMEMLEAK is not set @@ -6898,6 +6881,7 @@ CONFIG_HAVE_DEBUG_KMEMLEAK=y # CONFIG_DEBUG_STACK_USAGE is not set CONFIG_SCHED_STACK_END_CHECK=y CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y +# CONFIG_DEBUG_VFS is not set # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_VM_PGTABLE is not set CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y @@ -6947,6 +6931,7 @@ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_DETECT_HUNG_TASK_BLOCKER=y CONFIG_WQ_WATCHDOG=y # CONFIG_WQ_CPU_INTENSIVE_REPORT is not set # CONFIG_TEST_LOCKUP is not set @@ -6955,7 +6940,6 @@ CONFIG_WQ_WATCHDOG=y # # Scheduler Debugging # -# CONFIG_SCHED_DEBUG is not set CONFIG_SCHED_INFO=y # CONFIG_SCHEDSTATS is not set # end of Scheduler Debugging @@ -7155,8 +7139,6 @@ CONFIG_RUNTIME_TESTING_MENU=y # CONFIG_ASYNC_RAID6_TEST is not set # CONFIG_TEST_HEXDUMP is not set # CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_SCANF is not set # CONFIG_TEST_BITMAP is not set # CONFIG_TEST_UUID is not set # CONFIG_TEST_XARRAY is not set @@ -7167,7 +7149,6 @@ CONFIG_RUNTIME_TESTING_MENU=y # CONFIG_TEST_BITOPS is not set # CONFIG_TEST_VMALLOC is not set CONFIG_TEST_BPF=m -# CONFIG_TEST_BLACKHOLE_DEV is not set # CONFIG_FIND_BIT_BENCHMARK is not set # CONFIG_TEST_FIRMWARE is not set # CONFIG_TEST_SYSCTL is not set @@ -7193,6 +7174,8 @@ CONFIG_ARCH_USE_MEMTEST=y # end of Rust hacking # end of Kernel hacking +CONFIG_IO_URING_ZCRX=y + # # Gentoo Linux # diff --git a/sys-kernel/hardened-kernel/files/linux-6.14/1189_restrict-fs-causes-bpf-verifier.patch b/sys-kernel/hardened-kernel/files/linux-6.15/1189_restrict-fs-causes-bpf-verifier.patch similarity index 100% rename from sys-kernel/hardened-kernel/files/linux-6.14/1189_restrict-fs-causes-bpf-verifier.patch rename to sys-kernel/hardened-kernel/files/linux-6.15/1189_restrict-fs-causes-bpf-verifier.patch diff --git a/sys-kernel/hardened-kernel/hardened-kernel-6.12.19.ebuild b/sys-kernel/hardened-kernel/hardened-kernel-6.12.19.ebuild deleted file mode 100644 index aa3d200..0000000 --- a/sys-kernel/hardened-kernel/hardened-kernel-6.12.19.ebuild +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2020-2025 Gentoo Authors -# Distributed under the terms of the GNU General Public License v2 - -EAPI=8 - -KERNEL_IUSE_GENERIC_UKI=1 -KERNEL_IUSE_MODULES_SIGN=1 - -inherit kernel-build toolchain-funcs - -MY_P=linux-${PV%.*} -GENPATCHES_P=genpatches-${PV%.*}-$(( ${PV##*.} + 4 )) -# https://koji.fedoraproject.org/koji/packageinfo?packageID=8 -# forked to https://github.com/projg2/fedora-kernel-config-for-gentoo -CONFIG_VER=6.12.8-gentoo -GENTOO_CONFIG_VER=g15 -HARDENED_PATCH_VER="${PV}-hardened1" -GENPATCHES_EXCLUDE="1500_XATTR_USER_PREFIX.patch - 1510_fs-enable-link-security-restrictions-by-default.patch - 2900_dev-root-proc-mount-fix.patch - 4200_fbcondecor.patch - 4400_alpha-sysctl-uac.patch" - -DESCRIPTION="Linux kernel built with Gentoo patches" -HOMEPAGE=" - https://wiki.gentoo.org/wiki/Project:Distribution_Kernel - https://www.kernel.org/ -" -SRC_URI+=" - https://cdn.kernel.org/pub/linux/kernel/v$(ver_cut 1).x/${MY_P}.tar.xz - https://dev.gentoo.org/~mpagano/dist/genpatches/${GENPATCHES_P}.base.tar.xz - https://dev.gentoo.org/~mpagano/dist/genpatches/${GENPATCHES_P}.extras.tar.xz - experimental? ( - https://dev.gentoo.org/~mpagano/dist/genpatches/${GENPATCHES_P}.experimental.tar.xz - ) - https://github.com/anthraxx/linux-hardened/releases/download/v${HARDENED_PATCH_VER}/linux-hardened-v${HARDENED_PATCH_VER}.patch - https://github.com/projg2/gentoo-kernel-config/archive/${GENTOO_CONFIG_VER}.tar.gz - -> gentoo-kernel-config-${GENTOO_CONFIG_VER}.tar.gz - amd64? ( - https://raw.githubusercontent.com/projg2/fedora-kernel-config-for-gentoo/${CONFIG_VER}/kernel-x86_64-fedora.config - -> kernel-x86_64-fedora.config.${CONFIG_VER} - ) - arm64? ( - https://raw.githubusercontent.com/projg2/fedora-kernel-config-for-gentoo/${CONFIG_VER}/kernel-aarch64-fedora.config - -> kernel-aarch64-fedora.config.${CONFIG_VER} - ) - ppc64? ( - https://raw.githubusercontent.com/projg2/fedora-kernel-config-for-gentoo/${CONFIG_VER}/kernel-ppc64le-fedora.config - -> kernel-ppc64le-fedora.config.${CONFIG_VER} - ) - x86? ( - https://raw.githubusercontent.com/projg2/fedora-kernel-config-for-gentoo/${CONFIG_VER}/kernel-i686-fedora.config - -> kernel-i686-fedora.config.${CONFIG_VER} - ) -" -S=${WORKDIR}/${MY_P} - -KEYWORDS="amd64 ~arm arm64 ~hppa ~loong ~ppc ppc64 ~riscv ~sparc x86" -IUSE="debug +experimental" -REQUIRED_USE=" - arm? ( savedconfig ) - hppa? ( savedconfig ) - riscv? ( savedconfig ) - sparc? ( savedconfig ) -" - -RDEPEND=" - !sys-kernel/gentoo-kernel-bin:${SLOT} -" -BDEPEND=" - debug? ( dev-util/pahole ) -" -PDEPEND=" - >=virtual/dist-kernel-${PV} -" - -QA_FLAGS_IGNORED=" - usr/src/linux-.*/scripts/gcc-plugins/.*.so - usr/src/linux-.*/vmlinux - usr/src/linux-.*/arch/powerpc/kernel/vdso.*/vdso.*.so.dbg -" - -src_prepare() { - # remove some genpatches causes conflicts with linux-hardened patch - for patch in ${GENPATCHES_EXCLUDE}; do - rm -f ${WORKDIR}/${patch} - done - # Remove already exists changes in linux-hardened patch - sed -i '322,337d' "${WORKDIR}/4567_distro-Gentoo-Kconfig.patch" - # include linux-hardened patch with priority - cp ${DISTDIR}/linux-hardened-v${HARDENED_PATCH_VER}.patch ${WORKDIR}/1199_linux-hardened-${HARDENED_PATCH_VER}.patch - - # copy pkg maintainer supplied patches - if [ -d "${FILESDIR}/${MY_P}" ]; then - cp "${FILESDIR}/${MY_P}"/*.patch ${WORKDIR}/ - fi - - local PATCHES=( - # meh, genpatches have no directory - "${WORKDIR}"/*.patch - ) - default - - #sed -i "s@\-hardened1@@g" Makefile || die - - local biendian=false - - # prepare the default config - case ${ARCH} in - amd64) - cp "${FILESDIR}/${MY_P}.amd64.config" .config || die - ;; - *) - die "Unsupported arch ${ARCH}" - ;; - esac - - local myversion="-gentoo-dist" - echo "CONFIG_LOCALVERSION=\"${myversion}\"" > "${T}"/version.config || die - local dist_conf_path="${WORKDIR}/gentoo-kernel-config-${GENTOO_CONFIG_VER}" - - local merge_configs=( - "${T}"/version.config - ) - use debug || merge_configs+=( - "${dist_conf_path}"/no-debug.config - ) - - merge_configs+=( "${dist_conf_path}"/hardened-base.config ) - - tc-is-gcc && merge_configs+=( "${dist_conf_path}"/hardened-gcc-plugins.config ) - - if [[ -f "${dist_conf_path}/hardened-${ARCH}.config" ]]; then - merge_configs+=( "${dist_conf_path}/hardened-${ARCH}.config" ) - fi - - # this covers ppc64 and aarch64_be only for now - if [[ ${biendian} == true && $(tc-endian) == big ]]; then - merge_configs+=( "${dist_conf_path}/big-endian.config" ) - fi - - use secureboot && merge_configs+=( "${dist_conf_path}/secureboot.config" ) - - kernel-build_merge_configs "${merge_configs[@]}" -} diff --git a/sys-kernel/hardened-kernel/hardened-kernel-6.14.8.ebuild b/sys-kernel/hardened-kernel/hardened-kernel-6.15.8.ebuild similarity index 64% rename from sys-kernel/hardened-kernel/hardened-kernel-6.14.8.ebuild rename to sys-kernel/hardened-kernel/hardened-kernel-6.15.8.ebuild index a460dbc..bf4acd8 100644 --- a/sys-kernel/hardened-kernel/hardened-kernel-6.14.8.ebuild +++ b/sys-kernel/hardened-kernel/hardened-kernel-6.15.8.ebuild @@ -6,20 +6,20 @@ EAPI=8 KERNEL_IUSE_GENERIC_UKI=1 KERNEL_IUSE_MODULES_SIGN=1 -inherit kernel-build toolchain-funcs +inherit kernel-build toolchain-funcs verify-sig MY_P=linux-${PV%.*} -GENPATCHES_P=genpatches-${PV%.*}-$(( ${PV##*.} + 1 )) +PATCHSET=linux-gentoo-patches-6.15.8 # https://koji.fedoraproject.org/koji/packageinfo?packageID=8 # forked to https://github.com/projg2/fedora-kernel-config-for-gentoo -CONFIG_VER=6.14.5-gentoo +CONFIG_VER=6.15.6-gentoo GENTOO_CONFIG_VER=g16 -HARDENED_PATCH_VER="${PV}-hardened1" +SHA256SUM_DATE=20250724 +HARDENED_PATCH_VER="${PV}-hardened2" +USER_PATCHSET=linux-user-patches-6.15.8 GENPATCHES_EXCLUDE="1500_XATTR_USER_PREFIX.patch - 1510_fs-enable-link-security-restrictions-by-default.patch - 2900_dev-root-proc-mount-fix.patch - 4200_fbcondecor.patch - 4400_alpha-sysctl-uac.patch" + 0001-fs-Enable-link-security-restrictions-by-default.patch +" DESCRIPTION="Linux kernel built with Gentoo patches" HOMEPAGE=" @@ -27,15 +27,16 @@ HOMEPAGE=" https://www.kernel.org/ " SRC_URI+=" - https://cdn.kernel.org/pub/linux/kernel/v$(ver_cut 1).x/${MY_P}.tar.xz - https://dev.gentoo.org/~mpagano/dist/genpatches/${GENPATCHES_P}.base.tar.xz - https://dev.gentoo.org/~mpagano/dist/genpatches/${GENPATCHES_P}.extras.tar.xz - experimental? ( - https://dev.gentoo.org/~mpagano/dist/genpatches/${GENPATCHES_P}.experimental.tar.xz - ) + https://cdn.kernel.org/pub/linux/kernel/v$(ver_cut 1).x/${MY_P}.tar.xz + https://cdn.kernel.org/pub/linux/kernel/v$(ver_cut 1).x/patch-${PV}.xz + https://dev.gentoo.org/~mgorny/dist/linux/${PATCHSET}.tar.xz https://github.com/anthraxx/linux-hardened/releases/download/v${HARDENED_PATCH_VER}/linux-hardened-v${HARDENED_PATCH_VER}.patch https://github.com/projg2/gentoo-kernel-config/archive/${GENTOO_CONFIG_VER}.tar.gz -> gentoo-kernel-config-${GENTOO_CONFIG_VER}.tar.gz + verify-sig? ( + https://cdn.kernel.org/pub/linux/kernel/v$(ver_cut 1).x/sha256sums.asc + -> linux-$(ver_cut 1).x-sha256sums-${SHA256SUM_DATE}.asc + ) amd64? ( https://raw.githubusercontent.com/projg2/fedora-kernel-config-for-gentoo/${CONFIG_VER}/kernel-x86_64-fedora.config -> kernel-x86_64-fedora.config.${CONFIG_VER} @@ -80,28 +81,56 @@ QA_FLAGS_IGNORED=" usr/src/linux-.*/arch/powerpc/kernel/vdso.*/vdso.*.so.dbg " +VERIFY_SIG_OPENPGP_KEY_PATH=/usr/share/openpgp-keys/kernel.org.asc + +src_unpack() { + if use verify-sig; then + cd "${DISTDIR}" || die + verify-sig_verify_signed_checksums \ + "linux-$(ver_cut 1).x-sha256sums-${SHA256SUM_DATE}.asc" \ + sha256 "${MY_P}.tar.xz patch-${PV}.xz" + cd "${WORKDIR}" || die + fi + + default +} + src_prepare() { + local patch + + mkdir ${WORKDIR}/${USER_PATCHSET} + # remove some genpatches causes conflicts with linux-hardened patch for patch in ${GENPATCHES_EXCLUDE}; do - rm -f ${WORKDIR}/${patch} + rm -f ${WORKDIR}/${PATCHSET}/${patch} done # Remove already exists changes in linux-hardened patch - sed -i '322,337d' "${WORKDIR}/4567_distro-Gentoo-Kconfig.patch" + sed -i '344,356d' "${WORKDIR}/linux-gentoo-patches-${PV}/0010-Add-Gentoo-Linux-support-config-settings-and-default.patch" # include linux-hardened patch with priority - cp ${DISTDIR}/linux-hardened-v${HARDENED_PATCH_VER}.patch ${WORKDIR}/1198_linux-hardened-${HARDENED_PATCH_VER}.patch + cp ${DISTDIR}/linux-hardened-v${HARDENED_PATCH_VER}.patch ${WORKDIR}/${USER_PATCHSET}/1198_linux-hardened-${HARDENED_PATCH_VER}.patch # copy pkg maintainer supplied patches if [ -d "${FILESDIR}/${MY_P}" ]; then - cp "${FILESDIR}/${MY_P}"/*.patch ${WORKDIR}/ + cp "${FILESDIR}/${MY_P}"/*.patch ${WORKDIR}/${USER_PATCHSET}/ fi - local PATCHES=( - # meh, genpatches have no directory - "${WORKDIR}"/*.patch - ) - default + eapply "${WORKDIR}/patch-${PV}" + for patch in "${WORKDIR}/${PATCHSET}"/*.patch; do + eapply "${patch}" + # non-experimental patches always finish with Gentoo Kconfig + # when ! use experimental, stop applying after it + if [[ ${patch} == *Add-Gentoo-Linux-support-config-settings* ]] && + ! use experimental + then + break + fi + done - #sed -i "s@\-hardened1@@g" Makefile || die + for patch in "${WORKDIR}/${USER_PATCHSET}"/*.patch; do + eapply "${patch}" + done + + default local biendian=false