41409 lines
1.3 MiB
41409 lines
1.3 MiB
From daf3d0f8fb4768e7c05f1fac44b217f4437ce04b Mon Sep 17 00:00:00 2001
|
|
From: Alexander Miroshnichenko <alex@millerson.name>
|
|
Date: Thu, 22 May 2025 13:15:09 +0300
|
|
Subject: [PATCH] bcachefs: cherry-pick updates from master 17227e8
|
|
Content-Type: text/plain; charset="utf-8"
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Signed-off-by: Alexander Miroshnichenko <alex@millerson.name>
|
|
---
|
|
.../bcachefs/SubmittingPatches.rst | 43 +-
|
|
.../filesystems/bcachefs/casefolding.rst | 108 ++
|
|
.../filesystems/bcachefs/future/idle_work.rst | 78 ++
|
|
Documentation/filesystems/bcachefs/index.rst | 27 +-
|
|
MAINTAINERS | 7 +
|
|
block/bdev.c | 2 +
|
|
block/blk-core.c | 19 +-
|
|
fs/bcachefs/Kconfig | 16 +-
|
|
fs/bcachefs/Makefile | 8 +-
|
|
fs/bcachefs/acl.c | 4 +-
|
|
fs/bcachefs/alloc_background.c | 428 ++++---
|
|
fs/bcachefs/alloc_background.h | 9 +-
|
|
fs/bcachefs/alloc_foreground.c | 650 +++++------
|
|
fs/bcachefs/alloc_foreground.h | 92 +-
|
|
fs/bcachefs/alloc_types.h | 18 +-
|
|
fs/bcachefs/async_objs.c | 132 +++
|
|
fs/bcachefs/async_objs.h | 44 +
|
|
fs/bcachefs/async_objs_types.h | 25 +
|
|
fs/bcachefs/backpointers.c | 553 +++++----
|
|
fs/bcachefs/backpointers.h | 40 +-
|
|
fs/bcachefs/bcachefs.h | 252 ++--
|
|
fs/bcachefs/bcachefs_format.h | 124 +-
|
|
fs/bcachefs/bcachefs_ioctl.h | 29 +-
|
|
fs/bcachefs/bkey.c | 47 +-
|
|
fs/bcachefs/bkey.h | 4 +-
|
|
fs/bcachefs/bkey_methods.c | 26 +-
|
|
fs/bcachefs/bset.c | 64 +-
|
|
fs/bcachefs/bset.h | 22 +-
|
|
fs/bcachefs/btree_cache.c | 212 ++--
|
|
fs/bcachefs/btree_gc.c | 102 +-
|
|
fs/bcachefs/btree_gc.h | 3 +-
|
|
fs/bcachefs/btree_io.c | 616 +++++++---
|
|
fs/bcachefs/btree_io.h | 16 +-
|
|
fs/bcachefs/btree_iter.c | 420 ++++---
|
|
fs/bcachefs/btree_iter.h | 213 ++--
|
|
fs/bcachefs/btree_journal_iter.c | 5 +-
|
|
fs/bcachefs/btree_key_cache.c | 96 +-
|
|
fs/bcachefs/btree_key_cache.h | 3 +-
|
|
fs/bcachefs/btree_locking.c | 73 +-
|
|
fs/bcachefs/btree_locking.h | 41 +-
|
|
fs/bcachefs/btree_node_scan.c | 63 +-
|
|
fs/bcachefs/btree_node_scan_types.h | 2 +-
|
|
fs/bcachefs/btree_trans_commit.c | 199 ++--
|
|
fs/bcachefs/btree_types.h | 47 +-
|
|
fs/bcachefs/btree_update.c | 95 +-
|
|
fs/bcachefs/btree_update.h | 71 +-
|
|
fs/bcachefs/btree_update_interior.c | 338 +++---
|
|
fs/bcachefs/btree_update_interior.h | 11 +-
|
|
fs/bcachefs/btree_write_buffer.c | 38 +-
|
|
fs/bcachefs/btree_write_buffer.h | 1 +
|
|
fs/bcachefs/btree_write_buffer_types.h | 2 +-
|
|
fs/bcachefs/buckets.c | 303 ++---
|
|
fs/bcachefs/buckets.h | 58 +-
|
|
fs/bcachefs/buckets_types.h | 32 +
|
|
fs/bcachefs/chardev.c | 58 +-
|
|
fs/bcachefs/checksum.c | 270 ++---
|
|
fs/bcachefs/checksum.h | 7 +-
|
|
fs/bcachefs/compress.c | 74 +-
|
|
fs/bcachefs/data_update.c | 330 ++++--
|
|
fs/bcachefs/data_update.h | 44 +-
|
|
fs/bcachefs/debug.c | 119 +-
|
|
fs/bcachefs/debug.h | 20 +-
|
|
fs/bcachefs/dirent.c | 294 ++++-
|
|
fs/bcachefs/dirent.h | 30 +-
|
|
fs/bcachefs/dirent_format.h | 20 +-
|
|
fs/bcachefs/disk_accounting.c | 170 ++-
|
|
fs/bcachefs/disk_accounting.h | 54 +-
|
|
fs/bcachefs/disk_accounting_format.h | 90 +-
|
|
fs/bcachefs/disk_accounting_types.h | 2 +-
|
|
fs/bcachefs/disk_groups.c | 148 ++-
|
|
fs/bcachefs/ec.c | 754 ++++++------
|
|
fs/bcachefs/ec.h | 56 +-
|
|
fs/bcachefs/ec_types.h | 19 +-
|
|
fs/bcachefs/enumerated_ref.c | 144 +++
|
|
fs/bcachefs/enumerated_ref.h | 66 ++
|
|
fs/bcachefs/enumerated_ref_types.h | 19 +
|
|
fs/bcachefs/errcode.h | 77 +-
|
|
fs/bcachefs/error.c | 387 +++++--
|
|
fs/bcachefs/error.h | 113 +-
|
|
fs/bcachefs/extent_update.c | 67 +-
|
|
fs/bcachefs/extent_update.h | 2 +-
|
|
fs/bcachefs/extents.c | 379 ++++--
|
|
fs/bcachefs/extents.h | 34 +-
|
|
fs/bcachefs/extents_format.h | 24 +-
|
|
fs/bcachefs/extents_types.h | 12 +-
|
|
fs/bcachefs/eytzinger.c | 76 +-
|
|
fs/bcachefs/eytzinger.h | 95 +-
|
|
fs/bcachefs/fast_list.c | 156 +++
|
|
fs/bcachefs/fast_list.h | 41 +
|
|
fs/bcachefs/fs-io-buffered.c | 59 +-
|
|
fs/bcachefs/fs-io-direct.c | 27 +-
|
|
fs/bcachefs/fs-io-pagecache.c | 18 +-
|
|
fs/bcachefs/fs-io.c | 97 +-
|
|
fs/bcachefs/fs-ioctl.c | 215 +---
|
|
fs/bcachefs/fs-ioctl.h | 73 --
|
|
fs/bcachefs/fs.c | 672 ++++++++---
|
|
fs/bcachefs/fsck.c | 734 +++++-------
|
|
fs/bcachefs/inode.c | 204 ++--
|
|
fs/bcachefs/inode.h | 46 +-
|
|
fs/bcachefs/inode_format.h | 13 +-
|
|
fs/bcachefs/io_misc.c | 21 +-
|
|
fs/bcachefs/io_read.c | 1024 ++++++++++-------
|
|
fs/bcachefs/io_read.h | 111 +-
|
|
fs/bcachefs/io_write.c | 539 +++++----
|
|
fs/bcachefs/io_write.h | 38 +-
|
|
fs/bcachefs/io_write_types.h | 34 +-
|
|
fs/bcachefs/journal.c | 317 +++--
|
|
fs/bcachefs/journal.h | 52 +-
|
|
fs/bcachefs/journal_io.c | 283 +++--
|
|
fs/bcachefs/journal_io.h | 2 +-
|
|
fs/bcachefs/journal_reclaim.c | 72 +-
|
|
fs/bcachefs/journal_sb.c | 2 +-
|
|
fs/bcachefs/journal_seq_blacklist.c | 7 +-
|
|
fs/bcachefs/journal_types.h | 39 +-
|
|
fs/bcachefs/lru.c | 107 +-
|
|
fs/bcachefs/lru.h | 22 +-
|
|
fs/bcachefs/lru_format.h | 6 +-
|
|
fs/bcachefs/migrate.c | 143 ++-
|
|
fs/bcachefs/migrate.h | 3 +-
|
|
fs/bcachefs/move.c | 641 +++++++----
|
|
fs/bcachefs/move.h | 17 +-
|
|
fs/bcachefs/move_types.h | 28 +-
|
|
fs/bcachefs/movinggc.c | 233 ++--
|
|
fs/bcachefs/movinggc.h | 11 +-
|
|
fs/bcachefs/{fs-common.c => namei.c} | 501 ++++++--
|
|
fs/bcachefs/{fs-common.h => namei.h} | 38 +-
|
|
fs/bcachefs/nocow_locking.c | 4 +-
|
|
fs/bcachefs/nocow_locking.h | 2 +-
|
|
fs/bcachefs/opts.c | 314 +++--
|
|
fs/bcachefs/opts.h | 111 +-
|
|
fs/bcachefs/printbuf.c | 19 +
|
|
fs/bcachefs/printbuf.h | 1 +
|
|
fs/bcachefs/progress.c | 61 +
|
|
fs/bcachefs/progress.h | 29 +
|
|
fs/bcachefs/quota.c | 2 +-
|
|
fs/bcachefs/rcu_pending.c | 3 +-
|
|
fs/bcachefs/rebalance.c | 292 ++++-
|
|
fs/bcachefs/rebalance.h | 8 +-
|
|
fs/bcachefs/rebalance_types.h | 5 +
|
|
fs/bcachefs/recovery.c | 163 ++-
|
|
fs/bcachefs/recovery.h | 3 +-
|
|
fs/bcachefs/recovery_passes.c | 590 +++++++---
|
|
fs/bcachefs/recovery_passes.h | 26 +-
|
|
fs/bcachefs/recovery_passes_format.h | 104 ++
|
|
fs/bcachefs/recovery_passes_types.h | 93 +-
|
|
fs/bcachefs/reflink.c | 59 +-
|
|
fs/bcachefs/sb-counters.c | 90 +-
|
|
fs/bcachefs/sb-counters.h | 4 +
|
|
fs/bcachefs/sb-counters_format.h | 33 +-
|
|
fs/bcachefs/sb-downgrade.c | 22 +-
|
|
fs/bcachefs/sb-errors_format.h | 30 +-
|
|
fs/bcachefs/sb-errors_types.h | 2 +-
|
|
fs/bcachefs/sb-members.c | 83 +-
|
|
fs/bcachefs/sb-members.h | 95 +-
|
|
fs/bcachefs/sb-members_format.h | 7 +
|
|
fs/bcachefs/sb-members_types.h | 1 +
|
|
fs/bcachefs/snapshot.c | 539 ++++++---
|
|
fs/bcachefs/snapshot.h | 34 +-
|
|
fs/bcachefs/snapshot_format.h | 4 +-
|
|
fs/bcachefs/snapshot_types.h | 58 +
|
|
fs/bcachefs/str_hash.c | 143 ++-
|
|
fs/bcachefs/str_hash.h | 29 +-
|
|
fs/bcachefs/subvolume.c | 71 +-
|
|
fs/bcachefs/subvolume.h | 20 +-
|
|
fs/bcachefs/subvolume_types.h | 27 -
|
|
fs/bcachefs/super-io.c | 183 ++-
|
|
fs/bcachefs/super-io.h | 11 +-
|
|
fs/bcachefs/super.c | 971 +++++++++++-----
|
|
fs/bcachefs/super.h | 11 +-
|
|
fs/bcachefs/super_types.h | 8 +-
|
|
fs/bcachefs/sysfs.c | 344 ++++--
|
|
fs/bcachefs/sysfs.h | 5 +-
|
|
fs/bcachefs/tests.c | 34 +-
|
|
fs/bcachefs/thread_with_file.c | 4 +-
|
|
fs/bcachefs/thread_with_file_types.h | 2 +-
|
|
fs/bcachefs/time_stats.c | 20 +-
|
|
fs/bcachefs/time_stats.h | 1 +
|
|
fs/bcachefs/trace.h | 159 +--
|
|
fs/bcachefs/util.c | 274 ++++-
|
|
fs/bcachefs/util.h | 100 +-
|
|
fs/bcachefs/xattr.c | 31 +-
|
|
fs/bcachefs/xattr.h | 4 +-
|
|
fs/bcachefs/xattr_format.h | 8 +-
|
|
fs/dcache.c | 267 +++++
|
|
fs/libfs.c | 1 +
|
|
fs/overlayfs/params.c | 20 +-
|
|
fs/overlayfs/util.c | 19 +-
|
|
fs/super.c | 1 +
|
|
fs/xfs/xfs_super.c | 3 +-
|
|
{fs/bcachefs => include/linux}/darray.h | 59 +-
|
|
include/linux/darray_types.h | 33 +
|
|
include/linux/dcache.h | 12 +
|
|
include/linux/fs.h | 6 +
|
|
include/linux/seq_buf.h | 4 +
|
|
include/linux/shrinker.h | 13 +-
|
|
include/linux/sort.h | 11 +
|
|
lib/Makefile | 2 +-
|
|
{fs/bcachefs => lib}/darray.c | 9 +-
|
|
lib/seq_buf.c | 10 +
|
|
lib/sort.c | 110 +-
|
|
mm/oom_kill.c | 23 -
|
|
mm/show_mem.c | 50 +
|
|
mm/shrinker.c | 95 +-
|
|
mm/shrinker_debug.c | 18 +
|
|
mm/slab.h | 6 +-
|
|
mm/slab_common.c | 52 +-
|
|
206 files changed, 14908 insertions(+), 7898 deletions(-)
|
|
create mode 100644 Documentation/filesystems/bcachefs/casefolding.rst
|
|
create mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst
|
|
create mode 100644 fs/bcachefs/async_objs.c
|
|
create mode 100644 fs/bcachefs/async_objs.h
|
|
create mode 100644 fs/bcachefs/async_objs_types.h
|
|
create mode 100644 fs/bcachefs/enumerated_ref.c
|
|
create mode 100644 fs/bcachefs/enumerated_ref.h
|
|
create mode 100644 fs/bcachefs/enumerated_ref_types.h
|
|
create mode 100644 fs/bcachefs/fast_list.c
|
|
create mode 100644 fs/bcachefs/fast_list.h
|
|
rename fs/bcachefs/{fs-common.c => namei.c} (52%)
|
|
rename fs/bcachefs/{fs-common.h => namei.h} (52%)
|
|
create mode 100644 fs/bcachefs/progress.c
|
|
create mode 100644 fs/bcachefs/progress.h
|
|
create mode 100644 fs/bcachefs/recovery_passes_format.h
|
|
create mode 100644 fs/bcachefs/snapshot_types.h
|
|
rename {fs/bcachefs => include/linux}/darray.h (64%)
|
|
create mode 100644 include/linux/darray_types.h
|
|
rename {fs/bcachefs => lib}/darray.c (75%)
|
|
|
|
diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst
|
|
index 026b12ae0d6a..a455f9cfd15c 100644
|
|
--- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst
|
|
+++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst
|
|
@@ -1,8 +1,13 @@
|
|
-Submitting patches to bcachefs:
|
|
-===============================
|
|
+Submitting patches to bcachefs
|
|
+==============================
|
|
+
|
|
+Here are suggestions for submitting patches to the bcachefs subsystem.
|
|
+
|
|
+Submission checklist
|
|
+--------------------
|
|
|
|
Patches must be tested before being submitted, either with the xfstests suite
|
|
-[0], or the full bcachefs test suite in ktest [1], depending on what's being
|
|
+[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being
|
|
touched. Note that ktest wraps xfstests and will be an easier method to running
|
|
it for most users; it includes single-command wrappers for all the mainstream
|
|
in-kernel local filesystems.
|
|
@@ -26,21 +31,21 @@ considered out of date), but try not to deviate too much without reason.
|
|
Focus on writing code that reads well and is organized well; code should be
|
|
aesthetically pleasing.
|
|
|
|
-CI:
|
|
-===
|
|
+CI
|
|
+--
|
|
|
|
Instead of running your tests locally, when running the full test suite it's
|
|
prefereable to let a server farm do it in parallel, and then have the results
|
|
in a nice test dashboard (which can tell you which failures are new, and
|
|
presents results in a git log view, avoiding the need for most bisecting).
|
|
|
|
-That exists [2], and community members may request an account. If you work for
|
|
+That exists [2]_, and community members may request an account. If you work for
|
|
a big tech company, you'll need to help out with server costs to get access -
|
|
but the CI is not restricted to running bcachefs tests: it runs any ktest test
|
|
(which generally makes it easy to wrap other tests that can run in qemu).
|
|
|
|
-Other things to think about:
|
|
-============================
|
|
+Other things to think about
|
|
+---------------------------
|
|
|
|
- How will we debug this code? Is there sufficient introspection to diagnose
|
|
when something starts acting wonky on a user machine?
|
|
@@ -79,20 +84,22 @@ Other things to think about:
|
|
tested? (Automated tests exists but aren't in the CI, due to the hassle of
|
|
disk image management; coordinate to have them run.)
|
|
|
|
-Mailing list, IRC:
|
|
-==================
|
|
+Mailing list, IRC
|
|
+-----------------
|
|
|
|
-Patches should hit the list [3], but much discussion and code review happens on
|
|
-IRC as well [4]; many people appreciate the more conversational approach and
|
|
-quicker feedback.
|
|
+Patches should hit the list [3]_, but much discussion and code review happens
|
|
+on IRC as well [4]_; many people appreciate the more conversational approach
|
|
+and quicker feedback.
|
|
|
|
Additionally, we have a lively user community doing excellent QA work, which
|
|
exists primarily on IRC. Please make use of that resource; user feedback is
|
|
important for any nontrivial feature, and documenting it in commit messages
|
|
would be a good idea.
|
|
|
|
-[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
|
|
-[1]: https://evilpiepirate.org/git/ktest.git/
|
|
-[2]: https://evilpiepirate.org/~testdashboard/ci/
|
|
-[3]: linux-bcachefs@vger.kernel.org
|
|
-[4]: irc.oftc.net#bcache, #bcachefs-dev
|
|
+.. rubric:: References
|
|
+
|
|
+.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
|
|
+.. [1] https://evilpiepirate.org/git/ktest.git/
|
|
+.. [2] https://evilpiepirate.org/~testdashboard/ci/
|
|
+.. [3] linux-bcachefs@vger.kernel.org
|
|
+.. [4] irc.oftc.net#bcache, #bcachefs-dev
|
|
diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst
|
|
new file mode 100644
|
|
index 000000000000..871a38f557e8
|
|
--- /dev/null
|
|
+++ b/Documentation/filesystems/bcachefs/casefolding.rst
|
|
@@ -0,0 +1,108 @@
|
|
+.. SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+Casefolding
|
|
+===========
|
|
+
|
|
+bcachefs has support for case-insensitive file and directory
|
|
+lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`)
|
|
+casefolding attributes.
|
|
+
|
|
+The main usecase for casefolding is compatibility with software written
|
|
+against other filesystems that rely on casefolded lookups
|
|
+(eg. NTFS and Wine/Proton).
|
|
+Taking advantage of file-system level casefolding can lead to great
|
|
+loading time gains in many applications and games.
|
|
+
|
|
+Casefolding support requires a kernel with `CONFIG_UNICODE` enabled.
|
|
+Once a directory has been flagged for casefolding, a feature bit
|
|
+is enabled on the superblock which marks the filesystem as using
|
|
+casefolding.
|
|
+When the feature bit for casefolding is enabled, it is no longer possible
|
|
+to mount that filesystem on kernels without `CONFIG_UNICODE` enabled.
|
|
+
|
|
+On the lookup/query side: casefolding is implemented by allocating a new
|
|
+string of `BCH_NAME_MAX` length using the `utf8_casefold` function to
|
|
+casefold the query string.
|
|
+
|
|
+On the dirent side: casefolding is implemented by ensuring the `bkey`'s
|
|
+hash is made from the casefolded string and storing the cached casefolded
|
|
+name with the regular name in the dirent.
|
|
+
|
|
+The structure looks like this:
|
|
+
|
|
+* Regular: [dirent data][regular name][nul][nul]...
|
|
+* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]...
|
|
+
|
|
+(Do note, the number of NULs here is merely for illustration; their count can
|
|
+vary per-key, and they may not even be present if the key is aligned to
|
|
+`sizeof(u64)`.)
|
|
+
|
|
+This is efficient as it means that for all file lookups that require casefolding,
|
|
+it has identical performance to a regular lookup:
|
|
+a hash comparison and a `memcmp` of the name.
|
|
+
|
|
+Rationale
|
|
+---------
|
|
+
|
|
+Several designs were considered for this system:
|
|
+One was to introduce a dirent_v2, however that would be painful especially as
|
|
+the hash system only has support for a single key type. This would also need
|
|
+`BCH_NAME_MAX` to change between versions, and a new feature bit.
|
|
+
|
|
+Another option was to store without the two lengths, and just take the length of
|
|
+the regular name and casefolded name contiguously / 2 as the length. This would
|
|
+assume that the regular length == casefolded length, but that could potentially
|
|
+not be true, if the uppercase unicode glyph had a different UTF-8 encoding than
|
|
+the lowercase unicode glyph.
|
|
+It would be possible to disregard the casefold cache for those cases, but it was
|
|
+decided to simply encode the two string lengths in the key to avoid random
|
|
+performance issues if this edgecase was ever hit.
|
|
+
|
|
+The option settled on was to use a free-bit in d_type to mark a dirent as having
|
|
+a casefold cache, and then treat the first 4 bytes of the name block as lengths.
|
|
+You can see this in the `d_cf_name_block` member of union in `bch_dirent`.
|
|
+
|
|
+The feature bit was used to allow casefolding support to be enabled for the majority
|
|
+of users, but still allow users who have no need for the feature to use bcachefs, as
|
|
+`CONFIG_UNICODE` can increase the kernel size a significant amount due to the tables used,
|
|
+which may be a deciding factor when using bcachefs for eg. embedded platforms.
|
|
+
|
|
+Other filesystems like ext4 and f2fs have a super-block level option for casefolding
|
|
+encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose
|
|
+any encodings other than a single UTF-8 version. When future encodings are desirable,
|
|
+they will be added trivially using the opts mechanism.
|
|
+
|
|
+dentry/dcache considerations
|
|
+----------------------------
|
|
+
|
|
+Currently, in casefolded directories, bcachefs (like other filesystems) will not cache
|
|
+negative dentries.
|
|
+
|
|
+This is because currently doing so presents a problem in the following scenario:
|
|
+
|
|
+ - Lookup file "blAH" in a casefolded directory
|
|
+ - Creation of file "BLAH" in a casefolded directory
|
|
+ - Lookup file "blAH" in a casefolded directory
|
|
+
|
|
+This would fail if negative dentries were cached.
|
|
+
|
|
+This is slightly suboptimal, but could be fixed in future with some vfs work.
|
|
+
|
|
+
|
|
+References
|
|
+----------
|
|
+
|
|
+(from Peter Anvin, on the list)
|
|
+
|
|
+It is worth noting that Microsoft has basically declared their
|
|
+"recommended" case folding (upcase) table to be permanently frozen (for
|
|
+new filesystem instances in the case where they use an on-disk
|
|
+translation table created at format time.) As far as I know they have
|
|
+never supported anything other than 1:1 conversion of BMP code points,
|
|
+nor normalization.
|
|
+
|
|
+The exFAT specification enumerates the full recommended upcase table,
|
|
+although in a somewhat annoying format (basically a hex dump of
|
|
+compressed data):
|
|
+
|
|
+https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification
|
|
diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst
|
|
new file mode 100644
|
|
index 000000000000..59a332509dcd
|
|
--- /dev/null
|
|
+++ b/Documentation/filesystems/bcachefs/future/idle_work.rst
|
|
@@ -0,0 +1,78 @@
|
|
+Idle/background work classes design doc:
|
|
+
|
|
+Right now, our behaviour at idle isn't ideal, it was designed for servers that
|
|
+would be under sustained load, to keep pending work at a "medium" level, to
|
|
+let work build up so we can process it in more efficient batches, while also
|
|
+giving headroom for bursts in load.
|
|
+
|
|
+But for desktops or mobile - scenarios where work is less sustained and power
|
|
+usage is more important - we want to operate differently, with a "rush to
|
|
+idle" so the system can go to sleep. We don't want to be dribbling out
|
|
+background work while the system should be idle.
|
|
+
|
|
+The complicating factor is that there are a number of background tasks, which
|
|
+form a hierarchy (or a digraph, depending on how you divide it up) - one
|
|
+background task may generate work for another.
|
|
+
|
|
+Thus proper idle detection needs to model this hierarchy.
|
|
+
|
|
+- Foreground writes
|
|
+- Page cache writeback
|
|
+- Copygc, rebalance
|
|
+- Journal reclaim
|
|
+
|
|
+When we implement idle detection and rush to idle, we need to be careful not
|
|
+to disturb too much the existing behaviour that works reasonably well when the
|
|
+system is under sustained load (or perhaps improve it in the case of
|
|
+rebalance, which currently does not actively attempt to let work batch up).
|
|
+
|
|
+SUSTAINED LOAD REGIME
|
|
+---------------------
|
|
+
|
|
+When the system is under continuous load, we want these jobs to run
|
|
+continuously - this is perhaps best modelled with a P/D controller, where
|
|
+they'll be trying to keep a target value (i.e. fragmented disk space,
|
|
+available journal space) roughly in the middle of some range.
|
|
+
|
|
+The goal under sustained load is to balance our ability to handle load spikes
|
|
+without running out of x resource (free disk space, free space in the
|
|
+journal), while also letting some work accumulate to be batched (or become
|
|
+unnecessary).
|
|
+
|
|
+For example, we don't want to run copygc too aggressively, because then it
|
|
+will be evacuating buckets that would have become empty (been overwritten or
|
|
+deleted) anyways, and we don't want to wait until we're almost out of free
|
|
+space because then the system will behave unpredictably - suddenly we're doing
|
|
+a lot more work to service each write and the system becomes much slower.
|
|
+
|
|
+IDLE REGIME
|
|
+-----------
|
|
+
|
|
+When the system becomes idle, we should start flushing our pending work
|
|
+quicker so the system can go to sleep.
|
|
+
|
|
+Note that the definition of "idle" depends on where in the hierarchy a task
|
|
+is - a task should start flushing work more quickly when the task above it has
|
|
+stopped generating new work.
|
|
+
|
|
+e.g. rebalance should start flushing more quickly when page cache writeback is
|
|
+idle, and journal reclaim should only start flushing more quickly when both
|
|
+copygc and rebalance are idle.
|
|
+
|
|
+It's important to let work accumulate when more work is still incoming and we
|
|
+still have room, because flushing is always more efficient if we let it batch
|
|
+up. New writes may overwrite data before rebalance moves it, and tasks may be
|
|
+generating more updates for the btree nodes that journal reclaim needs to flush.
|
|
+
|
|
+On idle, how much work we do at each interval should be proportional to the
|
|
+length of time we have been idle for. If we're idle only for a short duration,
|
|
+we shouldn't flush everything right away; the system might wake up and start
|
|
+generating new work soon, and flushing immediately might end up doing a lot of
|
|
+work that would have been unnecessary if we'd allowed things to batch more.
|
|
+
|
|
+To summarize, we will need:
|
|
+
|
|
+ - A list of classes for background tasks that generate work, which will
|
|
+ include one "foreground" class.
|
|
+ - Tracking for each class - "Am I doing work, or have I gone to sleep?"
|
|
+ - And each class should check the class above it when deciding how much work to issue.
|
|
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst
|
|
index 7db4d7ceab58..e5c4c2120b93 100644
|
|
--- a/Documentation/filesystems/bcachefs/index.rst
|
|
+++ b/Documentation/filesystems/bcachefs/index.rst
|
|
@@ -4,10 +4,35 @@
|
|
bcachefs Documentation
|
|
======================
|
|
|
|
+Subsystem-specific development process notes
|
|
+--------------------------------------------
|
|
+
|
|
+Development notes specific to bcachefs. These are intended to supplement
|
|
+:doc:`general kernel development handbook </process/index>`.
|
|
+
|
|
.. toctree::
|
|
- :maxdepth: 2
|
|
+ :maxdepth: 1
|
|
:numbered:
|
|
|
|
CodingStyle
|
|
SubmittingPatches
|
|
+
|
|
+Filesystem implementation
|
|
+-------------------------
|
|
+
|
|
+Documentation for filesystem features and their implementation details.
|
|
+At this moment, only a few of these are described here.
|
|
+
|
|
+.. toctree::
|
|
+ :maxdepth: 1
|
|
+ :numbered:
|
|
+
|
|
+ casefolding
|
|
errorcodes
|
|
+
|
|
+Future design
|
|
+-------------
|
|
+.. toctree::
|
|
+ :maxdepth: 1
|
|
+
|
|
+ future/idle_work
|
|
diff --git a/MAINTAINERS b/MAINTAINERS
|
|
index 00e94bec401e..82d236baff32 100644
|
|
--- a/MAINTAINERS
|
|
+++ b/MAINTAINERS
|
|
@@ -6404,6 +6404,13 @@ F: net/ax25/ax25_out.c
|
|
F: net/ax25/ax25_timer.c
|
|
F: net/ax25/sysctl_net_ax25.c
|
|
|
|
+DARRAY
|
|
+M: Kent Overstreet <kent.overstreet@linux.dev>
|
|
+L: linux-bcachefs@vger.kernel.org
|
|
+S: Maintained
|
|
+F: include/linux/darray.h
|
|
+F: include/linux/darray_types.h
|
|
+
|
|
DATA ACCESS MONITOR
|
|
M: SeongJae Park <sj@kernel.org>
|
|
L: damon@lists.linux.dev
|
|
diff --git a/block/bdev.c b/block/bdev.c
|
|
index 5aebcf437f17..d909d87f857c 100644
|
|
--- a/block/bdev.c
|
|
+++ b/block/bdev.c
|
|
@@ -178,6 +178,8 @@ EXPORT_SYMBOL(set_blocksize);
|
|
|
|
int sb_set_blocksize(struct super_block *sb, int size)
|
|
{
|
|
+ if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE)
|
|
+ return 0;
|
|
if (set_blocksize(sb->s_bdev_file, size))
|
|
return 0;
|
|
/* If we get here, we know size is power of two
|
|
diff --git a/block/blk-core.c b/block/blk-core.c
|
|
index d6c4fa3943b5..7b1103eb877d 100644
|
|
--- a/block/blk-core.c
|
|
+++ b/block/blk-core.c
|
|
@@ -793,20 +793,21 @@ void submit_bio_noacct(struct bio *bio)
|
|
goto end_io;
|
|
}
|
|
|
|
+ if (WARN_ON_ONCE((bio->bi_opf & REQ_PREFLUSH) &&
|
|
+ bio_op(bio) != REQ_OP_WRITE &&
|
|
+ bio_op(bio) != REQ_OP_ZONE_APPEND))
|
|
+ goto end_io;
|
|
+
|
|
/*
|
|
* Filter flush bio's early so that bio based drivers without flush
|
|
* support don't have to worry about them.
|
|
*/
|
|
- if (op_is_flush(bio->bi_opf)) {
|
|
- if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
|
|
- bio_op(bio) != REQ_OP_ZONE_APPEND))
|
|
+ if (op_is_flush(bio->bi_opf) &&
|
|
+ !bdev_write_cache(bdev)) {
|
|
+ bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
|
|
+ if (!bio_sectors(bio)) {
|
|
+ status = BLK_STS_OK;
|
|
goto end_io;
|
|
- if (!bdev_write_cache(bdev)) {
|
|
- bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
|
|
- if (!bio_sectors(bio)) {
|
|
- status = BLK_STS_OK;
|
|
- goto end_io;
|
|
- }
|
|
}
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
|
|
index fc7efd0a7525..ca165415b1d8 100644
|
|
--- a/fs/bcachefs/Kconfig
|
|
+++ b/fs/bcachefs/Kconfig
|
|
@@ -15,10 +15,9 @@ config BCACHEFS_FS
|
|
select ZLIB_INFLATE
|
|
select ZSTD_COMPRESS
|
|
select ZSTD_DECOMPRESS
|
|
- select CRYPTO
|
|
- select CRYPTO_SHA256
|
|
- select CRYPTO_CHACHA20
|
|
- select CRYPTO_POLY1305
|
|
+ select CRYPTO_LIB_SHA256
|
|
+ select CRYPTO_LIB_CHACHA
|
|
+ select CRYPTO_LIB_POLY1305
|
|
select KEYS
|
|
select RAID6_PQ
|
|
select XOR_BLOCKS
|
|
@@ -26,6 +25,7 @@ config BCACHEFS_FS
|
|
select SRCU
|
|
select SYMBOLIC_ERRNAME
|
|
select MIN_HEAP
|
|
+ select XARRAY_MULTI
|
|
help
|
|
The bcachefs filesystem - a modern, copy on write filesystem, with
|
|
support for multiple devices, compression, checksumming, etc.
|
|
@@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS
|
|
Enable extra tracepoints for debugging btree_path operations; we don't
|
|
normally want these enabled because they happen at very high rates.
|
|
|
|
+config BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ bool "Trace bch2_trans_kmalloc() calls"
|
|
+ depends on BCACHEFS_FS
|
|
+
|
|
+config BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ bool "Keep async objects on fast_lists for debugfs visibility"
|
|
+ depends on BCACHEFS_FS && DEBUG_FS
|
|
+
|
|
config MEAN_AND_VARIANCE_UNIT_TEST
|
|
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
|
|
depends on KUNIT
|
|
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
|
|
index d2689388d5e8..d71621711cfa 100644
|
|
--- a/fs/bcachefs/Makefile
|
|
+++ b/fs/bcachefs/Makefile
|
|
@@ -28,20 +28,20 @@ bcachefs-y := \
|
|
checksum.o \
|
|
clock.o \
|
|
compress.o \
|
|
- darray.o \
|
|
data_update.o \
|
|
debug.o \
|
|
dirent.o \
|
|
disk_accounting.o \
|
|
disk_groups.o \
|
|
ec.o \
|
|
+ enumerated_ref.o \
|
|
errcode.o \
|
|
error.o \
|
|
extents.o \
|
|
extent_update.o \
|
|
eytzinger.o \
|
|
+ fast_list.o \
|
|
fs.o \
|
|
- fs-common.o \
|
|
fs-ioctl.o \
|
|
fs-io.o \
|
|
fs-io-buffered.o \
|
|
@@ -64,9 +64,11 @@ bcachefs-y := \
|
|
migrate.o \
|
|
move.o \
|
|
movinggc.o \
|
|
+ namei.o \
|
|
nocow_locking.o \
|
|
opts.o \
|
|
printbuf.o \
|
|
+ progress.o \
|
|
quota.o \
|
|
rebalance.o \
|
|
rcu_pending.o \
|
|
@@ -96,6 +98,8 @@ bcachefs-y := \
|
|
varint.o \
|
|
xattr.o
|
|
|
|
+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
|
|
+
|
|
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
|
|
|
|
# Silence "note: xyz changed in GCC X.X" messages
|
|
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
|
|
index 99487727ae64..d03adc36100e 100644
|
|
--- a/fs/bcachefs/acl.c
|
|
+++ b/fs/bcachefs/acl.c
|
|
@@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
|
|
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
|
|
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct posix_acl *acl = NULL;
|
|
|
|
if (rcu)
|
|
@@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
|
|
{
|
|
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
|
|
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter inode_iter = {};
|
|
struct bch_inode_unpacked inode_u;
|
|
struct posix_acl *acl;
|
|
umode_t mode;
|
|
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
|
|
index 3ea809990ef1..a38b9c6c891e 100644
|
|
--- a/fs/bcachefs/alloc_background.c
|
|
+++ b/fs/bcachefs/alloc_background.c
|
|
@@ -17,6 +17,7 @@
|
|
#include "debug.h"
|
|
#include "disk_accounting.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "lru.h"
|
|
#include "recovery.h"
|
|
@@ -232,7 +233,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
int ret = 0;
|
|
|
|
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
|
|
- c, alloc_v2_unpack_error,
|
|
+ c, alloc_v3_unpack_error,
|
|
"unpack error");
|
|
fsck_err:
|
|
return ret;
|
|
@@ -308,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
"data type inconsistency");
|
|
|
|
bkey_fsck_err_on(!a.io_time[READ] &&
|
|
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
|
|
+ !(c->recovery.passes_to_run &
|
|
+ BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
|
|
c, alloc_key_cached_but_read_time_zero,
|
|
"cached bucket with read_time == 0");
|
|
break;
|
|
@@ -478,12 +480,27 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
|
|
enum btree_iter_update_trigger_flags flags)
|
|
{
|
|
struct btree_iter iter;
|
|
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
|
|
- int ret = PTR_ERR_OR_ZERO(a);
|
|
- if (ret)
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
|
|
+ BTREE_ITER_with_updates|
|
|
+ BTREE_ITER_cached|
|
|
+ BTREE_ITER_intent);
|
|
+ int ret = bkey_err(k);
|
|
+ if (unlikely(ret))
|
|
return ERR_PTR(ret);
|
|
|
|
- ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
|
|
+ if ((void *) k.v >= trans->mem &&
|
|
+ (void *) k.v < trans->mem + trans->mem_top) {
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
|
|
+ }
|
|
+
|
|
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
|
|
+ if (IS_ERR(a)) {
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return a;
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return unlikely(ret) ? ERR_PTR(ret) : a;
|
|
}
|
|
@@ -589,6 +606,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
|
|
|
|
int bch2_alloc_read(struct bch_fs *c)
|
|
{
|
|
+ down_read(&c->state_lock);
|
|
+
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
struct bch_dev *ca = NULL;
|
|
int ret;
|
|
@@ -608,7 +627,7 @@ int bch2_alloc_read(struct bch_fs *c)
|
|
* bch2_check_alloc_key() which runs later:
|
|
*/
|
|
if (!ca) {
|
|
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
|
|
continue;
|
|
}
|
|
|
|
@@ -629,17 +648,17 @@ int bch2_alloc_read(struct bch_fs *c)
|
|
* bch2_check_alloc_key() which runs later:
|
|
*/
|
|
if (!ca) {
|
|
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
|
|
continue;
|
|
}
|
|
|
|
if (k.k->p.offset < ca->mi.first_bucket) {
|
|
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
|
|
continue;
|
|
}
|
|
|
|
if (k.k->p.offset >= ca->mi.nbuckets) {
|
|
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
|
|
continue;
|
|
}
|
|
|
|
@@ -652,6 +671,7 @@ int bch2_alloc_read(struct bch_fs *c)
|
|
bch2_dev_put(ca);
|
|
bch2_trans_put(trans);
|
|
|
|
+ up_read(&c->state_lock);
|
|
bch_err_fn(c, ret);
|
|
return ret;
|
|
}
|
|
@@ -673,8 +693,7 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans,
|
|
bch2_bkey_val_to_text(&buf, c, alloc_k);
|
|
|
|
int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
|
|
- "bucket incorrectly %sset in %s btree\n"
|
|
- " %s",
|
|
+ "bucket incorrectly %sset in %s btree\n%s",
|
|
set ? "" : "un",
|
|
bch2_btree_id_str(btree),
|
|
buf.buf);
|
|
@@ -777,14 +796,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s
|
|
s64 delta_sectors,
|
|
s64 delta_fragmented, unsigned flags)
|
|
{
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
|
|
- .dev_data_type.dev = ca->dev_idx,
|
|
- .dev_data_type.data_type = data_type,
|
|
- };
|
|
s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
|
|
|
|
- return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
|
|
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
|
|
+ d, dev_data_type,
|
|
+ .dev = ca->dev_idx,
|
|
+ .data_type = data_type);
|
|
}
|
|
|
|
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
|
|
@@ -837,7 +854,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
|
|
|
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
|
|
if (!ca)
|
|
- return -EIO;
|
|
+ return -BCH_ERR_trigger_alloc;
|
|
|
|
struct bch_alloc_v4 old_a_convert;
|
|
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
|
|
@@ -871,6 +888,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
|
if (data_type_is_empty(new_a->data_type) &&
|
|
BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
|
|
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
|
|
+ if (new_a->oldest_gen == new_a->gen &&
|
|
+ !bch2_bucket_sectors_total(*new_a))
|
|
+ new_a->oldest_gen++;
|
|
new_a->gen++;
|
|
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
|
|
alloc_data_type_set(new_a, new_a->data_type);
|
|
@@ -889,26 +909,20 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
|
!new_a->io_time[READ])
|
|
new_a->io_time[READ] = bch2_current_io_time(c, READ);
|
|
|
|
- u64 old_lru = alloc_lru_idx_read(*old_a);
|
|
- u64 new_lru = alloc_lru_idx_read(*new_a);
|
|
- if (old_lru != new_lru) {
|
|
- ret = bch2_lru_change(trans, new.k->p.inode,
|
|
- bucket_to_u64(new.k->p),
|
|
- old_lru, new_lru);
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
+ ret = bch2_lru_change(trans, new.k->p.inode,
|
|
+ bucket_to_u64(new.k->p),
|
|
+ alloc_lru_idx_read(*old_a),
|
|
+ alloc_lru_idx_read(*new_a));
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
|
|
- new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
|
|
- if (old_lru != new_lru) {
|
|
- ret = bch2_lru_change(trans,
|
|
- BCH_LRU_FRAGMENTATION_START,
|
|
- bucket_to_u64(new.k->p),
|
|
- old_lru, new_lru);
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
+ ret = bch2_lru_change(trans,
|
|
+ BCH_LRU_BUCKET_FRAGMENTATION,
|
|
+ bucket_to_u64(new.k->p),
|
|
+ alloc_lru_idx_fragmentation(*old_a, ca),
|
|
+ alloc_lru_idx_fragmentation(*new_a, ca));
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
if (old_a->gen != new_a->gen) {
|
|
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
|
|
@@ -916,15 +930,6 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
|
|
- old_a->cached_sectors) {
|
|
- ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
|
|
- -((s64) old_a->cached_sectors),
|
|
- flags & BTREE_TRIGGER_gc);
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
-
|
|
ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1032,9 +1037,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
|
bch2_dev_put(ca);
|
|
return ret;
|
|
invalid_bucket:
|
|
- bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
|
|
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
|
|
- ret = -EIO;
|
|
+ ret = -BCH_ERR_trigger_alloc;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1042,9 +1047,10 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
|
* This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
|
|
* extents style btrees, but works on non-extents btrees:
|
|
*/
|
|
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
|
|
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos end, struct bkey *hole)
|
|
{
|
|
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
|
|
|
|
if (bkey_err(k))
|
|
return k;
|
|
@@ -1055,9 +1061,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
|
|
struct btree_iter iter2;
|
|
struct bpos next;
|
|
|
|
- bch2_trans_copy_iter(&iter2, iter);
|
|
+ bch2_trans_copy_iter(trans, &iter2, iter);
|
|
|
|
- struct btree_path *path = btree_iter_path(iter->trans, iter);
|
|
+ struct btree_path *path = btree_iter_path(trans, iter);
|
|
if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
|
|
end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
|
|
|
|
@@ -1067,9 +1073,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
|
|
* btree node min/max is a closed interval, upto takes a half
|
|
* open interval:
|
|
*/
|
|
- k = bch2_btree_iter_peek_max(&iter2, end);
|
|
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
|
|
next = iter2.pos;
|
|
- bch2_trans_iter_exit(iter->trans, &iter2);
|
|
+ bch2_trans_iter_exit(trans, &iter2);
|
|
|
|
BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
|
|
|
|
@@ -1110,13 +1116,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
|
|
return *ca != NULL;
|
|
}
|
|
|
|
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
|
|
- struct bch_dev **ca, struct bkey *hole)
|
|
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bch_dev **ca, struct bkey *hole)
|
|
{
|
|
- struct bch_fs *c = iter->trans->c;
|
|
+ struct bch_fs *c = trans->c;
|
|
struct bkey_s_c k;
|
|
again:
|
|
- k = bch2_get_key_or_hole(iter, POS_MAX, hole);
|
|
+ k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
|
|
if (bkey_err(k))
|
|
return k;
|
|
|
|
@@ -1129,7 +1136,7 @@ static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
|
|
if (!next_bucket(c, ca, &hole_start))
|
|
return bkey_s_c_null;
|
|
|
|
- bch2_btree_iter_set_pos(iter, hole_start);
|
|
+ bch2_btree_iter_set_pos(trans, iter, hole_start);
|
|
goto again;
|
|
}
|
|
|
|
@@ -1170,8 +1177,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
|
|
|
|
a = bch2_alloc_to_v4(alloc_k, &a_convert);
|
|
|
|
- bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
|
|
- k = bch2_btree_iter_peek_slot(discard_iter);
|
|
+ bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
|
|
+ k = bch2_btree_iter_peek_slot(trans, discard_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1184,8 +1191,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
|
|
- k = bch2_btree_iter_peek_slot(freespace_iter);
|
|
+ bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
|
|
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1198,16 +1205,15 @@ int bch2_check_alloc_key(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
|
|
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
|
|
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
|
|
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
|
|
trans, bucket_gens_key_wrong,
|
|
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
|
|
- " %s",
|
|
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
|
|
alloc_gen(k, gens_offset), a->gen,
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
|
|
@@ -1253,9 +1259,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
|
|
if (!ca->mi.freespace_initialized)
|
|
return 0;
|
|
|
|
- bch2_btree_iter_set_pos(freespace_iter, start);
|
|
+ bch2_btree_iter_set_pos(trans, freespace_iter, start);
|
|
|
|
- k = bch2_btree_iter_peek_slot(freespace_iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1265,7 +1271,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
|
|
if (fsck_err_on(k.k->type != KEY_TYPE_set,
|
|
trans, freespace_hole_missing,
|
|
"hole in alloc btree missing in freespace btree\n"
|
|
- " device %llu buckets %llu-%llu",
|
|
+ "device %llu buckets %llu-%llu",
|
|
freespace_iter->pos.inode,
|
|
freespace_iter->pos.offset,
|
|
end->offset)) {
|
|
@@ -1304,9 +1310,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
|
|
unsigned i, gens_offset, gens_end_offset;
|
|
int ret;
|
|
|
|
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
|
|
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
|
|
|
|
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1383,7 +1389,7 @@ static void check_discard_freespace_key_work(struct work_struct *work)
|
|
container_of(work, struct check_discard_freespace_key_async, work);
|
|
|
|
bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
|
|
- bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
|
|
+ enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
|
|
kfree(w);
|
|
}
|
|
|
|
@@ -1424,7 +1430,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
|
|
(state == BCH_DATA_free &&
|
|
genbits != alloc_freespace_genbits(*a))) {
|
|
if (fsck_err(trans, need_discard_freespace_key_bad,
|
|
- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
|
|
+ "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
|
|
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
|
|
bch2_btree_id_str(iter->btree_id),
|
|
iter->pos.inode,
|
|
@@ -1439,7 +1445,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
|
|
*gen = a->gen;
|
|
out:
|
|
fsck_err:
|
|
- bch2_set_btree_iter_dontneed(&alloc_iter);
|
|
+ bch2_set_btree_iter_dontneed(trans, &alloc_iter);
|
|
bch2_trans_iter_exit(trans, &alloc_iter);
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
@@ -1460,7 +1466,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
|
|
if (!w)
|
|
goto out;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
|
|
kfree(w);
|
|
goto out;
|
|
}
|
|
@@ -1505,7 +1511,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
|
|
struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
|
|
if (!ca) {
|
|
if (fsck_err(trans, bucket_gens_to_invalid_dev,
|
|
- "bucket_gens key for invalid device:\n %s",
|
|
+ "bucket_gens key for invalid device:\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
|
|
ret = bch2_btree_delete_at(trans, iter, 0);
|
|
goto out;
|
|
@@ -1514,7 +1520,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
|
|
if (fsck_err_on(end <= ca->mi.first_bucket ||
|
|
start >= ca->mi.nbuckets,
|
|
trans, bucket_gens_to_invalid_buckets,
|
|
- "bucket_gens key for invalid buckets:\n %s",
|
|
+ "bucket_gens key for invalid buckets:\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
ret = bch2_btree_delete_at(trans, iter, 0);
|
|
goto out;
|
|
@@ -1576,7 +1582,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
- k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
|
|
+ k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto bkey_err;
|
|
@@ -1614,7 +1620,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
|
|
if (ret)
|
|
goto bkey_err;
|
|
|
|
- bch2_btree_iter_set_pos(&iter, next);
|
|
+ bch2_btree_iter_set_pos(trans, &iter, next);
|
|
bkey_err:
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
continue;
|
|
@@ -1642,7 +1648,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
|
|
BTREE_ITER_prefetch);
|
|
while (1) {
|
|
bch2_trans_begin(trans);
|
|
- k = bch2_btree_iter_peek(&iter);
|
|
+ k = bch2_btree_iter_peek(trans, &iter);
|
|
if (!k.k)
|
|
break;
|
|
|
|
@@ -1661,7 +1667,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
|
|
break;
|
|
}
|
|
|
|
- bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
|
|
}
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
if (ret)
|
|
@@ -1689,7 +1695,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
|
|
struct printbuf buf = PRINTBUF;
|
|
int ret;
|
|
|
|
- alloc_k = bch2_btree_iter_peek(alloc_iter);
|
|
+ alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
|
|
if (!alloc_k.k)
|
|
return 0;
|
|
|
|
@@ -1705,7 +1711,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
|
|
|
|
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
|
|
if (lru_idx) {
|
|
- ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
|
|
+ ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
|
|
+ bucket_to_u64(alloc_k.k->p),
|
|
lru_idx, alloc_k, last_flushed);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1716,8 +1723,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
|
|
|
|
if (fsck_err_on(!a->io_time[READ],
|
|
trans, alloc_key_cached_but_read_time_zero,
|
|
- "cached bucket with read_time 0\n"
|
|
- " %s",
|
|
+ "cached bucket with read_time 0\n%s",
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
|
|
struct bkey_i_alloc_v4 *a_mut =
|
|
@@ -1735,7 +1741,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
|
|
a = &a_mut->v;
|
|
}
|
|
|
|
- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
|
|
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
|
|
+ bucket_to_u64(alloc_k.k->p),
|
|
+ a->io_time[READ],
|
|
alloc_k, last_flushed);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1757,7 +1765,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
|
|
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
|
|
POS_MIN, BTREE_ITER_prefetch, k,
|
|
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
|
|
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
|
|
+ bch2_check_stripe_to_lru_refs(c);
|
|
|
|
bch2_bkey_buf_exit(&last_flushed, c);
|
|
bch_err_fn(c, ret);
|
|
@@ -1814,7 +1823,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct bpos pos = need_discard_iter->pos;
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct bkey_s_c k;
|
|
struct bkey_i_alloc_v4 *a;
|
|
struct printbuf buf = PRINTBUF;
|
|
@@ -1868,7 +1877,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
|
|
s->discarded++;
|
|
*discard_pos_done = iter.pos;
|
|
|
|
- if (ca->mi.discard && !c->opts.nochanges) {
|
|
+ if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
|
|
/*
|
|
* This works without any other locks because this is the only
|
|
* thread that removes items from the need_discard tree
|
|
@@ -1897,7 +1906,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
|
|
if (ret)
|
|
goto out;
|
|
|
|
- count_event(c, bucket_discard);
|
|
+ if (!fastpath)
|
|
+ count_event(c, bucket_discard);
|
|
+ else
|
|
+ count_event(c, bucket_discard_fast);
|
|
out:
|
|
fsck_err:
|
|
if (discard_locked)
|
|
@@ -1935,26 +1947,26 @@ static void bch2_do_discards_work(struct work_struct *work)
|
|
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
|
|
bch2_err_str(ret));
|
|
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
|
|
}
|
|
|
|
void bch2_dev_do_discards(struct bch_dev *ca)
|
|
{
|
|
struct bch_fs *c = ca->fs;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
|
|
return;
|
|
|
|
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
|
|
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
|
|
goto put_write_ref;
|
|
|
|
if (queue_work(c->write_ref_wq, &ca->discard_work))
|
|
return;
|
|
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
|
|
put_write_ref:
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
|
|
}
|
|
|
|
void bch2_do_discards(struct bch_fs *c)
|
|
@@ -2030,8 +2042,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
|
|
trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
|
|
|
|
bch2_trans_put(trans);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
|
|
}
|
|
|
|
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
|
|
@@ -2041,30 +2053,88 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
|
|
if (discard_in_flight_add(ca, bucket, false))
|
|
return;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
|
|
return;
|
|
|
|
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
|
|
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
|
|
goto put_ref;
|
|
|
|
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
|
|
return;
|
|
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
|
|
put_ref:
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
|
|
+}
|
|
+
|
|
+static int invalidate_one_bp(struct btree_trans *trans,
|
|
+ struct bch_dev *ca,
|
|
+ struct bkey_s_c_backpointer bp,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ struct btree_iter extent_iter;
|
|
+ struct bkey_s_c extent_k =
|
|
+ bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
|
|
+ int ret = bkey_err(extent_k);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (!extent_k.k)
|
|
+ return 0;
|
|
+
|
|
+ struct bkey_i *n =
|
|
+ bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
|
|
+ BTREE_UPDATE_internal_snapshot_node);
|
|
+ ret = PTR_ERR_OR_ZERO(n);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, &extent_iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
|
|
+ struct bch_dev *ca,
|
|
+ struct bpos bucket,
|
|
+ u8 gen,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
|
|
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
|
|
+
|
|
+ return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
|
|
+ bp_start, bp_end, 0, k,
|
|
+ NULL, NULL,
|
|
+ BCH_WATERMARK_btree|
|
|
+ BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ if (k.k->type != KEY_TYPE_backpointer)
|
|
+ continue;
|
|
+
|
|
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
|
|
+
|
|
+ if (bp.v->bucket_gen != gen)
|
|
+ continue;
|
|
+
|
|
+ /* filter out bps with gens that don't match */
|
|
+
|
|
+ invalidate_one_bp(trans, ca, bp, last_flushed);
|
|
+ }));
|
|
}
|
|
|
|
+noinline_for_stack
|
|
static int invalidate_one_bucket(struct btree_trans *trans,
|
|
+ struct bch_dev *ca,
|
|
struct btree_iter *lru_iter,
|
|
struct bkey_s_c lru_k,
|
|
+ struct bkey_buf *last_flushed,
|
|
s64 *nr_to_invalidate)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct bkey_i_alloc_v4 *a = NULL;
|
|
struct printbuf buf = PRINTBUF;
|
|
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
|
|
- unsigned cached_sectors;
|
|
+ struct btree_iter alloc_iter = {};
|
|
int ret = 0;
|
|
|
|
if (*nr_to_invalidate <= 0)
|
|
@@ -2081,35 +2151,40 @@ static int invalidate_one_bucket(struct btree_trans *trans,
|
|
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
|
|
return 0;
|
|
|
|
- a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
|
|
- ret = PTR_ERR_OR_ZERO(a);
|
|
+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
|
|
+ BTREE_ID_alloc, bucket,
|
|
+ BTREE_ITER_cached);
|
|
+ ret = bkey_err(alloc_k);
|
|
if (ret)
|
|
- goto out;
|
|
+ return ret;
|
|
+
|
|
+ struct bch_alloc_v4 a_convert;
|
|
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
|
|
|
|
/* We expect harmless races here due to the btree write buffer: */
|
|
- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
|
|
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
|
|
goto out;
|
|
|
|
- BUG_ON(a->v.data_type != BCH_DATA_cached);
|
|
- BUG_ON(a->v.dirty_sectors);
|
|
-
|
|
- if (!a->v.cached_sectors)
|
|
- bch_err(c, "invalidating empty bucket, confused");
|
|
+ /*
|
|
+ * Impossible since alloc_lru_idx_read() only returns nonzero if the
|
|
+ * bucket is supposed to be on the cached bucket LRU (i.e.
|
|
+ * BCH_DATA_cached)
|
|
+ *
|
|
+ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
|
|
+ */
|
|
+ BUG_ON(a->data_type != BCH_DATA_cached);
|
|
+ BUG_ON(a->dirty_sectors);
|
|
|
|
- cached_sectors = a->v.cached_sectors;
|
|
+ if (!a->cached_sectors) {
|
|
+ bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
|
|
+ true, last_flushed);
|
|
+ goto out;
|
|
+ }
|
|
|
|
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
|
|
- a->v.gen++;
|
|
- a->v.data_type = 0;
|
|
- a->v.dirty_sectors = 0;
|
|
- a->v.stripe_sectors = 0;
|
|
- a->v.cached_sectors = 0;
|
|
- a->v.io_time[READ] = bch2_current_io_time(c, READ);
|
|
- a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
|
|
+ unsigned cached_sectors = a->cached_sectors;
|
|
+ u8 gen = a->gen;
|
|
|
|
- ret = bch2_trans_commit(trans, NULL, NULL,
|
|
- BCH_WATERMARK_btree|
|
|
- BCH_TRANS_COMMIT_no_enospc);
|
|
+ ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
|
|
if (ret)
|
|
goto out;
|
|
|
|
@@ -2117,6 +2192,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
|
|
--*nr_to_invalidate;
|
|
out:
|
|
fsck_err:
|
|
+ bch2_trans_iter_exit(trans, &alloc_iter);
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
}
|
|
@@ -2126,9 +2202,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
|
|
{
|
|
struct bkey_s_c k;
|
|
again:
|
|
- k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
|
|
+ k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
|
|
if (!k.k && !*wrapped) {
|
|
- bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
|
|
+ bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
|
|
*wrapped = true;
|
|
goto again;
|
|
}
|
|
@@ -2143,6 +2219,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
int ret = 0;
|
|
|
|
+ struct bkey_buf last_flushed;
|
|
+ bch2_bkey_buf_init(&last_flushed);
|
|
+ bkey_init(&last_flushed.k->k);
|
|
+
|
|
ret = bch2_btree_write_buffer_tryflush(trans);
|
|
if (ret)
|
|
goto err;
|
|
@@ -2167,38 +2247,39 @@ static void bch2_do_invalidates_work(struct work_struct *work)
|
|
if (!k.k)
|
|
break;
|
|
|
|
- ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
|
|
+ ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
|
|
restart_err:
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
continue;
|
|
if (ret)
|
|
break;
|
|
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
}
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
err:
|
|
bch2_trans_put(trans);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
|
|
+ bch2_bkey_buf_exit(&last_flushed, c);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
|
|
}
|
|
|
|
void bch2_dev_do_invalidates(struct bch_dev *ca)
|
|
{
|
|
struct bch_fs *c = ca->fs;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
|
|
return;
|
|
|
|
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
|
|
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
|
|
goto put_ref;
|
|
|
|
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
|
|
return;
|
|
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
|
|
put_ref:
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
|
|
}
|
|
|
|
void bch2_do_invalidates(struct bch_fs *c)
|
|
@@ -2243,7 +2324,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
|
|
break;
|
|
}
|
|
|
|
- k = bch2_get_key_or_hole(&iter, end, &hole);
|
|
+ k = bch2_get_key_or_hole(trans, &iter, end, &hole);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto bkey_err;
|
|
@@ -2262,7 +2343,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
|
|
if (ret)
|
|
goto bkey_err;
|
|
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
} else {
|
|
struct bkey_i *freespace;
|
|
|
|
@@ -2282,7 +2363,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
|
|
if (ret)
|
|
goto bkey_err;
|
|
|
|
- bch2_btree_iter_set_pos(&iter, k.k->p);
|
|
+ bch2_btree_iter_set_pos(trans, &iter, k.k->p);
|
|
}
|
|
bkey_err:
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
@@ -2309,14 +2390,16 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
|
|
|
|
int bch2_fs_freespace_init(struct bch_fs *c)
|
|
{
|
|
- int ret = 0;
|
|
- bool doing_init = false;
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
|
|
+ return 0;
|
|
+
|
|
|
|
/*
|
|
* We can crash during the device add path, so we need to check this on
|
|
* every mount:
|
|
*/
|
|
|
|
+ bool doing_init = false;
|
|
for_each_member_device(c, ca) {
|
|
if (ca->mi.freespace_initialized)
|
|
continue;
|
|
@@ -2326,7 +2409,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
|
|
doing_init = true;
|
|
}
|
|
|
|
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
|
|
+ int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
|
|
if (ret) {
|
|
bch2_dev_put(ca);
|
|
bch_err_fn(c, ret);
|
|
@@ -2356,8 +2439,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
|
|
* We clear the LRU and need_discard btrees first so that we don't race
|
|
* with bch2_do_invalidates() and bch2_do_discards()
|
|
*/
|
|
- ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
|
|
- bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
|
|
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
|
|
BTREE_TRIGGER_norun, NULL) ?:
|
|
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
|
|
BTREE_TRIGGER_norun, NULL) ?:
|
|
@@ -2420,15 +2502,15 @@ void bch2_recalc_capacity(struct bch_fs *c)
|
|
|
|
lockdep_assert_held(&c->state_lock);
|
|
|
|
- for_each_online_member(c, ca) {
|
|
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
|
|
-
|
|
- ra_pages += bdi->ra_pages;
|
|
- }
|
|
+ rcu_read_lock();
|
|
+ for_each_member_device_rcu(c, ca, NULL) {
|
|
+ struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
|
|
+ if (bdev)
|
|
+ ra_pages += bdev->bd_disk->bdi->ra_pages;
|
|
|
|
- bch2_set_ra_pages(c, ra_pages);
|
|
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
|
|
+ continue;
|
|
|
|
- for_each_rw_member(c, ca) {
|
|
u64 dev_reserve = 0;
|
|
|
|
/*
|
|
@@ -2465,6 +2547,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
|
|
bucket_size_max = max_t(unsigned, bucket_size_max,
|
|
ca->mi.bucket_size);
|
|
}
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ bch2_set_ra_pages(c, ra_pages);
|
|
|
|
gc_reserve = c->opts.gc_reserve_bytes
|
|
? c->opts.gc_reserve_bytes >> 9
|
|
@@ -2487,27 +2572,41 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
|
|
{
|
|
u64 ret = U64_MAX;
|
|
|
|
- for_each_rw_member(c, ca)
|
|
+ rcu_read_lock();
|
|
+ for_each_rw_member_rcu(c, ca)
|
|
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
|
|
+ rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
|
|
{
|
|
struct open_bucket *ob;
|
|
- bool ret = false;
|
|
|
|
for (ob = c->open_buckets;
|
|
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
|
|
ob++) {
|
|
- spin_lock(&ob->lock);
|
|
- if (ob->valid && !ob->on_partial_list &&
|
|
- ob->dev == ca->dev_idx)
|
|
- ret = true;
|
|
- spin_unlock(&ob->lock);
|
|
+ scoped_guard(spinlock, &ob->lock) {
|
|
+ if (ob->valid && !ob->on_partial_list &&
|
|
+ ob->dev == ca->dev_idx)
|
|
+ return true;
|
|
+ }
|
|
}
|
|
|
|
- return ret;
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
|
|
+{
|
|
+ /* BCH_DATA_free == all rw devs */
|
|
+
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
|
+ if (rw &&
|
|
+ (i == BCH_DATA_free ||
|
|
+ (ca->mi.data_allowed & BIT(i))))
|
|
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
|
|
+ else
|
|
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
|
|
}
|
|
|
|
/* device goes ro: */
|
|
@@ -2516,9 +2615,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
|
|
lockdep_assert_held(&c->state_lock);
|
|
|
|
/* First, remove device from allocation groups: */
|
|
-
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
|
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
|
|
+ bch2_dev_allocator_set_rw(c, ca, false);
|
|
|
|
c->rw_devs_change_count++;
|
|
|
|
@@ -2552,10 +2649,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
|
|
{
|
|
lockdep_assert_held(&c->state_lock);
|
|
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
|
- if (ca->mi.data_allowed & (1 << i))
|
|
- set_bit(ca->dev_idx, c->rw_devs[i].d);
|
|
-
|
|
+ bch2_dev_allocator_set_rw(c, ca, true);
|
|
c->rw_devs_change_count++;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
|
|
index de25ba4ee94b..4f94c6a661bf 100644
|
|
--- a/fs/bcachefs/alloc_background.h
|
|
+++ b/fs/bcachefs/alloc_background.h
|
|
@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
|
|
if (a.stripe)
|
|
return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
|
|
if (bch2_bucket_sectors_dirty(a))
|
|
- return data_type;
|
|
+ return bucket_data_type(data_type);
|
|
if (a.cached_sectors)
|
|
return BCH_DATA_cached;
|
|
if (BCH_ALLOC_V4_NEED_DISCARD(&a))
|
|
@@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
|
|
{
|
|
u64 want_free = ca->mi.nbuckets >> 7;
|
|
u64 free = max_t(s64, 0,
|
|
- u.d[BCH_DATA_free].buckets
|
|
- + u.d[BCH_DATA_need_discard].buckets
|
|
+ u.buckets[BCH_DATA_free]
|
|
+ + u.buckets[BCH_DATA_need_discard]
|
|
- bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
|
|
|
|
- return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
|
|
+ return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
|
|
}
|
|
|
|
void bch2_dev_do_invalidates(struct bch_dev *);
|
|
@@ -350,6 +350,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
|
|
void bch2_recalc_capacity(struct bch_fs *);
|
|
u64 bch2_min_rw_member_capacity(struct bch_fs *);
|
|
|
|
+void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool);
|
|
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
|
|
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
|
|
|
|
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
|
|
index 5a781fb4c794..1a52c12c51ae 100644
|
|
--- a/fs/bcachefs/alloc_foreground.c
|
|
+++ b/fs/bcachefs/alloc_foreground.c
|
|
@@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
|
|
|
|
void bch2_open_bucket_write_error(struct bch_fs *c,
|
|
struct open_buckets *obs,
|
|
- unsigned dev)
|
|
+ unsigned dev, int err)
|
|
{
|
|
struct open_bucket *ob;
|
|
unsigned i;
|
|
|
|
open_bucket_for_each(c, obs, ob, i)
|
|
if (ob->dev == dev && ob->ec)
|
|
- bch2_ec_bucket_cancel(c, ob);
|
|
+ bch2_ec_bucket_cancel(c, ob, err);
|
|
}
|
|
|
|
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
|
|
@@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
|
|
|
|
static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
|
|
{
|
|
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
|
|
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
|
|
return false;
|
|
|
|
return bch2_is_superblock_bucket(ca, b);
|
|
@@ -179,29 +179,12 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
|
|
closure_wake_up(&c->freelist_wait);
|
|
}
|
|
|
|
-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
|
|
-{
|
|
- switch (watermark) {
|
|
- case BCH_WATERMARK_interior_updates:
|
|
- return 0;
|
|
- case BCH_WATERMARK_reclaim:
|
|
- return OPEN_BUCKETS_COUNT / 6;
|
|
- case BCH_WATERMARK_btree:
|
|
- case BCH_WATERMARK_btree_copygc:
|
|
- return OPEN_BUCKETS_COUNT / 4;
|
|
- case BCH_WATERMARK_copygc:
|
|
- return OPEN_BUCKETS_COUNT / 3;
|
|
- default:
|
|
- return OPEN_BUCKETS_COUNT / 2;
|
|
- }
|
|
-}
|
|
-
|
|
static inline bool may_alloc_bucket(struct bch_fs *c,
|
|
- struct bpos bucket,
|
|
- struct bucket_alloc_state *s)
|
|
+ struct alloc_request *req,
|
|
+ struct bpos bucket)
|
|
{
|
|
if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
|
|
- s->skipped_open++;
|
|
+ req->counters.skipped_open++;
|
|
return false;
|
|
}
|
|
|
|
@@ -210,36 +193,37 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
|
|
bucket.inode, bucket.offset);
|
|
if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
|
|
if (journal_seq_ready > c->journal.flushing_seq)
|
|
- s->need_journal_commit++;
|
|
- s->skipped_need_journal_commit++;
|
|
+ req->counters.need_journal_commit++;
|
|
+ req->counters.skipped_need_journal_commit++;
|
|
return false;
|
|
}
|
|
|
|
if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
|
|
- s->skipped_nocow++;
|
|
+ req->counters.skipped_nocow++;
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
|
|
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
|
|
+ struct alloc_request *req,
|
|
u64 bucket, u8 gen,
|
|
- enum bch_watermark watermark,
|
|
- struct bucket_alloc_state *s,
|
|
struct closure *cl)
|
|
{
|
|
+ struct bch_dev *ca = req->ca;
|
|
+
|
|
if (unlikely(is_superblock_bucket(c, ca, bucket)))
|
|
return NULL;
|
|
|
|
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
|
|
- s->skipped_nouse++;
|
|
+ req->counters.skipped_nouse++;
|
|
return NULL;
|
|
}
|
|
|
|
spin_lock(&c->freelist_lock);
|
|
|
|
- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
|
|
+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
|
|
if (cl)
|
|
closure_wait(&c->open_buckets_wait, cl);
|
|
|
|
@@ -251,7 +235,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
|
|
/* Recheck under lock: */
|
|
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
|
|
spin_unlock(&c->freelist_lock);
|
|
- s->skipped_open++;
|
|
+ req->counters.skipped_open++;
|
|
return NULL;
|
|
}
|
|
|
|
@@ -275,16 +259,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
|
|
return ob;
|
|
}
|
|
|
|
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
|
|
- enum bch_watermark watermark,
|
|
- struct bucket_alloc_state *s,
|
|
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
|
|
+ struct alloc_request *req,
|
|
struct btree_iter *freespace_iter,
|
|
struct closure *cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
|
|
|
|
- if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
|
|
+ if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
|
|
return NULL;
|
|
|
|
u8 gen;
|
|
@@ -294,7 +277,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
|
|
if (ret)
|
|
return NULL;
|
|
|
|
- return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
|
|
+ return __try_alloc_bucket(c, req, b, gen, cl);
|
|
}
|
|
|
|
/*
|
|
@@ -302,17 +285,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
|
|
*/
|
|
static noinline struct open_bucket *
|
|
bch2_bucket_alloc_early(struct btree_trans *trans,
|
|
- struct bch_dev *ca,
|
|
- enum bch_watermark watermark,
|
|
- struct bucket_alloc_state *s,
|
|
+ struct alloc_request *req,
|
|
struct closure *cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
+ struct bch_dev *ca = req->ca;
|
|
struct btree_iter iter, citer;
|
|
struct bkey_s_c k, ck;
|
|
struct open_bucket *ob = NULL;
|
|
u64 first_bucket = ca->mi.first_bucket;
|
|
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
|
|
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
|
|
u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
|
|
u64 alloc_cursor = alloc_start;
|
|
int ret;
|
|
@@ -334,19 +316,19 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
|
|
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
|
|
break;
|
|
|
|
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
|
|
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
|
|
+ if (req->btree_bitmap != BTREE_BITMAP_ANY &&
|
|
+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
|
|
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
|
|
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
|
|
+ if (req->btree_bitmap == BTREE_BITMAP_YES &&
|
|
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
|
|
break;
|
|
|
|
bucket = sector_to_bucket(ca,
|
|
round_up(bucket_to_sector(ca, bucket) + 1,
|
|
1ULL << ca->mi.btree_bitmap_shift));
|
|
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
|
|
- s->buckets_seen++;
|
|
- s->skipped_mi_btree_bitmap++;
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
|
|
+ req->counters.buckets_seen++;
|
|
+ req->counters.skipped_mi_btree_bitmap++;
|
|
continue;
|
|
}
|
|
|
|
@@ -365,14 +347,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
|
|
if (a->data_type != BCH_DATA_free)
|
|
goto next;
|
|
|
|
- s->buckets_seen++;
|
|
+ req->counters.buckets_seen++;
|
|
|
|
- ob = may_alloc_bucket(c, k.k->p, s)
|
|
- ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
|
|
- watermark, s, cl)
|
|
+ ob = may_alloc_bucket(c, req, k.k->p)
|
|
+ ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
|
|
: NULL;
|
|
next:
|
|
- bch2_set_btree_iter_dontneed(&citer);
|
|
+ bch2_set_btree_iter_dontneed(trans, &citer);
|
|
bch2_trans_iter_exit(trans, &citer);
|
|
if (ob)
|
|
break;
|
|
@@ -395,15 +376,14 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
|
|
}
|
|
|
|
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
|
|
- struct bch_dev *ca,
|
|
- enum bch_watermark watermark,
|
|
- struct bucket_alloc_state *s,
|
|
- struct closure *cl)
|
|
+ struct alloc_request *req,
|
|
+ struct closure *cl)
|
|
{
|
|
+ struct bch_dev *ca = req->ca;
|
|
struct btree_iter iter;
|
|
struct bkey_s_c k;
|
|
struct open_bucket *ob = NULL;
|
|
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
|
|
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
|
|
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
|
|
u64 alloc_cursor = alloc_start;
|
|
int ret;
|
|
@@ -419,13 +399,13 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
|
|
iter.k.size = iter.k.p.offset - iter.pos.offset;
|
|
|
|
while (iter.k.size) {
|
|
- s->buckets_seen++;
|
|
+ req->counters.buckets_seen++;
|
|
|
|
u64 bucket = iter.pos.offset & ~(~0ULL << 56);
|
|
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
|
|
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
|
|
+ if (req->btree_bitmap != BTREE_BITMAP_ANY &&
|
|
+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
|
|
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
|
|
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
|
|
+ if (req->btree_bitmap == BTREE_BITMAP_YES &&
|
|
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
|
|
goto fail;
|
|
|
|
@@ -434,16 +414,16 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
|
|
1ULL << ca->mi.btree_bitmap_shift));
|
|
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
|
|
|
|
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
|
|
- s->skipped_mi_btree_bitmap++;
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
|
|
+ req->counters.skipped_mi_btree_bitmap++;
|
|
goto next;
|
|
}
|
|
|
|
- ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
|
|
+ ob = try_alloc_bucket(trans, req, &iter, cl);
|
|
if (ob) {
|
|
if (!IS_ERR(ob))
|
|
*dev_alloc_cursor = iter.pos.offset;
|
|
- bch2_set_btree_iter_dontneed(&iter);
|
|
+ bch2_set_btree_iter_dontneed(trans, &iter);
|
|
break;
|
|
}
|
|
|
|
@@ -470,33 +450,30 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
|
|
return ob;
|
|
}
|
|
|
|
-static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
|
|
- enum bch_watermark watermark,
|
|
- enum bch_data_type data_type,
|
|
+static noinline void trace_bucket_alloc2(struct bch_fs *c,
|
|
+ struct alloc_request *req,
|
|
struct closure *cl,
|
|
- struct bch_dev_usage *usage,
|
|
- struct bucket_alloc_state *s,
|
|
struct open_bucket *ob)
|
|
{
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
printbuf_tabstop_push(&buf, 24);
|
|
|
|
- prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
|
|
- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
|
|
- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
|
|
+ prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
|
|
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
|
|
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
|
|
prt_printf(&buf, "blocking\t%u\n", cl != NULL);
|
|
- prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
|
|
- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
|
|
- prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
|
|
+ prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
|
|
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
|
|
+ prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
|
|
bch2_copygc_wait_amount(c),
|
|
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
|
|
- prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
|
|
- prt_printf(&buf, "open\t%llu\n", s->skipped_open);
|
|
- prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
|
|
- prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
|
|
- prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
|
|
- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
|
|
+ prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
|
|
+ prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
|
|
+ prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
|
|
+ prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
|
|
+ prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
|
|
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
|
|
|
|
if (!IS_ERR(ob)) {
|
|
prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
|
|
@@ -512,47 +489,42 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
|
|
/**
|
|
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
|
|
* @trans: transaction object
|
|
- * @ca: device to allocate from
|
|
- * @watermark: how important is this allocation?
|
|
- * @data_type: BCH_DATA_journal, btree, user...
|
|
+ * @req: state for the entire allocation
|
|
* @cl: if not NULL, closure to be used to wait if buckets not available
|
|
* @nowait: if true, do not wait for buckets to become available
|
|
- * @usage: for secondarily also returning the current device usage
|
|
*
|
|
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
|
|
*/
|
|
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
|
|
- struct bch_dev *ca,
|
|
- enum bch_watermark watermark,
|
|
- enum bch_data_type data_type,
|
|
- struct closure *cl,
|
|
- bool nowait,
|
|
- struct bch_dev_usage *usage)
|
|
+ struct alloc_request *req,
|
|
+ struct closure *cl,
|
|
+ bool nowait)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
+ struct bch_dev *ca = req->ca;
|
|
struct open_bucket *ob = NULL;
|
|
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
|
|
u64 avail;
|
|
- struct bucket_alloc_state s = {
|
|
- .btree_bitmap = data_type == BCH_DATA_btree,
|
|
- };
|
|
bool waiting = nowait;
|
|
+
|
|
+ req->btree_bitmap = req->data_type == BCH_DATA_btree;
|
|
+ memset(&req->counters, 0, sizeof(req->counters));
|
|
again:
|
|
- bch2_dev_usage_read_fast(ca, usage);
|
|
- avail = dev_buckets_free(ca, *usage, watermark);
|
|
+ bch2_dev_usage_read_fast(ca, &req->usage);
|
|
+ avail = dev_buckets_free(ca, req->usage, req->watermark);
|
|
|
|
- if (usage->d[BCH_DATA_need_discard].buckets > avail)
|
|
+ if (req->usage.buckets[BCH_DATA_need_discard] > avail)
|
|
bch2_dev_do_discards(ca);
|
|
|
|
- if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
|
|
+ if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
|
|
bch2_gc_gens_async(c);
|
|
|
|
- if (should_invalidate_buckets(ca, *usage))
|
|
+ if (should_invalidate_buckets(ca, req->usage))
|
|
bch2_dev_do_invalidates(ca);
|
|
|
|
if (!avail) {
|
|
- if (watermark > BCH_WATERMARK_normal &&
|
|
- c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
|
|
+ if (req->watermark > BCH_WATERMARK_normal &&
|
|
+ c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
|
|
goto alloc;
|
|
|
|
if (cl && !waiting) {
|
|
@@ -571,18 +543,18 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
|
|
closure_wake_up(&c->freelist_wait);
|
|
alloc:
|
|
ob = likely(freespace)
|
|
- ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
|
|
- : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
|
|
+ ? bch2_bucket_alloc_freelist(trans, req, cl)
|
|
+ : bch2_bucket_alloc_early(trans, req, cl);
|
|
|
|
- if (s.need_journal_commit * 2 > avail)
|
|
+ if (req->counters.need_journal_commit * 2 > avail)
|
|
bch2_journal_flush_async(&c->journal, NULL);
|
|
|
|
- if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
|
|
- s.btree_bitmap = BTREE_BITMAP_ANY;
|
|
+ if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
|
|
+ req->btree_bitmap = BTREE_BITMAP_ANY;
|
|
goto alloc;
|
|
}
|
|
|
|
- if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
|
|
+ if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
|
|
freespace = false;
|
|
goto alloc;
|
|
}
|
|
@@ -591,7 +563,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
|
|
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
|
|
|
|
if (!IS_ERR(ob))
|
|
- ob->data_type = data_type;
|
|
+ ob->data_type = req->data_type;
|
|
|
|
if (!IS_ERR(ob))
|
|
count_event(c, bucket_alloc);
|
|
@@ -601,7 +573,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
|
|
if (!IS_ERR(ob)
|
|
? trace_bucket_alloc_enabled()
|
|
: trace_bucket_alloc_fail_enabled())
|
|
- trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
|
|
+ trace_bucket_alloc2(c, req, cl, ob);
|
|
|
|
return ob;
|
|
}
|
|
@@ -611,20 +583,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
|
|
enum bch_data_type data_type,
|
|
struct closure *cl)
|
|
{
|
|
- struct bch_dev_usage usage;
|
|
struct open_bucket *ob;
|
|
+ struct alloc_request req = {
|
|
+ .watermark = watermark,
|
|
+ .data_type = data_type,
|
|
+ .ca = ca,
|
|
+ };
|
|
|
|
bch2_trans_do(c,
|
|
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
|
|
- data_type, cl, false, &usage)));
|
|
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
|
|
return ob;
|
|
}
|
|
|
|
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
|
|
unsigned l, unsigned r)
|
|
{
|
|
- return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
|
|
- (stripe->next_alloc[l] < stripe->next_alloc[r]));
|
|
+ return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
|
|
}
|
|
|
|
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
|
|
@@ -643,25 +617,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
|
|
return ret;
|
|
}
|
|
|
|
+static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
|
|
+static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
|
|
+static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
|
|
+
|
|
+static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
|
|
+{
|
|
+ /*
|
|
+ * Avoid underflowing clock hands if at all possible, if clock hands go
|
|
+ * to 0 then we lose information - clock hands can be in a wide range if
|
|
+ * we have devices we rarely try to allocate from, if we generally
|
|
+ * allocate from a specified target but only sometimes have to fall back
|
|
+ * to the whole filesystem.
|
|
+ */
|
|
+ u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
|
|
+ u64 scale_min = 0; /* minumum we must subtract to avoid overflow */
|
|
+
|
|
+ for (u64 *v = stripe->next_alloc;
|
|
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
|
|
+ if (*v)
|
|
+ scale_max = min(scale_max, *v);
|
|
+ if (*v > stripe_clock_hand_max)
|
|
+ scale_min = max(scale_min, *v - stripe_clock_hand_max);
|
|
+ }
|
|
+
|
|
+ u64 scale = max(scale_min, scale_max);
|
|
+
|
|
+ for (u64 *v = stripe->next_alloc;
|
|
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
|
|
+ *v = *v < scale ? 0 : *v - scale;
|
|
+}
|
|
+
|
|
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
|
|
struct dev_stripe_state *stripe,
|
|
struct bch_dev_usage *usage)
|
|
{
|
|
+ /*
|
|
+ * Stripe state has a per device clock hand: we allocate from the device
|
|
+ * with the smallest clock hand.
|
|
+ *
|
|
+ * When we allocate, we don't do a simple increment; we add the inverse
|
|
+ * of the device's free space. This results in round robin behavior that
|
|
+ * biases in favor of the device(s) with more free space.
|
|
+ */
|
|
+
|
|
u64 *v = stripe->next_alloc + ca->dev_idx;
|
|
- u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
|
|
+ u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
|
|
u64 free_space_inv = free_space
|
|
- ? div64_u64(1ULL << 48, free_space)
|
|
- : 1ULL << 48;
|
|
- u64 scale = *v / 4;
|
|
+ ? div64_u64(stripe_clock_hand_inv, free_space)
|
|
+ : stripe_clock_hand_inv;
|
|
|
|
- if (*v + free_space_inv >= *v)
|
|
- *v += free_space_inv;
|
|
- else
|
|
- *v = U64_MAX;
|
|
+ /* Saturating add, avoid overflow: */
|
|
+ u64 sum = *v + free_space_inv;
|
|
+ *v = sum >= *v ? sum : U64_MAX;
|
|
|
|
- for (v = stripe->next_alloc;
|
|
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
|
|
- *v = *v < scale ? 0 : *v - scale;
|
|
+ if (unlikely(*v > stripe_clock_hand_rescale))
|
|
+ bch2_stripe_state_rescale(stripe);
|
|
}
|
|
|
|
void bch2_dev_stripe_increment(struct bch_dev *ca,
|
|
@@ -674,24 +685,20 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
|
|
}
|
|
|
|
static int add_new_bucket(struct bch_fs *c,
|
|
- struct open_buckets *ptrs,
|
|
- struct bch_devs_mask *devs_may_alloc,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache,
|
|
- struct open_bucket *ob)
|
|
+ struct alloc_request *req,
|
|
+ struct open_bucket *ob)
|
|
{
|
|
unsigned durability = ob_dev(c, ob)->mi.durability;
|
|
|
|
- BUG_ON(*nr_effective >= nr_replicas);
|
|
+ BUG_ON(req->nr_effective >= req->nr_replicas);
|
|
|
|
- __clear_bit(ob->dev, devs_may_alloc->d);
|
|
- *nr_effective += durability;
|
|
- *have_cache |= !durability;
|
|
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
|
|
+ req->nr_effective += durability;
|
|
+ req->have_cache |= !durability;
|
|
|
|
- ob_push(c, ptrs, ob);
|
|
+ ob_push(c, &req->ptrs, ob);
|
|
|
|
- if (*nr_effective >= nr_replicas)
|
|
+ if (req->nr_effective >= req->nr_replicas)
|
|
return 1;
|
|
if (ob->ec)
|
|
return 1;
|
|
@@ -699,39 +706,31 @@ static int add_new_bucket(struct bch_fs *c,
|
|
}
|
|
|
|
int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
|
|
- struct open_buckets *ptrs,
|
|
- struct dev_stripe_state *stripe,
|
|
- struct bch_devs_mask *devs_may_alloc,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache,
|
|
- enum bch_write_flags flags,
|
|
- enum bch_data_type data_type,
|
|
- enum bch_watermark watermark,
|
|
- struct closure *cl)
|
|
+ struct alloc_request *req,
|
|
+ struct dev_stripe_state *stripe,
|
|
+ struct closure *cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
int ret = -BCH_ERR_insufficient_devices;
|
|
|
|
- BUG_ON(*nr_effective >= nr_replicas);
|
|
+ BUG_ON(req->nr_effective >= req->nr_replicas);
|
|
|
|
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
|
|
+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc);
|
|
darray_for_each(devs_sorted, i) {
|
|
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
|
|
- if (!ca)
|
|
+ req->ca = bch2_dev_tryget_noerror(c, *i);
|
|
+ if (!req->ca)
|
|
continue;
|
|
|
|
- if (!ca->mi.durability && *have_cache) {
|
|
- bch2_dev_put(ca);
|
|
+ if (!req->ca->mi.durability && req->have_cache) {
|
|
+ bch2_dev_put(req->ca);
|
|
continue;
|
|
}
|
|
|
|
- struct bch_dev_usage usage;
|
|
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
|
|
- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
|
|
+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
|
|
+ req->flags & BCH_WRITE_alloc_nowait);
|
|
if (!IS_ERR(ob))
|
|
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
|
|
- bch2_dev_put(ca);
|
|
+ bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
|
|
+ bch2_dev_put(req->ca);
|
|
|
|
if (IS_ERR(ob)) {
|
|
ret = PTR_ERR(ob);
|
|
@@ -740,9 +739,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
|
|
continue;
|
|
}
|
|
|
|
- if (add_new_bucket(c, ptrs, devs_may_alloc,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache, ob)) {
|
|
+ if (add_new_bucket(c, req, ob)) {
|
|
ret = 0;
|
|
break;
|
|
}
|
|
@@ -760,34 +757,27 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
|
|
*/
|
|
|
|
static int bucket_alloc_from_stripe(struct btree_trans *trans,
|
|
- struct open_buckets *ptrs,
|
|
- struct write_point *wp,
|
|
- struct bch_devs_mask *devs_may_alloc,
|
|
- u16 target,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache,
|
|
- enum bch_watermark watermark,
|
|
- enum bch_write_flags flags,
|
|
- struct closure *cl)
|
|
+ struct alloc_request *req,
|
|
+ struct closure *cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
int ret = 0;
|
|
|
|
- if (nr_replicas < 2)
|
|
+ if (req->nr_replicas < 2)
|
|
return 0;
|
|
|
|
- if (ec_open_bucket(c, ptrs))
|
|
+ if (ec_open_bucket(c, &req->ptrs))
|
|
return 0;
|
|
|
|
struct ec_stripe_head *h =
|
|
- bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
|
|
+ bch2_ec_stripe_head_get(trans, req, 0, cl);
|
|
if (IS_ERR(h))
|
|
return PTR_ERR(h);
|
|
if (!h)
|
|
return 0;
|
|
|
|
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
|
|
+ struct dev_alloc_list devs_sorted =
|
|
+ bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc);
|
|
darray_for_each(devs_sorted, i)
|
|
for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
|
|
if (!h->s->blocks[ec_idx])
|
|
@@ -799,9 +789,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
|
|
ob->ec = h->s;
|
|
ec_stripe_new_get(h->s, STRIPE_REF_io);
|
|
|
|
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache, ob);
|
|
+ ret = add_new_bucket(c, req, ob);
|
|
goto out;
|
|
}
|
|
}
|
|
@@ -813,65 +801,49 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
|
|
/* Sector allocator */
|
|
|
|
static bool want_bucket(struct bch_fs *c,
|
|
- struct write_point *wp,
|
|
- struct bch_devs_mask *devs_may_alloc,
|
|
- bool *have_cache, bool ec,
|
|
+ struct alloc_request *req,
|
|
struct open_bucket *ob)
|
|
{
|
|
struct bch_dev *ca = ob_dev(c, ob);
|
|
|
|
- if (!test_bit(ob->dev, devs_may_alloc->d))
|
|
+ if (!test_bit(ob->dev, req->devs_may_alloc.d))
|
|
return false;
|
|
|
|
- if (ob->data_type != wp->data_type)
|
|
+ if (ob->data_type != req->wp->data_type)
|
|
return false;
|
|
|
|
if (!ca->mi.durability &&
|
|
- (wp->data_type == BCH_DATA_btree || ec || *have_cache))
|
|
+ (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
|
|
return false;
|
|
|
|
- if (ec != (ob->ec != NULL))
|
|
+ if (req->ec != (ob->ec != NULL))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
static int bucket_alloc_set_writepoint(struct bch_fs *c,
|
|
- struct open_buckets *ptrs,
|
|
- struct write_point *wp,
|
|
- struct bch_devs_mask *devs_may_alloc,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache,
|
|
- bool ec)
|
|
+ struct alloc_request *req)
|
|
{
|
|
- struct open_buckets ptrs_skip = { .nr = 0 };
|
|
struct open_bucket *ob;
|
|
unsigned i;
|
|
int ret = 0;
|
|
|
|
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
|
- if (!ret && want_bucket(c, wp, devs_may_alloc,
|
|
- have_cache, ec, ob))
|
|
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache, ob);
|
|
+ req->scratch_ptrs.nr = 0;
|
|
+
|
|
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
|
|
+ if (!ret && want_bucket(c, req, ob))
|
|
+ ret = add_new_bucket(c, req, ob);
|
|
else
|
|
- ob_push(c, &ptrs_skip, ob);
|
|
+ ob_push(c, &req->scratch_ptrs, ob);
|
|
}
|
|
- wp->ptrs = ptrs_skip;
|
|
+ req->wp->ptrs = req->scratch_ptrs;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int bucket_alloc_set_partial(struct bch_fs *c,
|
|
- struct open_buckets *ptrs,
|
|
- struct write_point *wp,
|
|
- struct bch_devs_mask *devs_may_alloc,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache, bool ec,
|
|
- enum bch_watermark watermark)
|
|
+ struct alloc_request *req)
|
|
{
|
|
int i, ret = 0;
|
|
|
|
@@ -886,13 +858,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
|
|
for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
|
|
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
|
|
|
|
- if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
|
|
+ if (want_bucket(c, req, ob)) {
|
|
struct bch_dev *ca = ob_dev(c, ob);
|
|
- struct bch_dev_usage usage;
|
|
u64 avail;
|
|
|
|
- bch2_dev_usage_read_fast(ca, &usage);
|
|
- avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
|
|
+ bch2_dev_usage_read_fast(ca, &req->usage);
|
|
+ avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
|
|
if (!avail)
|
|
continue;
|
|
|
|
@@ -905,9 +876,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
|
|
bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
|
|
rcu_read_unlock();
|
|
|
|
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache, ob);
|
|
+ ret = add_new_bucket(c, req, ob);
|
|
if (ret)
|
|
break;
|
|
}
|
|
@@ -918,61 +887,41 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
|
|
}
|
|
|
|
static int __open_bucket_add_buckets(struct btree_trans *trans,
|
|
- struct open_buckets *ptrs,
|
|
- struct write_point *wp,
|
|
- struct bch_devs_list *devs_have,
|
|
- u16 target,
|
|
- bool erasure_code,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache,
|
|
- enum bch_watermark watermark,
|
|
- enum bch_write_flags flags,
|
|
- struct closure *_cl)
|
|
+ struct alloc_request *req,
|
|
+ struct closure *_cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct bch_devs_mask devs;
|
|
struct open_bucket *ob;
|
|
struct closure *cl = NULL;
|
|
unsigned i;
|
|
int ret;
|
|
|
|
- devs = target_rw_devs(c, wp->data_type, target);
|
|
+ req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
|
|
|
|
/* Don't allocate from devices we already have pointers to: */
|
|
- darray_for_each(*devs_have, i)
|
|
- __clear_bit(*i, devs.d);
|
|
+ darray_for_each(*req->devs_have, i)
|
|
+ __clear_bit(*i, req->devs_may_alloc.d);
|
|
|
|
- open_bucket_for_each(c, ptrs, ob, i)
|
|
- __clear_bit(ob->dev, devs.d);
|
|
+ open_bucket_for_each(c, &req->ptrs, ob, i)
|
|
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
|
|
|
|
- ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache, erasure_code);
|
|
+ ret = bucket_alloc_set_writepoint(c, req);
|
|
if (ret)
|
|
return ret;
|
|
|
|
- ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache, erasure_code, watermark);
|
|
+ ret = bucket_alloc_set_partial(c, req);
|
|
if (ret)
|
|
return ret;
|
|
|
|
- if (erasure_code) {
|
|
- ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
|
|
- target,
|
|
- nr_replicas, nr_effective,
|
|
- have_cache,
|
|
- watermark, flags, _cl);
|
|
+ if (req->ec) {
|
|
+ ret = bucket_alloc_from_stripe(trans, req, _cl);
|
|
} else {
|
|
retry_blocking:
|
|
/*
|
|
* Try nonblocking first, so that if one device is full we'll try from
|
|
* other devices:
|
|
*/
|
|
- ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
|
|
- nr_replicas, nr_effective, have_cache,
|
|
- flags, wp->data_type, watermark, cl);
|
|
+ ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
|
|
if (ret &&
|
|
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
|
|
!bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
|
|
@@ -986,38 +935,27 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
|
|
}
|
|
|
|
static int open_bucket_add_buckets(struct btree_trans *trans,
|
|
- struct open_buckets *ptrs,
|
|
- struct write_point *wp,
|
|
- struct bch_devs_list *devs_have,
|
|
- u16 target,
|
|
- unsigned erasure_code,
|
|
- unsigned nr_replicas,
|
|
- unsigned *nr_effective,
|
|
- bool *have_cache,
|
|
- enum bch_watermark watermark,
|
|
- enum bch_write_flags flags,
|
|
- struct closure *cl)
|
|
+ struct alloc_request *req,
|
|
+ struct closure *cl)
|
|
{
|
|
int ret;
|
|
|
|
- if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
|
|
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
|
|
- devs_have, target, erasure_code,
|
|
- nr_replicas, nr_effective, have_cache,
|
|
- watermark, flags, cl);
|
|
+ if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
|
|
+ ret = __open_bucket_add_buckets(trans, req, cl);
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
|
|
bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
|
|
bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
|
|
bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
|
|
return ret;
|
|
- if (*nr_effective >= nr_replicas)
|
|
+ if (req->nr_effective >= req->nr_replicas)
|
|
return 0;
|
|
}
|
|
|
|
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
|
|
- devs_have, target, false,
|
|
- nr_replicas, nr_effective, have_cache,
|
|
- watermark, flags, cl);
|
|
+ bool ec = false;
|
|
+ swap(ec, req->ec);
|
|
+ ret = __open_bucket_add_buckets(trans, req, cl);
|
|
+ swap(ec, req->ec);
|
|
+
|
|
return ret < 0 ? ret : 0;
|
|
}
|
|
|
|
@@ -1270,26 +1208,26 @@ static struct write_point *writepoint_find(struct btree_trans *trans,
|
|
|
|
static noinline void
|
|
deallocate_extra_replicas(struct bch_fs *c,
|
|
- struct open_buckets *ptrs,
|
|
- struct open_buckets *ptrs_no_use,
|
|
- unsigned extra_replicas)
|
|
+ struct alloc_request *req)
|
|
{
|
|
- struct open_buckets ptrs2 = { 0 };
|
|
struct open_bucket *ob;
|
|
+ unsigned extra_replicas = req->nr_effective - req->nr_replicas;
|
|
unsigned i;
|
|
|
|
- open_bucket_for_each(c, ptrs, ob, i) {
|
|
+ req->scratch_ptrs.nr = 0;
|
|
+
|
|
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
|
|
unsigned d = ob_dev(c, ob)->mi.durability;
|
|
|
|
if (d && d <= extra_replicas) {
|
|
extra_replicas -= d;
|
|
- ob_push(c, ptrs_no_use, ob);
|
|
+ ob_push(c, &req->wp->ptrs, ob);
|
|
} else {
|
|
- ob_push(c, &ptrs2, ob);
|
|
+ ob_push(c, &req->scratch_ptrs, ob);
|
|
}
|
|
}
|
|
|
|
- *ptrs = ptrs2;
|
|
+ req->ptrs = req->scratch_ptrs;
|
|
}
|
|
|
|
/*
|
|
@@ -1308,51 +1246,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
|
|
struct write_point **wp_ret)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct write_point *wp;
|
|
struct open_bucket *ob;
|
|
- struct open_buckets ptrs;
|
|
- unsigned nr_effective, write_points_nr;
|
|
- bool have_cache;
|
|
- int ret;
|
|
+ unsigned write_points_nr;
|
|
int i;
|
|
|
|
+ struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
|
|
+ int ret = PTR_ERR_OR_ZERO(req);
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
+
|
|
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
|
|
erasure_code = false;
|
|
|
|
+ req->nr_replicas = nr_replicas;
|
|
+ req->target = target;
|
|
+ req->ec = erasure_code;
|
|
+ req->watermark = watermark;
|
|
+ req->flags = flags;
|
|
+ req->devs_have = devs_have;
|
|
+
|
|
BUG_ON(!nr_replicas || !nr_replicas_required);
|
|
retry:
|
|
- ptrs.nr = 0;
|
|
- nr_effective = 0;
|
|
- write_points_nr = c->write_points_nr;
|
|
- have_cache = false;
|
|
+ req->ptrs.nr = 0;
|
|
+ req->nr_effective = 0;
|
|
+ req->have_cache = false;
|
|
+ write_points_nr = c->write_points_nr;
|
|
+
|
|
+ *wp_ret = req->wp = writepoint_find(trans, write_point.v);
|
|
|
|
- *wp_ret = wp = writepoint_find(trans, write_point.v);
|
|
+ req->data_type = req->wp->data_type;
|
|
|
|
ret = bch2_trans_relock(trans);
|
|
if (ret)
|
|
goto err;
|
|
|
|
/* metadata may not allocate on cache devices: */
|
|
- if (wp->data_type != BCH_DATA_user)
|
|
- have_cache = true;
|
|
-
|
|
- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
|
|
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
|
|
- target, erasure_code,
|
|
- nr_replicas, &nr_effective,
|
|
- &have_cache, watermark,
|
|
- flags, NULL);
|
|
+ if (req->data_type != BCH_DATA_user)
|
|
+ req->have_cache = true;
|
|
+
|
|
+ if (target && !(flags & BCH_WRITE_only_specified_devs)) {
|
|
+ ret = open_bucket_add_buckets(trans, req, NULL);
|
|
if (!ret ||
|
|
bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
goto alloc_done;
|
|
|
|
/* Don't retry from all devices if we're out of open buckets: */
|
|
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
|
|
- int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
|
|
- target, erasure_code,
|
|
- nr_replicas, &nr_effective,
|
|
- &have_cache, watermark,
|
|
- flags, cl);
|
|
+ int ret2 = open_bucket_add_buckets(trans, req, cl);
|
|
if (!ret2 ||
|
|
bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
|
|
bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
|
|
@@ -1365,59 +1305,74 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
|
|
* Only try to allocate cache (durability = 0 devices) from the
|
|
* specified target:
|
|
*/
|
|
- have_cache = true;
|
|
+ req->have_cache = true;
|
|
+ req->target = 0;
|
|
|
|
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
|
|
- 0, erasure_code,
|
|
- nr_replicas, &nr_effective,
|
|
- &have_cache, watermark,
|
|
- flags, cl);
|
|
+ ret = open_bucket_add_buckets(trans, req, cl);
|
|
} else {
|
|
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
|
|
- target, erasure_code,
|
|
- nr_replicas, &nr_effective,
|
|
- &have_cache, watermark,
|
|
- flags, cl);
|
|
+ ret = open_bucket_add_buckets(trans, req, cl);
|
|
}
|
|
alloc_done:
|
|
- BUG_ON(!ret && nr_effective < nr_replicas);
|
|
+ BUG_ON(!ret && req->nr_effective < req->nr_replicas);
|
|
|
|
- if (erasure_code && !ec_open_bucket(c, &ptrs))
|
|
+ if (erasure_code && !ec_open_bucket(c, &req->ptrs))
|
|
pr_debug("failed to get ec bucket: ret %u", ret);
|
|
|
|
if (ret == -BCH_ERR_insufficient_devices &&
|
|
- nr_effective >= nr_replicas_required)
|
|
+ req->nr_effective >= nr_replicas_required)
|
|
ret = 0;
|
|
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (nr_effective > nr_replicas)
|
|
- deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
|
|
+ if (req->nr_effective > req->nr_replicas)
|
|
+ deallocate_extra_replicas(c, req);
|
|
|
|
/* Free buckets we didn't use: */
|
|
- open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i)
|
|
open_bucket_free_unused(c, ob);
|
|
|
|
- wp->ptrs = ptrs;
|
|
+ req->wp->ptrs = req->ptrs;
|
|
|
|
- wp->sectors_free = UINT_MAX;
|
|
+ req->wp->sectors_free = UINT_MAX;
|
|
|
|
- open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
- wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
|
|
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
|
|
+ /*
|
|
+ * Ensure proper write alignment - either due to misaligned
|
|
+ * bucket sizes (from buggy bcachefs-tools), or writes that mix
|
|
+ * logical/physical alignment:
|
|
+ */
|
|
+ struct bch_dev *ca = ob_dev(c, ob);
|
|
+ u64 offset = bucket_to_sector(ca, ob->bucket) +
|
|
+ ca->mi.bucket_size -
|
|
+ ob->sectors_free;
|
|
+ unsigned align = round_up(offset, block_sectors(c)) - offset;
|
|
|
|
- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
|
|
+ ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
|
|
+
|
|
+ req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
|
|
+ }
|
|
+
|
|
+ req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
|
|
+
|
|
+ /* Did alignment use up space in an open_bucket? */
|
|
+ if (unlikely(!req->wp->sectors_free)) {
|
|
+ bch2_alloc_sectors_done(c, req->wp);
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
|
|
|
|
return 0;
|
|
err:
|
|
- open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
- if (ptrs.nr < ARRAY_SIZE(ptrs.v))
|
|
- ob_push(c, &ptrs, ob);
|
|
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i)
|
|
+ if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
|
|
+ ob_push(c, &req->ptrs, ob);
|
|
else
|
|
open_bucket_free_unused(c, ob);
|
|
- wp->ptrs = ptrs;
|
|
+ req->wp->ptrs = req->ptrs;
|
|
|
|
- mutex_unlock(&wp->lock);
|
|
+ mutex_unlock(&req->wp->lock);
|
|
|
|
if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
|
|
try_decrease_writepoints(trans, write_points_nr))
|
|
@@ -1426,27 +1381,13 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
|
|
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
|
|
ret = -BCH_ERR_bucket_alloc_blocked;
|
|
|
|
- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
|
|
+ if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
|
|
bch2_err_matches(ret, BCH_ERR_freelist_empty))
|
|
ret = -BCH_ERR_bucket_alloc_blocked;
|
|
|
|
return ret;
|
|
}
|
|
|
|
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
|
|
-{
|
|
- struct bch_dev *ca = ob_dev(c, ob);
|
|
-
|
|
- return (struct bch_extent_ptr) {
|
|
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
|
|
- .gen = ob->gen,
|
|
- .dev = ob->dev,
|
|
- .offset = bucket_to_sector(ca, ob->bucket) +
|
|
- ca->mi.bucket_size -
|
|
- ob->sectors_free,
|
|
- };
|
|
-}
|
|
-
|
|
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
|
|
struct bkey_i *k, unsigned sectors,
|
|
bool cached)
|
|
@@ -1576,8 +1517,10 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
|
|
struct open_bucket *ob;
|
|
unsigned i;
|
|
|
|
+ mutex_lock(&wp->lock);
|
|
+
|
|
prt_printf(out, "%lu: ", wp->write_point);
|
|
- prt_human_readable_u64(out, wp->sectors_allocated);
|
|
+ prt_human_readable_u64(out, wp->sectors_allocated << 9);
|
|
|
|
prt_printf(out, " last wrote: ");
|
|
bch2_pr_time_units(out, sched_clock() - wp->last_used);
|
|
@@ -1593,6 +1536,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
|
|
open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
bch2_open_bucket_to_text(out, c, ob);
|
|
printbuf_indent_sub(out, 2);
|
|
+
|
|
+ mutex_unlock(&wp->lock);
|
|
}
|
|
|
|
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
|
|
@@ -1650,7 +1595,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
|
|
void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
|
|
{
|
|
struct bch_fs *c = ca->fs;
|
|
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
|
|
+ struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
|
|
unsigned nr[BCH_DATA_NR];
|
|
|
|
memset(nr, 0, sizeof(nr));
|
|
@@ -1673,7 +1618,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
|
|
printbuf_tabstop_push(out, 16);
|
|
|
|
prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
|
|
- prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
|
|
+ prt_printf(out, "buckets to invalidate\t%llu\r\n",
|
|
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
|
|
}
|
|
|
|
static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
|
|
@@ -1689,7 +1635,12 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
|
|
printbuf_indent_sub(&buf, 2);
|
|
prt_newline(&buf);
|
|
|
|
- for_each_online_member(c, ca) {
|
|
+ bch2_printbuf_make_room(&buf, 4096);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ buf.atomic++;
|
|
+
|
|
+ for_each_online_member_rcu(c, ca) {
|
|
prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
|
|
printbuf_indent_add(&buf, 2);
|
|
bch2_dev_alloc_debug_to_text(&buf, ca);
|
|
@@ -1697,6 +1648,9 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
|
|
prt_newline(&buf);
|
|
}
|
|
|
|
+ --buf.atomic;
|
|
+ rcu_read_unlock();
|
|
+
|
|
prt_printf(&buf, "Copygc debug:\n");
|
|
printbuf_indent_add(&buf, 2);
|
|
bch2_copygc_wait_to_text(&buf, c);
|
|
@@ -1708,7 +1662,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
|
|
bch2_journal_debug_to_text(&buf, &c->journal);
|
|
printbuf_indent_sub(&buf, 2);
|
|
|
|
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
|
|
index f25481a0d1a0..2e01c7b61ed1 100644
|
|
--- a/fs/bcachefs/alloc_foreground.h
|
|
+++ b/fs/bcachefs/alloc_foreground.h
|
|
@@ -3,8 +3,10 @@
|
|
#define _BCACHEFS_ALLOC_FOREGROUND_H
|
|
|
|
#include "bcachefs.h"
|
|
+#include "buckets.h"
|
|
#include "alloc_types.h"
|
|
#include "extents.h"
|
|
+#include "io_write_types.h"
|
|
#include "sb-members.h"
|
|
|
|
#include <linux/hash.h>
|
|
@@ -23,6 +25,52 @@ struct dev_alloc_list {
|
|
u8 data[BCH_SB_MEMBERS_MAX];
|
|
};
|
|
|
|
+struct alloc_request {
|
|
+ unsigned nr_replicas;
|
|
+ unsigned target;
|
|
+ bool ec;
|
|
+ enum bch_watermark watermark;
|
|
+ enum bch_write_flags flags;
|
|
+ enum bch_data_type data_type;
|
|
+ struct bch_devs_list *devs_have;
|
|
+ struct write_point *wp;
|
|
+
|
|
+ /* These fields are used primarily by open_bucket_add_buckets */
|
|
+ struct open_buckets ptrs;
|
|
+ unsigned nr_effective; /* sum of @ptrs durability */
|
|
+ bool have_cache; /* have we allocated from a 0 durability dev */
|
|
+ struct bch_devs_mask devs_may_alloc;
|
|
+
|
|
+ /* bch2_bucket_alloc_set_trans(): */
|
|
+ struct bch_dev_usage usage;
|
|
+
|
|
+ /* bch2_bucket_alloc_trans(): */
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ enum {
|
|
+ BTREE_BITMAP_NO,
|
|
+ BTREE_BITMAP_YES,
|
|
+ BTREE_BITMAP_ANY,
|
|
+ } btree_bitmap;
|
|
+
|
|
+ struct {
|
|
+ u64 buckets_seen;
|
|
+ u64 skipped_open;
|
|
+ u64 skipped_need_journal_commit;
|
|
+ u64 need_journal_commit;
|
|
+ u64 skipped_nocow;
|
|
+ u64 skipped_nouse;
|
|
+ u64 skipped_mi_btree_bitmap;
|
|
+ } counters;
|
|
+
|
|
+ unsigned scratch_nr_replicas;
|
|
+ unsigned scratch_nr_effective;
|
|
+ bool scratch_have_cache;
|
|
+ enum bch_data_type scratch_data_type;
|
|
+ struct open_buckets scratch_ptrs;
|
|
+ struct bch_devs_mask scratch_devs_may_alloc;
|
|
+};
|
|
+
|
|
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
|
|
struct dev_stripe_state *,
|
|
struct bch_devs_mask *);
|
|
@@ -33,6 +81,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
|
|
return bch2_dev_have_ref(c, ob->dev);
|
|
}
|
|
|
|
+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
|
|
+{
|
|
+ switch (watermark) {
|
|
+ case BCH_WATERMARK_interior_updates:
|
|
+ return 0;
|
|
+ case BCH_WATERMARK_reclaim:
|
|
+ return OPEN_BUCKETS_COUNT / 6;
|
|
+ case BCH_WATERMARK_btree:
|
|
+ case BCH_WATERMARK_btree_copygc:
|
|
+ return OPEN_BUCKETS_COUNT / 4;
|
|
+ case BCH_WATERMARK_copygc:
|
|
+ return OPEN_BUCKETS_COUNT / 3;
|
|
+ default:
|
|
+ return OPEN_BUCKETS_COUNT / 2;
|
|
+ }
|
|
+}
|
|
+
|
|
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
|
|
enum bch_watermark, enum bch_data_type,
|
|
struct closure *);
|
|
@@ -65,7 +130,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
|
|
}
|
|
|
|
void bch2_open_bucket_write_error(struct bch_fs *,
|
|
- struct open_buckets *, unsigned);
|
|
+ struct open_buckets *, unsigned, int);
|
|
|
|
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
|
|
|
|
@@ -93,7 +158,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ
|
|
unsigned i;
|
|
|
|
open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
- ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
|
|
+ ob_push(c, ob->sectors_free < block_sectors(c)
|
|
+ ? &ptrs
|
|
+ : &keep, ob);
|
|
wp->ptrs = keep;
|
|
|
|
mutex_unlock(&wp->lock);
|
|
@@ -154,11 +221,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
|
|
}
|
|
|
|
enum bch_write_flags;
|
|
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
|
|
- struct dev_stripe_state *, struct bch_devs_mask *,
|
|
- unsigned, unsigned *, bool *, enum bch_write_flags,
|
|
- enum bch_data_type, enum bch_watermark,
|
|
- struct closure *);
|
|
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
|
|
+ struct dev_stripe_state *, struct closure *);
|
|
|
|
int bch2_alloc_sectors_start_trans(struct btree_trans *,
|
|
unsigned, unsigned,
|
|
@@ -170,7 +234,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
|
|
struct closure *,
|
|
struct write_point **);
|
|
|
|
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
|
|
+static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
|
|
+{
|
|
+ struct bch_dev *ca = ob_dev(c, ob);
|
|
+
|
|
+ return (struct bch_extent_ptr) {
|
|
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
|
|
+ .gen = ob->gen,
|
|
+ .dev = ob->dev,
|
|
+ .offset = bucket_to_sector(ca, ob->bucket) +
|
|
+ ca->mi.bucket_size -
|
|
+ ob->sectors_free,
|
|
+ };
|
|
+}
|
|
|
|
/*
|
|
* Append pointers to the space we just allocated to @k, and mark @sectors space
|
|
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
|
|
index 4aa8ee026cb8..e7becdf22cba 100644
|
|
--- a/fs/bcachefs/alloc_types.h
|
|
+++ b/fs/bcachefs/alloc_types.h
|
|
@@ -8,22 +8,6 @@
|
|
#include "clock_types.h"
|
|
#include "fifo.h"
|
|
|
|
-struct bucket_alloc_state {
|
|
- enum {
|
|
- BTREE_BITMAP_NO,
|
|
- BTREE_BITMAP_YES,
|
|
- BTREE_BITMAP_ANY,
|
|
- } btree_bitmap;
|
|
-
|
|
- u64 buckets_seen;
|
|
- u64 skipped_open;
|
|
- u64 skipped_need_journal_commit;
|
|
- u64 need_journal_commit;
|
|
- u64 skipped_nocow;
|
|
- u64 skipped_nouse;
|
|
- u64 skipped_mi_btree_bitmap;
|
|
-};
|
|
-
|
|
#define BCH_WATERMARKS() \
|
|
x(stripe) \
|
|
x(normal) \
|
|
@@ -90,6 +74,7 @@ struct dev_stripe_state {
|
|
x(stopped) \
|
|
x(waiting_io) \
|
|
x(waiting_work) \
|
|
+ x(runnable) \
|
|
x(running)
|
|
|
|
enum write_point_state {
|
|
@@ -125,6 +110,7 @@ struct write_point {
|
|
enum write_point_state state;
|
|
u64 last_state_change;
|
|
u64 time[WRITE_POINT_STATE_NR];
|
|
+ u64 last_runtime;
|
|
} __aligned(SMP_CACHE_BYTES);
|
|
};
|
|
|
|
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
|
|
new file mode 100644
|
|
index 000000000000..a7cd1f0f0964
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/async_objs.c
|
|
@@ -0,0 +1,132 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Async obj debugging: keep asynchronous objects on (very fast) lists, make
|
|
+ * them visibile in debugfs:
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "async_objs.h"
|
|
+#include "btree_io.h"
|
|
+#include "debug.h"
|
|
+#include "io_read.h"
|
|
+#include "io_write.h"
|
|
+
|
|
+#include <linux/debugfs.h>
|
|
+
|
|
+static void promote_obj_to_text(struct printbuf *out, void *obj)
|
|
+{
|
|
+ bch2_promote_op_to_text(out, obj);
|
|
+}
|
|
+
|
|
+static void rbio_obj_to_text(struct printbuf *out, void *obj)
|
|
+{
|
|
+ bch2_read_bio_to_text(out, obj);
|
|
+}
|
|
+
|
|
+static void write_op_obj_to_text(struct printbuf *out, void *obj)
|
|
+{
|
|
+ bch2_write_op_to_text(out, obj);
|
|
+}
|
|
+
|
|
+static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj)
|
|
+{
|
|
+ struct btree_read_bio *rbio = obj;
|
|
+ bch2_btree_read_bio_to_text(out, rbio);
|
|
+}
|
|
+
|
|
+static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj)
|
|
+{
|
|
+ struct btree_write_bio *wbio = obj;
|
|
+ bch2_bio_to_text(out, &wbio->wbio.bio);
|
|
+}
|
|
+
|
|
+static int bch2_async_obj_list_open(struct inode *inode, struct file *file)
|
|
+{
|
|
+ struct async_obj_list *list = inode->i_private;
|
|
+ struct dump_iter *i;
|
|
+
|
|
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
|
|
+ if (!i)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ file->private_data = i;
|
|
+ i->from = POS_MIN;
|
|
+ i->iter = 0;
|
|
+ i->c = container_of(list, struct bch_fs, async_objs[list->idx]);
|
|
+ i->list = list;
|
|
+ i->buf = PRINTBUF;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos)
|
|
+{
|
|
+ struct dump_iter *i = file->private_data;
|
|
+ struct async_obj_list *list = i->list;
|
|
+ ssize_t ret = 0;
|
|
+
|
|
+ i->ubuf = buf;
|
|
+ i->size = size;
|
|
+ i->ret = 0;
|
|
+
|
|
+ struct genradix_iter iter;
|
|
+ void *obj;
|
|
+ fast_list_for_each_from(&list->list, iter, obj, i->iter) {
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (!i->size)
|
|
+ break;
|
|
+
|
|
+ list->obj_to_text(&i->buf, obj);
|
|
+ }
|
|
+
|
|
+ if (i->buf.allocation_failure)
|
|
+ ret = -ENOMEM;
|
|
+ else
|
|
+ i->iter = iter.pos;
|
|
+
|
|
+ if (!ret)
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
+
|
|
+ return ret ?: i->ret;
|
|
+}
|
|
+
|
|
+static const struct file_operations async_obj_ops = {
|
|
+ .owner = THIS_MODULE,
|
|
+ .open = bch2_async_obj_list_open,
|
|
+ .release = bch2_dump_release,
|
|
+ .read = bch2_async_obj_list_read,
|
|
+};
|
|
+
|
|
+void bch2_fs_async_obj_debugfs_init(struct bch_fs *c)
|
|
+{
|
|
+ c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir);
|
|
+
|
|
+#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \
|
|
+ &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops);
|
|
+ BCH_ASYNC_OBJ_LISTS()
|
|
+#undef x
|
|
+}
|
|
+
|
|
+void bch2_fs_async_obj_exit(struct bch_fs *c)
|
|
+{
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++)
|
|
+ fast_list_exit(&c->async_objs[i].list);
|
|
+}
|
|
+
|
|
+int bch2_fs_async_obj_init(struct bch_fs *c)
|
|
+{
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) {
|
|
+ if (fast_list_init(&c->async_objs[i].list))
|
|
+ return -BCH_ERR_ENOMEM_async_obj_init;
|
|
+ c->async_objs[i].idx = i;
|
|
+ }
|
|
+
|
|
+#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text;
|
|
+ BCH_ASYNC_OBJ_LISTS()
|
|
+#undef x
|
|
+
|
|
+ return 0;
|
|
+}
|
|
diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h
|
|
new file mode 100644
|
|
index 000000000000..cd6489b8cf76
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/async_objs.h
|
|
@@ -0,0 +1,44 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ASYNC_OBJS_H
|
|
+#define _BCACHEFS_ASYNC_OBJS_H
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+static inline void __async_object_list_del(struct fast_list *head, unsigned idx)
|
|
+{
|
|
+ fast_list_remove(head, idx);
|
|
+}
|
|
+
|
|
+static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx)
|
|
+{
|
|
+ int ret = fast_list_add(head, obj);
|
|
+ *idx = ret > 0 ? ret : 0;
|
|
+ return ret < 0 ? ret : 0;
|
|
+}
|
|
+
|
|
+#define async_object_list_del(_c, _list, idx) \
|
|
+ __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx)
|
|
+
|
|
+#define async_object_list_add(_c, _list, obj, idx) \
|
|
+ __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx)
|
|
+
|
|
+void bch2_fs_async_obj_debugfs_init(struct bch_fs *);
|
|
+void bch2_fs_async_obj_exit(struct bch_fs *);
|
|
+int bch2_fs_async_obj_init(struct bch_fs *);
|
|
+
|
|
+#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
|
|
+
|
|
+#define async_object_list_del(_c, _n, idx) do {} while (0)
|
|
+
|
|
+static inline int __async_object_list_add(void)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add()
|
|
+
|
|
+static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {}
|
|
+static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {}
|
|
+static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; }
|
|
+
|
|
+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
|
|
+
|
|
+#endif /* _BCACHEFS_ASYNC_OBJS_H */
|
|
diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h
|
|
new file mode 100644
|
|
index 000000000000..8d713c0f5841
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/async_objs_types.h
|
|
@@ -0,0 +1,25 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H
|
|
+#define _BCACHEFS_ASYNC_OBJS_TYPES_H
|
|
+
|
|
+#define BCH_ASYNC_OBJ_LISTS() \
|
|
+ x(promote) \
|
|
+ x(rbio) \
|
|
+ x(write_op) \
|
|
+ x(btree_read_bio) \
|
|
+ x(btree_write_bio)
|
|
+
|
|
+enum bch_async_obj_lists {
|
|
+#define x(n) BCH_ASYNC_OBJ_LIST_##n,
|
|
+ BCH_ASYNC_OBJ_LISTS()
|
|
+#undef x
|
|
+ BCH_ASYNC_OBJ_NR
|
|
+};
|
|
+
|
|
+struct async_obj_list {
|
|
+ struct fast_list list;
|
|
+ void (*obj_to_text)(struct printbuf *, void *);
|
|
+ unsigned idx;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */
|
|
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
|
|
index ebeb6a5ff9d2..cde7dd115267 100644
|
|
--- a/fs/bcachefs/backpointers.c
|
|
+++ b/fs/bcachefs/backpointers.c
|
|
@@ -11,9 +11,21 @@
|
|
#include "checksum.h"
|
|
#include "disk_accounting.h"
|
|
#include "error.h"
|
|
+#include "progress.h"
|
|
+#include "recovery_passes.h"
|
|
|
|
#include <linux/mm.h>
|
|
|
|
+static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64);
|
|
+
|
|
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
|
|
+{
|
|
+ return (struct bbpos) {
|
|
+ .btree = bp.btree_id,
|
|
+ .pos = bp.pos,
|
|
+ };
|
|
+}
|
|
+
|
|
int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
struct bkey_validate_context from)
|
|
{
|
|
@@ -49,6 +61,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
|
|
}
|
|
|
|
bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
|
|
+ prt_str(out, " data_type=");
|
|
+ bch2_prt_data_type(out, bp.v->data_type);
|
|
prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
|
|
(u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
|
|
bp.v->bucket_len,
|
|
@@ -93,6 +107,9 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct printbuf buf = PRINTBUF;
|
|
+ bool will_check = c->recovery.passes_to_run &
|
|
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
|
|
+ int ret = 0;
|
|
|
|
if (insert) {
|
|
prt_printf(&buf, "existing backpointer found when inserting ");
|
|
@@ -106,9 +123,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
|
|
|
|
prt_printf(&buf, "for ");
|
|
bch2_bkey_val_to_text(&buf, c, orig_k);
|
|
-
|
|
- bch_err(c, "%s", buf.buf);
|
|
- } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
|
|
+ } else if (!will_check) {
|
|
prt_printf(&buf, "backpointer not found when deleting\n");
|
|
printbuf_indent_add(&buf, 2);
|
|
|
|
@@ -122,17 +137,14 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
|
|
|
|
prt_printf(&buf, "for ");
|
|
bch2_bkey_val_to_text(&buf, c, orig_k);
|
|
-
|
|
- bch_err(c, "%s", buf.buf);
|
|
}
|
|
|
|
- printbuf_exit(&buf);
|
|
+ if (!will_check && __bch2_inconsistent_error(c, &buf))
|
|
+ ret = -BCH_ERR_erofs_unfixed_errors;
|
|
|
|
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
|
|
- return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
|
|
- } else {
|
|
- return 0;
|
|
- }
|
|
+ bch_err(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
}
|
|
|
|
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
|
|
@@ -172,7 +184,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
|
|
|
|
static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
|
|
{
|
|
- return (likely(!bch2_backpointers_no_use_write_buffer)
|
|
+ return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
|
|
? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
|
|
: bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
|
|
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
@@ -182,7 +194,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
|
|
struct bkey_s_c visiting_k,
|
|
struct bkey_buf *last_flushed)
|
|
{
|
|
- return likely(!bch2_backpointers_no_use_write_buffer)
|
|
+ return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
|
|
? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
|
|
: 0;
|
|
}
|
|
@@ -190,7 +202,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
|
|
static int backpointer_target_not_found(struct btree_trans *trans,
|
|
struct bkey_s_c_backpointer bp,
|
|
struct bkey_s_c target_k,
|
|
- struct bkey_buf *last_flushed)
|
|
+ struct bkey_buf *last_flushed,
|
|
+ bool commit)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct printbuf buf = PRINTBUF;
|
|
@@ -207,11 +220,11 @@ static int backpointer_target_not_found(struct btree_trans *trans,
|
|
if (ret)
|
|
return ret;
|
|
|
|
- prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
|
|
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n",
|
|
bp.v->level ? "btree node" : "extent");
|
|
bch2_bkey_val_to_text(&buf, c, bp.s_c);
|
|
|
|
- prt_printf(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, target_k);
|
|
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
|
|
@@ -219,63 +232,45 @@ static int backpointer_target_not_found(struct btree_trans *trans,
|
|
struct extent_ptr_decoded p;
|
|
bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
|
|
if (p.ptr.dev == bp.k->p.inode) {
|
|
- prt_printf(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
struct bkey_i_backpointer bp2;
|
|
bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
|
|
}
|
|
|
|
if (fsck_err(trans, backpointer_to_missing_ptr,
|
|
- "%s", buf.buf))
|
|
+ "%s", buf.buf)) {
|
|
ret = bch2_backpointer_del(trans, bp.k->p);
|
|
+ if (ret || !commit)
|
|
+ goto out;
|
|
+
|
|
+ /*
|
|
+ * Normally, on transaction commit from inside a transaction,
|
|
+ * we'll return -BCH_ERR_transaction_restart_nested, since a
|
|
+ * transaction commit invalidates pointers given out by peek().
|
|
+ *
|
|
+ * However, since we're updating a write buffer btree, if we
|
|
+ * return a transaction restart and loop we won't see that the
|
|
+ * backpointer has been deleted without an additional write
|
|
+ * buffer flush - and those are expensive.
|
|
+ *
|
|
+ * So we're relying on the caller immediately advancing to the
|
|
+ * next backpointer and starting a new transaction immediately
|
|
+ * after backpointer_get_key() returns NULL:
|
|
+ */
|
|
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
+ }
|
|
+out:
|
|
fsck_err:
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
}
|
|
|
|
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
|
|
- struct bkey_s_c_backpointer bp,
|
|
- struct btree_iter *iter,
|
|
- unsigned iter_flags,
|
|
- struct bkey_buf *last_flushed)
|
|
-{
|
|
- struct bch_fs *c = trans->c;
|
|
-
|
|
- if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
|
|
- return bkey_s_c_null;
|
|
-
|
|
- if (likely(!bp.v->level)) {
|
|
- bch2_trans_node_iter_init(trans, iter,
|
|
- bp.v->btree_id,
|
|
- bp.v->pos,
|
|
- 0, 0,
|
|
- iter_flags);
|
|
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
|
|
- if (bkey_err(k)) {
|
|
- bch2_trans_iter_exit(trans, iter);
|
|
- return k;
|
|
- }
|
|
-
|
|
- if (k.k &&
|
|
- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
|
|
- return k;
|
|
-
|
|
- bch2_trans_iter_exit(trans, iter);
|
|
- int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
|
|
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
|
|
- } else {
|
|
- struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
|
|
- if (IS_ERR_OR_NULL(b))
|
|
- return ((struct bkey_s_c) { .k = ERR_CAST(b) });
|
|
-
|
|
- return bkey_i_to_s_c(&b->key);
|
|
- }
|
|
-}
|
|
-
|
|
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
|
|
- struct bkey_s_c_backpointer bp,
|
|
- struct btree_iter *iter,
|
|
- struct bkey_buf *last_flushed)
|
|
+static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
|
|
+ struct bkey_s_c_backpointer bp,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_buf *last_flushed,
|
|
+ bool commit)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
|
|
@@ -287,7 +282,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
|
|
0,
|
|
bp.v->level - 1,
|
|
0);
|
|
- struct btree *b = bch2_btree_iter_peek_node(iter);
|
|
+ struct btree *b = bch2_btree_iter_peek_node(trans, iter);
|
|
if (IS_ERR_OR_NULL(b))
|
|
goto err;
|
|
|
|
@@ -300,7 +295,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
|
|
if (btree_node_will_make_reachable(b)) {
|
|
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
|
|
} else {
|
|
- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
|
|
+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
|
|
+ last_flushed, commit);
|
|
b = ret ? ERR_PTR(ret) : NULL;
|
|
}
|
|
err:
|
|
@@ -308,6 +304,79 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
|
|
return b;
|
|
}
|
|
|
|
+static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
|
|
+ struct bkey_s_c_backpointer bp,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned iter_flags,
|
|
+ struct bkey_buf *last_flushed,
|
|
+ bool commit)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+
|
|
+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ bch2_trans_node_iter_init(trans, iter,
|
|
+ bp.v->btree_id,
|
|
+ bp.v->pos,
|
|
+ 0,
|
|
+ bp.v->level,
|
|
+ iter_flags);
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
|
|
+ if (bkey_err(k)) {
|
|
+ bch2_trans_iter_exit(trans, iter);
|
|
+ return k;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * peek_slot() doesn't normally return NULL - except when we ask for a
|
|
+ * key at a btree level that doesn't exist.
|
|
+ *
|
|
+ * We may want to revisit this and change peek_slot():
|
|
+ */
|
|
+ if (!k.k) {
|
|
+ bkey_init(&iter->k);
|
|
+ iter->k.p = bp.v->pos;
|
|
+ k.k = &iter->k;
|
|
+ }
|
|
+
|
|
+ if (k.k &&
|
|
+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
|
|
+ return k;
|
|
+
|
|
+ bch2_trans_iter_exit(trans, iter);
|
|
+
|
|
+ if (!bp.v->level) {
|
|
+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
|
|
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
|
|
+ } else {
|
|
+ struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
|
|
+ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
|
|
+ return bkey_s_c_null;
|
|
+ if (IS_ERR_OR_NULL(b))
|
|
+ return ((struct bkey_s_c) { .k = ERR_CAST(b) });
|
|
+
|
|
+ return bkey_i_to_s_c(&b->key);
|
|
+ }
|
|
+}
|
|
+
|
|
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
|
|
+ struct bkey_s_c_backpointer bp,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
|
|
+ struct bkey_s_c_backpointer bp,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned iter_flags,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
|
|
+}
|
|
+
|
|
static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
|
|
struct bkey_buf *last_flushed)
|
|
{
|
|
@@ -315,7 +384,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st
|
|
return 0;
|
|
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter alloc_iter = { NULL };
|
|
+ struct btree_iter alloc_iter = {};
|
|
struct bkey_s_c alloc_k;
|
|
struct printbuf buf = PRINTBUF;
|
|
int ret = 0;
|
|
@@ -419,7 +488,8 @@ static int check_extent_checksum(struct btree_trans *trans,
|
|
|
|
bytes = p.crc.compressed_size << 9;
|
|
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
|
|
+ BCH_DEV_READ_REF_check_extent_checksums);
|
|
if (!ca)
|
|
return false;
|
|
|
|
@@ -436,12 +506,11 @@ static int check_extent_checksum(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
|
|
- prt_printf(&buf, "\n ");
|
|
+ prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
|
|
bch2_btree_id_to_text(&buf, btree);
|
|
prt_str(&buf, " ");
|
|
bch2_bkey_val_to_text(&buf, c, extent);
|
|
- prt_printf(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_btree_id_to_text(&buf, o_btree);
|
|
prt_str(&buf, " ");
|
|
bch2_bkey_val_to_text(&buf, c, extent2);
|
|
@@ -457,7 +526,8 @@ static int check_extent_checksum(struct btree_trans *trans,
|
|
if (bio)
|
|
bio_put(bio);
|
|
kvfree(data_buf);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_check_extent_checksums);
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
}
|
|
@@ -504,7 +574,7 @@ static int check_bp_exists(struct btree_trans *trans,
|
|
struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
|
|
|
|
struct bkey_s_c other_extent =
|
|
- bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
|
|
+ __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
|
|
ret = bkey_err(other_extent);
|
|
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
|
|
ret = 0;
|
|
@@ -514,11 +584,27 @@ static int check_bp_exists(struct btree_trans *trans,
|
|
if (!other_extent.k)
|
|
goto missing;
|
|
|
|
+ rcu_read_lock();
|
|
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
|
|
+ if (ca) {
|
|
+ struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
|
|
+ bkey_for_each_ptr(other_extent_ptrs, ptr)
|
|
+ if (ptr->dev == bp->k.p.inode &&
|
|
+ dev_ptr_stale_rcu(ca, ptr)) {
|
|
+ ret = drop_dev_and_update(trans, other_bp.v->btree_id,
|
|
+ other_extent, bp->k.p.inode);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
if (bch2_extents_match(orig_k, other_extent)) {
|
|
printbuf_reset(&buf);
|
|
- prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
|
|
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
|
|
bch2_bkey_val_to_text(&buf, c, orig_k);
|
|
- prt_str(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, other_extent);
|
|
bch_err(c, "%s", buf.buf);
|
|
|
|
@@ -557,20 +643,20 @@ static int check_bp_exists(struct btree_trans *trans,
|
|
}
|
|
|
|
printbuf_reset(&buf);
|
|
- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode);
|
|
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
|
|
bch2_bkey_val_to_text(&buf, c, orig_k);
|
|
- prt_str(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, other_extent);
|
|
bch_err(c, "%s", buf.buf);
|
|
ret = -BCH_ERR_fsck_repair_unimplemented;
|
|
goto err;
|
|
missing:
|
|
printbuf_reset(&buf);
|
|
- prt_str(&buf, "missing backpointer\n for: ");
|
|
+ prt_str(&buf, "missing backpointer\nfor: ");
|
|
bch2_bkey_val_to_text(&buf, c, orig_k);
|
|
- prt_printf(&buf, "\n want: ");
|
|
+ prt_printf(&buf, "\nwant: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
|
|
- prt_printf(&buf, "\n got: ");
|
|
+ prt_printf(&buf, "\ngot: ");
|
|
bch2_bkey_val_to_text(&buf, c, bp_k);
|
|
|
|
if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
|
|
@@ -590,28 +676,38 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
|
|
struct extent_ptr_decoded p;
|
|
|
|
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
- if (p.ptr.cached)
|
|
- continue;
|
|
-
|
|
if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
|
|
continue;
|
|
|
|
rcu_read_lock();
|
|
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
|
|
- bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
|
|
- bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
|
|
- rcu_read_unlock();
|
|
+ if (!ca) {
|
|
+ rcu_read_unlock();
|
|
+ continue;
|
|
+ }
|
|
|
|
- if (check || empty) {
|
|
- struct bkey_i_backpointer bp;
|
|
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
|
|
+ if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) {
|
|
+ rcu_read_unlock();
|
|
+ continue;
|
|
+ }
|
|
|
|
- int ret = check
|
|
- ? check_bp_exists(trans, s, &bp, k)
|
|
- : bch2_bucket_backpointer_mod(trans, k, &bp, true);
|
|
- if (ret)
|
|
- return ret;
|
|
+ u64 b = PTR_BUCKET_NR(ca, &p.ptr);
|
|
+ if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) {
|
|
+ rcu_read_unlock();
|
|
+ continue;
|
|
}
|
|
+
|
|
+ bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ struct bkey_i_backpointer bp;
|
|
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
|
|
+
|
|
+ int ret = !empty
|
|
+ ? check_bp_exists(trans, s, &bp, k)
|
|
+ : bch2_bucket_backpointer_mod(trans, k, &bp, true);
|
|
+ if (ret)
|
|
+ return ret;
|
|
}
|
|
|
|
return 0;
|
|
@@ -630,7 +726,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
|
|
retry:
|
|
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
|
|
0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
|
|
- b = bch2_btree_iter_peek_node(&iter);
|
|
+ b = bch2_btree_iter_peek_node(trans, &iter);
|
|
ret = PTR_ERR_OR_ZERO(b);
|
|
if (ret)
|
|
goto err;
|
|
@@ -649,14 +745,6 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
|
|
-{
|
|
- return (struct bbpos) {
|
|
- .btree = bp.btree_id,
|
|
- .pos = bp.pos,
|
|
- };
|
|
-}
|
|
-
|
|
static u64 mem_may_pin_bytes(struct bch_fs *c)
|
|
{
|
|
struct sysinfo i;
|
|
@@ -715,69 +803,11 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
-struct progress_indicator_state {
|
|
- unsigned long next_print;
|
|
- u64 nodes_seen;
|
|
- u64 nodes_total;
|
|
- struct btree *last_node;
|
|
-};
|
|
-
|
|
-static inline void progress_init(struct progress_indicator_state *s,
|
|
- struct bch_fs *c,
|
|
- u64 btree_id_mask)
|
|
+static inline int bch2_fs_going_ro(struct bch_fs *c)
|
|
{
|
|
- memset(s, 0, sizeof(*s));
|
|
-
|
|
- s->next_print = jiffies + HZ * 10;
|
|
-
|
|
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
|
|
- if (!(btree_id_mask & BIT_ULL(i)))
|
|
- continue;
|
|
-
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_btree,
|
|
- .btree.id = i,
|
|
- };
|
|
-
|
|
- u64 v;
|
|
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
|
|
- s->nodes_total += div64_ul(v, btree_sectors(c));
|
|
- }
|
|
-}
|
|
-
|
|
-static inline bool progress_update_p(struct progress_indicator_state *s)
|
|
-{
|
|
- bool ret = time_after_eq(jiffies, s->next_print);
|
|
-
|
|
- if (ret)
|
|
- s->next_print = jiffies + HZ * 10;
|
|
- return ret;
|
|
-}
|
|
-
|
|
-static void progress_update_iter(struct btree_trans *trans,
|
|
- struct progress_indicator_state *s,
|
|
- struct btree_iter *iter,
|
|
- const char *msg)
|
|
-{
|
|
- struct bch_fs *c = trans->c;
|
|
- struct btree *b = path_l(btree_iter_path(trans, iter))->b;
|
|
-
|
|
- s->nodes_seen += b != s->last_node;
|
|
- s->last_node = b;
|
|
-
|
|
- if (progress_update_p(s)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- unsigned percent = s->nodes_total
|
|
- ? div64_u64(s->nodes_seen * 100, s->nodes_total)
|
|
- : 0;
|
|
-
|
|
- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
|
|
- msg, percent, s->nodes_seen, s->nodes_total);
|
|
- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
|
|
-
|
|
- bch_info(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
+ return test_bit(BCH_FS_going_ro, &c->flags)
|
|
+ ? -EROFS
|
|
+ : 0;
|
|
}
|
|
|
|
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
|
|
@@ -787,7 +817,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
|
|
struct progress_indicator_state progress;
|
|
int ret = 0;
|
|
|
|
- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
|
|
+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
|
|
|
|
for (enum btree_id btree_id = 0;
|
|
btree_id < btree_id_nr_alive(c);
|
|
@@ -806,7 +836,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
|
|
BTREE_ITER_prefetch);
|
|
|
|
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
|
|
- progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
|
|
+ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
|
|
+ bch2_fs_going_ro(c) ?:
|
|
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
|
|
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
}));
|
|
@@ -827,7 +858,7 @@ enum alloc_sector_counter {
|
|
ALLOC_SECTORS_NR
|
|
};
|
|
|
|
-static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
|
|
+static int data_type_to_alloc_counter(enum bch_data_type t)
|
|
{
|
|
switch (t) {
|
|
case BCH_DATA_btree:
|
|
@@ -836,15 +867,17 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
|
|
case BCH_DATA_cached:
|
|
return ALLOC_cached;
|
|
case BCH_DATA_stripe:
|
|
+ case BCH_DATA_parity:
|
|
return ALLOC_stripe;
|
|
default:
|
|
- BUG();
|
|
+ return -1;
|
|
}
|
|
}
|
|
|
|
static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
|
|
|
|
static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
|
|
+ bool *had_mismatch,
|
|
struct bkey_buf *last_flushed)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
@@ -852,6 +885,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
|
|
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
|
|
bool need_commit = false;
|
|
|
|
+ *had_mismatch = false;
|
|
+
|
|
if (a->data_type == BCH_DATA_sb ||
|
|
a->data_type == BCH_DATA_journal ||
|
|
a->data_type == BCH_DATA_parity)
|
|
@@ -889,7 +924,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
|
|
if (bp.v->bucket_gen != a->gen)
|
|
continue;
|
|
|
|
- sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
|
|
+ int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
|
|
+ if (alloc_counter < 0)
|
|
+ continue;
|
|
+
|
|
+ sectors[alloc_counter] += bp.v->bucket_len;
|
|
};
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
if (ret)
|
|
@@ -901,9 +940,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
|
|
goto err;
|
|
}
|
|
|
|
- /* Cached pointers don't have backpointers: */
|
|
-
|
|
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
|
|
+ sectors[ALLOC_cached] != a->cached_sectors ||
|
|
sectors[ALLOC_stripe] != a->stripe_sectors) {
|
|
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
|
|
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
|
|
@@ -912,17 +950,25 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
|
|
}
|
|
|
|
if (sectors[ALLOC_dirty] > a->dirty_sectors ||
|
|
+ sectors[ALLOC_cached] > a->cached_sectors ||
|
|
sectors[ALLOC_stripe] > a->stripe_sectors) {
|
|
ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
|
|
-BCH_ERR_transaction_restart_nested;
|
|
goto err;
|
|
}
|
|
|
|
- if (!sectors[ALLOC_dirty] &&
|
|
- !sectors[ALLOC_stripe])
|
|
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
|
|
- else
|
|
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
|
|
+ bool empty = (sectors[ALLOC_dirty] +
|
|
+ sectors[ALLOC_stripe] +
|
|
+ sectors[ALLOC_cached]) == 0;
|
|
+
|
|
+ ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch,
|
|
+ alloc_k.k->p.offset) ?:
|
|
+ (empty
|
|
+ ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty,
|
|
+ alloc_k.k->p.offset)
|
|
+ : 0);
|
|
+
|
|
+ *had_mismatch = true;
|
|
}
|
|
err:
|
|
bch2_dev_put(ca);
|
|
@@ -946,8 +992,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
|
|
goto next;
|
|
|
|
struct bpos bucket = bp_pos_to_bucket(ca, pos);
|
|
- bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
|
|
- ca->mi.nbuckets, bucket.offset);
|
|
+ u64 next = ca->mi.nbuckets;
|
|
+
|
|
+ unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
|
|
+ if (bitmap)
|
|
+ next = min_t(u64, next,
|
|
+ find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
|
|
+
|
|
+ bucket.offset = next;
|
|
if (bucket.offset == ca->mi.nbuckets)
|
|
goto next;
|
|
|
|
@@ -973,7 +1025,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
|
|
{
|
|
struct btree_iter iter;
|
|
bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
|
|
- struct btree *b = bch2_btree_iter_peek_node(&iter);
|
|
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
|
|
int ret = PTR_ERR_OR_ZERO(b);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1056,28 +1108,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
|
|
{
|
|
int ret = 0;
|
|
|
|
- /*
|
|
- * Can't allow devices to come/go/resize while we have bucket bitmaps
|
|
- * allocated
|
|
- */
|
|
- lockdep_assert_held(&c->state_lock);
|
|
-
|
|
- for_each_member_device(c, ca) {
|
|
- BUG_ON(ca->bucket_backpointer_mismatches);
|
|
- ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
|
|
- sizeof(unsigned long),
|
|
- GFP_KERNEL);
|
|
- ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
|
|
- sizeof(unsigned long),
|
|
- GFP_KERNEL);
|
|
- if (!ca->bucket_backpointer_mismatches ||
|
|
- !ca->bucket_backpointer_empty) {
|
|
- bch2_dev_put(ca);
|
|
- ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
|
|
- goto err_free_bitmaps;
|
|
- }
|
|
- }
|
|
-
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
struct extents_to_bp_state s = { .bp_start = POS_MIN };
|
|
|
|
@@ -1086,23 +1116,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
|
|
|
|
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
|
|
POS_MIN, BTREE_ITER_prefetch, k, ({
|
|
- check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
|
|
+ bool had_mismatch;
|
|
+ bch2_fs_going_ro(c) ?:
|
|
+ check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed);
|
|
}));
|
|
if (ret)
|
|
goto err;
|
|
|
|
- u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
|
|
+ u64 nr_buckets = 0, nr_mismatches = 0;
|
|
for_each_member_device(c, ca) {
|
|
nr_buckets += ca->mi.nbuckets;
|
|
- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
|
|
- nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
|
|
+ nr_mismatches += ca->bucket_backpointer_mismatch.nr;
|
|
}
|
|
|
|
- if (!nr_mismatches && !nr_empty)
|
|
+ if (!nr_mismatches)
|
|
goto err;
|
|
|
|
bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
|
|
- nr_mismatches + nr_empty, nr_buckets);
|
|
+ nr_mismatches, nr_buckets);
|
|
|
|
while (1) {
|
|
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
|
|
@@ -1133,22 +1164,71 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
|
|
|
|
s.bp_start = bpos_successor(s.bp_end);
|
|
}
|
|
+
|
|
+ for_each_member_device(c, ca) {
|
|
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
|
|
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
|
|
+ }
|
|
err:
|
|
bch2_trans_put(trans);
|
|
bch2_bkey_buf_exit(&s.last_flushed, c);
|
|
bch2_btree_cache_unpin(c);
|
|
-err_free_bitmaps:
|
|
- for_each_member_device(c, ca) {
|
|
- kvfree(ca->bucket_backpointer_empty);
|
|
- ca->bucket_backpointer_empty = NULL;
|
|
- kvfree(ca->bucket_backpointer_mismatches);
|
|
- ca->bucket_backpointer_mismatches = NULL;
|
|
- }
|
|
|
|
bch_err_fn(c, ret);
|
|
return ret;
|
|
}
|
|
|
|
+static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans,
|
|
+ struct bpos bucket,
|
|
+ bool *had_mismatch,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ struct btree_iter alloc_iter;
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter,
|
|
+ BTREE_ID_alloc, bucket,
|
|
+ BTREE_ITER_cached);
|
|
+ int ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed);
|
|
+ bch2_trans_iter_exit(trans, &alloc_iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans,
|
|
+ struct bch_dev *ca, u64 bucket,
|
|
+ bool copygc,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ bool had_mismatch;
|
|
+ int ret = lockrestart_do(trans,
|
|
+ check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket),
|
|
+ &had_mismatch, last_flushed));
|
|
+ if (ret || !had_mismatch)
|
|
+ return ret;
|
|
+
|
|
+ u64 nr = ca->bucket_backpointer_mismatch.nr;
|
|
+ u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0;
|
|
+
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ __bch2_log_msg_start(ca->name, &buf);
|
|
+
|
|
+ prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n",
|
|
+ bucket, nr, ca->mi.nbuckets);
|
|
+
|
|
+ bch2_run_explicit_recovery_pass(c, &buf,
|
|
+ BCH_RECOVERY_PASS_check_extents_to_backpointers,
|
|
+ nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0);
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* backpointers -> extents */
|
|
+
|
|
static int check_one_backpointer(struct btree_trans *trans,
|
|
struct bbpos start,
|
|
struct bbpos end,
|
|
@@ -1206,11 +1286,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
|
|
|
|
bch2_bkey_buf_init(&last_flushed);
|
|
bkey_init(&last_flushed.k->k);
|
|
- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
|
|
+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
|
|
|
|
int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
|
|
POS_MIN, BTREE_ITER_prefetch, k, ({
|
|
- progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
|
|
+ bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
|
|
check_one_backpointer(trans, start, end, k, &last_flushed);
|
|
}));
|
|
|
|
@@ -1264,3 +1344,48 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
|
|
bch_err_fn(c, ret);
|
|
return ret;
|
|
}
|
|
+
|
|
+static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit)
|
|
+{
|
|
+ scoped_guard(mutex, &b->lock) {
|
|
+ if (!b->buckets) {
|
|
+ b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
|
|
+ sizeof(unsigned long), GFP_KERNEL);
|
|
+ if (!b->buckets)
|
|
+ return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
|
|
+ }
|
|
+
|
|
+ b->nr += !__test_and_set_bit(bit, b->buckets);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size)
|
|
+{
|
|
+ scoped_guard(mutex, &b->lock) {
|
|
+ if (!b->buckets)
|
|
+ return 0;
|
|
+
|
|
+ unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
|
|
+ sizeof(unsigned long), GFP_KERNEL);
|
|
+ if (!n)
|
|
+ return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
|
|
+
|
|
+ memcpy(n, b->buckets,
|
|
+ BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
|
|
+ kvfree(b->buckets);
|
|
+ b->buckets = n;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_bucket_bitmap_free(struct bucket_bitmap *b)
|
|
+{
|
|
+ mutex_lock(&b->lock);
|
|
+ kvfree(b->buckets);
|
|
+ b->buckets = NULL;
|
|
+ b->nr = 0;
|
|
+ mutex_unlock(&b->lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
|
|
index 060dad1521ee..6840561084ce 100644
|
|
--- a/fs/bcachefs/backpointers.h
|
|
+++ b/fs/bcachefs/backpointers.h
|
|
@@ -1,6 +1,6 @@
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
|
|
-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
|
|
+#ifndef _BCACHEFS_BACKPOINTERS_H
|
|
+#define _BCACHEFS_BACKPOINTERS_H
|
|
|
|
#include "btree_cache.h"
|
|
#include "btree_iter.h"
|
|
@@ -102,7 +102,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
|
|
struct bkey_i_backpointer *bp,
|
|
bool insert)
|
|
{
|
|
- if (unlikely(bch2_backpointers_no_use_write_buffer))
|
|
+ if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer))
|
|
return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
|
|
|
|
if (!insert) {
|
|
@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
|
|
return BCH_DATA_btree;
|
|
case KEY_TYPE_extent:
|
|
case KEY_TYPE_reflink_v:
|
|
- return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
|
|
+ if (p.has_ec)
|
|
+ return BCH_DATA_stripe;
|
|
+ if (p.ptr.cached)
|
|
+ return BCH_DATA_cached;
|
|
+ else
|
|
+ return BCH_DATA_user;
|
|
case KEY_TYPE_stripe: {
|
|
const struct bch_extent_ptr *ptr = &entry->ptr;
|
|
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
|
|
@@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
|
|
struct bkey_i_backpointer *bp)
|
|
{
|
|
bkey_backpointer_init(&bp->k_i);
|
|
- bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
|
|
+ bp->k.p.inode = p.ptr.dev;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_stripe)
|
|
+ bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
|
|
+ else {
|
|
+ /*
|
|
+ * Put stripe backpointers where they won't collide with the
|
|
+ * extent backpointers within the stripe:
|
|
+ */
|
|
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
|
|
+ bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
|
|
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
|
|
+ }
|
|
+
|
|
bp->v = (struct bch_backpointer) {
|
|
.btree_id = btree_id,
|
|
.level = level,
|
|
@@ -164,8 +182,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b
|
|
struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
|
|
struct btree_iter *, struct bkey_buf *);
|
|
|
|
+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64,
|
|
+ bool, struct bkey_buf *);
|
|
+
|
|
int bch2_check_btree_backpointers(struct bch_fs *);
|
|
int bch2_check_extents_to_backpointers(struct bch_fs *);
|
|
int bch2_check_backpointers_to_extents(struct bch_fs *);
|
|
|
|
+static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
|
|
+{
|
|
+ unsigned long *bitmap = READ_ONCE(b->buckets);
|
|
+ return bitmap && test_bit(i, bitmap);
|
|
+}
|
|
+
|
|
+int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64);
|
|
+void bch2_bucket_bitmap_free(struct bucket_bitmap *);
|
|
+
|
|
#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
|
|
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
|
|
index 161cf2f05d2a..7824da2af9d0 100644
|
|
--- a/fs/bcachefs/bcachefs.h
|
|
+++ b/fs/bcachefs/bcachefs.h
|
|
@@ -203,22 +203,24 @@
|
|
#include <linux/types.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/zstd.h>
|
|
+#include <linux/unicode.h>
|
|
|
|
#include "bcachefs_format.h"
|
|
#include "btree_journal_iter_types.h"
|
|
#include "disk_accounting_types.h"
|
|
#include "errcode.h"
|
|
+#include "fast_list.h"
|
|
#include "fifo.h"
|
|
#include "nocow_locking_types.h"
|
|
#include "opts.h"
|
|
-#include "recovery_passes_types.h"
|
|
#include "sb-errors_types.h"
|
|
#include "seqmutex.h"
|
|
+#include "snapshot_types.h"
|
|
#include "time_stats.h"
|
|
#include "util.h"
|
|
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-#define BCH_WRITE_REF_DEBUG
|
|
+#define ENUMERATED_REF_DEBUG
|
|
#endif
|
|
|
|
#ifndef dynamic_fault
|
|
@@ -268,7 +270,8 @@ do { \
|
|
|
|
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
|
|
|
|
-void bch2_print_str(struct bch_fs *, const char *);
|
|
+void bch2_print_str(struct bch_fs *, const char *, const char *);
|
|
+void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *);
|
|
|
|
__printf(2, 3)
|
|
void bch2_print_opts(struct bch_opts *, const char *, ...);
|
|
@@ -292,6 +295,16 @@ do { \
|
|
bch2_print(_c, __VA_ARGS__); \
|
|
} while (0)
|
|
|
|
+#define bch2_print_str_ratelimited(_c, ...) \
|
|
+do { \
|
|
+ static DEFINE_RATELIMIT_STATE(_rs, \
|
|
+ DEFAULT_RATELIMIT_INTERVAL, \
|
|
+ DEFAULT_RATELIMIT_BURST); \
|
|
+ \
|
|
+ if (__ratelimit(&_rs)) \
|
|
+ bch2_print_str(_c, __VA_ARGS__); \
|
|
+} while (0)
|
|
+
|
|
#define bch_info(c, fmt, ...) \
|
|
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
#define bch_info_ratelimited(c, fmt, ...) \
|
|
@@ -389,17 +402,20 @@ do { \
|
|
"compare them") \
|
|
BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
|
|
"Don't use the write buffer for backpointers, enabling "\
|
|
- "extra runtime checks")
|
|
-
|
|
-/* Parameters that should only be compiled in debug mode: */
|
|
-#define BCH_DEBUG_PARAMS_DEBUG() \
|
|
- BCH_DEBUG_PARAM(expensive_debug_checks, \
|
|
- "Enables various runtime debugging checks that " \
|
|
- "significantly affect performance") \
|
|
+ "extra runtime checks") \
|
|
+ BCH_DEBUG_PARAM(debug_check_btree_locking, \
|
|
+ "Enable additional asserts for btree locking") \
|
|
BCH_DEBUG_PARAM(debug_check_iterators, \
|
|
"Enables extra verification for btree iterators") \
|
|
+ BCH_DEBUG_PARAM(debug_check_bset_lookups, \
|
|
+ "Enables extra verification for bset lookups") \
|
|
BCH_DEBUG_PARAM(debug_check_btree_accounting, \
|
|
"Verify btree accounting for keys within a node") \
|
|
+ BCH_DEBUG_PARAM(debug_check_bkey_unpack, \
|
|
+ "Enables extra verification for bkey unpack")
|
|
+
|
|
+/* Parameters that should only be compiled in debug mode: */
|
|
+#define BCH_DEBUG_PARAMS_DEBUG() \
|
|
BCH_DEBUG_PARAM(journal_seq_verify, \
|
|
"Store the journal sequence number in the version " \
|
|
"number of every btree key, and verify that btree " \
|
|
@@ -426,28 +442,28 @@ do { \
|
|
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
|
|
#endif
|
|
|
|
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
|
|
-BCH_DEBUG_PARAMS()
|
|
+#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
|
|
+BCH_DEBUG_PARAMS_ALL()
|
|
#undef BCH_DEBUG_PARAM
|
|
|
|
-#ifndef CONFIG_BCACHEFS_DEBUG
|
|
-#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
|
|
-BCH_DEBUG_PARAMS_DEBUG()
|
|
-#undef BCH_DEBUG_PARAM
|
|
-#endif
|
|
-
|
|
#define BCH_TIME_STATS() \
|
|
x(btree_node_mem_alloc) \
|
|
x(btree_node_split) \
|
|
x(btree_node_compact) \
|
|
x(btree_node_merge) \
|
|
x(btree_node_sort) \
|
|
+ x(btree_node_get) \
|
|
x(btree_node_read) \
|
|
x(btree_node_read_done) \
|
|
+ x(btree_node_write) \
|
|
x(btree_interior_update_foreground) \
|
|
x(btree_interior_update_total) \
|
|
x(btree_gc) \
|
|
x(data_write) \
|
|
+ x(data_write_to_submit) \
|
|
+ x(data_write_to_queue) \
|
|
+ x(data_write_to_btree_update) \
|
|
+ x(data_write_btree_update) \
|
|
x(data_read) \
|
|
x(data_promote) \
|
|
x(journal_flush_write) \
|
|
@@ -456,6 +472,7 @@ BCH_DEBUG_PARAMS_DEBUG()
|
|
x(blocked_journal_low_on_space) \
|
|
x(blocked_journal_low_on_pin) \
|
|
x(blocked_journal_max_in_flight) \
|
|
+ x(blocked_journal_max_open) \
|
|
x(blocked_key_cache_flush) \
|
|
x(blocked_allocate) \
|
|
x(blocked_allocate_open_bucket) \
|
|
@@ -470,6 +487,7 @@ enum bch_time_stats {
|
|
};
|
|
|
|
#include "alloc_types.h"
|
|
+#include "async_objs_types.h"
|
|
#include "btree_gc_types.h"
|
|
#include "btree_types.h"
|
|
#include "btree_node_scan_types.h"
|
|
@@ -479,10 +497,12 @@ enum bch_time_stats {
|
|
#include "clock_types.h"
|
|
#include "disk_groups_types.h"
|
|
#include "ec_types.h"
|
|
+#include "enumerated_ref_types.h"
|
|
#include "journal_types.h"
|
|
#include "keylist_types.h"
|
|
#include "quota_types.h"
|
|
#include "rebalance_types.h"
|
|
+#include "recovery_passes_types.h"
|
|
#include "replicas_types.h"
|
|
#include "sb-members_types.h"
|
|
#include "subvolume_types.h"
|
|
@@ -511,6 +531,57 @@ struct discard_in_flight {
|
|
u64 bucket:63;
|
|
};
|
|
|
|
+#define BCH_DEV_READ_REFS() \
|
|
+ x(bch2_online_devs) \
|
|
+ x(trans_mark_dev_sbs) \
|
|
+ x(read_fua_test) \
|
|
+ x(sb_field_resize) \
|
|
+ x(write_super) \
|
|
+ x(journal_read) \
|
|
+ x(fs_journal_alloc) \
|
|
+ x(fs_resize_on_mount) \
|
|
+ x(btree_node_read) \
|
|
+ x(btree_node_read_all_replicas) \
|
|
+ x(btree_node_scrub) \
|
|
+ x(btree_node_write) \
|
|
+ x(btree_node_scan) \
|
|
+ x(btree_verify_replicas) \
|
|
+ x(btree_node_ondisk_to_text) \
|
|
+ x(io_read) \
|
|
+ x(check_extent_checksums) \
|
|
+ x(ec_block)
|
|
+
|
|
+enum bch_dev_read_ref {
|
|
+#define x(n) BCH_DEV_READ_REF_##n,
|
|
+ BCH_DEV_READ_REFS()
|
|
+#undef x
|
|
+ BCH_DEV_READ_REF_NR,
|
|
+};
|
|
+
|
|
+#define BCH_DEV_WRITE_REFS() \
|
|
+ x(journal_write) \
|
|
+ x(journal_do_discards) \
|
|
+ x(dev_do_discards) \
|
|
+ x(discard_one_bucket_fast) \
|
|
+ x(do_invalidates) \
|
|
+ x(nocow_flush) \
|
|
+ x(io_write) \
|
|
+ x(ec_block) \
|
|
+ x(ec_bucket_zero)
|
|
+
|
|
+enum bch_dev_write_ref {
|
|
+#define x(n) BCH_DEV_WRITE_REF_##n,
|
|
+ BCH_DEV_WRITE_REFS()
|
|
+#undef x
|
|
+ BCH_DEV_WRITE_REF_NR,
|
|
+};
|
|
+
|
|
+struct bucket_bitmap {
|
|
+ unsigned long *buckets;
|
|
+ u64 nr;
|
|
+ struct mutex lock;
|
|
+};
|
|
+
|
|
struct bch_dev {
|
|
struct kobject kobj;
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
@@ -521,8 +592,7 @@ struct bch_dev {
|
|
struct percpu_ref ref;
|
|
#endif
|
|
struct completion ref_completion;
|
|
- struct percpu_ref io_ref;
|
|
- struct completion io_ref_completion;
|
|
+ struct enumerated_ref io_ref[2];
|
|
|
|
struct bch_fs *fs;
|
|
|
|
@@ -533,6 +603,7 @@ struct bch_dev {
|
|
*/
|
|
struct bch_member_cpu mi;
|
|
atomic64_t errors[BCH_MEMBER_ERROR_NR];
|
|
+ unsigned long write_errors_start;
|
|
|
|
__uuid_t uuid;
|
|
char name[BDEVNAME_SIZE];
|
|
@@ -555,10 +626,11 @@ struct bch_dev {
|
|
u8 *oldest_gen;
|
|
unsigned long *buckets_nouse;
|
|
|
|
- unsigned long *bucket_backpointer_mismatches;
|
|
- unsigned long *bucket_backpointer_empty;
|
|
+ struct bucket_bitmap bucket_backpointer_mismatch;
|
|
+ struct bucket_bitmap bucket_backpointer_empty;
|
|
|
|
- struct bch_dev_usage __percpu *usage;
|
|
+ struct bch_dev_usage_full __percpu
|
|
+ *usage;
|
|
|
|
/* Allocator: */
|
|
u64 alloc_cursor[3];
|
|
@@ -567,10 +639,6 @@ struct bch_dev {
|
|
unsigned nr_partial_buckets;
|
|
unsigned nr_btree_reserve;
|
|
|
|
- size_t inc_gen_needs_gc;
|
|
- size_t inc_gen_really_needs_gc;
|
|
- size_t buckets_waiting_on_journal;
|
|
-
|
|
struct work_struct invalidate_work;
|
|
struct work_struct discard_work;
|
|
struct mutex discard_buckets_in_flight_lock;
|
|
@@ -609,21 +677,23 @@ struct bch_dev {
|
|
x(accounting_replay_done) \
|
|
x(may_go_rw) \
|
|
x(rw) \
|
|
+ x(rw_init_done) \
|
|
x(was_rw) \
|
|
x(stopping) \
|
|
x(emergency_ro) \
|
|
x(going_ro) \
|
|
x(write_disable_complete) \
|
|
x(clean_shutdown) \
|
|
- x(recovery_running) \
|
|
- x(fsck_running) \
|
|
+ x(in_recovery) \
|
|
+ x(in_fsck) \
|
|
x(initial_gc_unfixed) \
|
|
x(need_delete_dead_snapshots) \
|
|
x(error) \
|
|
x(topology_error) \
|
|
x(errors_fixed) \
|
|
x(errors_not_fixed) \
|
|
- x(no_invalid_checks)
|
|
+ x(no_invalid_checks) \
|
|
+ x(discard_mount_opt_set) \
|
|
|
|
enum bch_fs_flags {
|
|
#define x(n) BCH_FS_##n,
|
|
@@ -642,8 +712,10 @@ struct btree_transaction_stats {
|
|
struct bch2_time_stats lock_hold_times;
|
|
struct mutex lock;
|
|
unsigned nr_max_paths;
|
|
- unsigned journal_entries_size;
|
|
unsigned max_mem;
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ darray_trans_kmalloc_trace trans_kmalloc_trace;
|
|
+#endif
|
|
char *max_paths_text;
|
|
};
|
|
|
|
@@ -664,9 +736,6 @@ struct btree_trans_buf {
|
|
struct btree_trans *trans;
|
|
};
|
|
|
|
-#define BCACHEFS_ROOT_SUBVOL_INUM \
|
|
- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
|
|
-
|
|
#define BCH_WRITE_REFS() \
|
|
x(journal) \
|
|
x(trans) \
|
|
@@ -687,7 +756,9 @@ struct btree_trans_buf {
|
|
x(gc_gens) \
|
|
x(snapshot_delete_pagecache) \
|
|
x(sysfs) \
|
|
- x(btree_write_buffer)
|
|
+ x(btree_write_buffer) \
|
|
+ x(btree_node_scrub) \
|
|
+ x(async_recovery_passes)
|
|
|
|
enum bch_write_ref {
|
|
#define x(n) BCH_WRITE_REF_##n,
|
|
@@ -696,6 +767,8 @@ enum bch_write_ref {
|
|
BCH_WRITE_REF_NR,
|
|
};
|
|
|
|
+#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
|
|
+
|
|
struct bch_fs {
|
|
struct closure cl;
|
|
|
|
@@ -719,11 +792,7 @@ struct bch_fs {
|
|
struct rw_semaphore state_lock;
|
|
|
|
/* Counts outstanding writes, for clean transition to read-only */
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
- atomic_long_t writes[BCH_WRITE_REF_NR];
|
|
-#else
|
|
- struct percpu_ref writes;
|
|
-#endif
|
|
+ struct enumerated_ref writes;
|
|
/*
|
|
* Certain operations are only allowed in single threaded mode, during
|
|
* recovery, and we want to assert that this is the case:
|
|
@@ -767,6 +836,7 @@ struct bch_fs {
|
|
|
|
u8 nr_devices;
|
|
u8 clean;
|
|
+ bool multi_device; /* true if we've ever had more than one device */
|
|
|
|
u8 encryption_type;
|
|
|
|
@@ -776,10 +846,16 @@ struct bch_fs {
|
|
unsigned nsec_per_time_unit;
|
|
u64 features;
|
|
u64 compat;
|
|
+ u64 recovery_passes_required;
|
|
unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
|
|
u64 btrees_lost_data;
|
|
} sb;
|
|
+ DARRAY(enum bcachefs_metadata_version)
|
|
+ incompat_versions_requested;
|
|
|
|
+#ifdef CONFIG_UNICODE
|
|
+ struct unicode_map *cf_encoding;
|
|
+#endif
|
|
|
|
struct bch_sb_handle disk_sb;
|
|
|
|
@@ -795,7 +871,7 @@ struct bch_fs {
|
|
struct mutex snapshot_table_lock;
|
|
struct rw_semaphore snapshot_create_lock;
|
|
|
|
- struct work_struct snapshot_delete_work;
|
|
+ struct snapshot_delete snapshot_delete;
|
|
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
|
|
snapshot_id_list snapshots_unlinked;
|
|
struct mutex snapshots_unlinked_lock;
|
|
@@ -860,7 +936,7 @@ struct bch_fs {
|
|
struct btree_write_buffer btree_write_buffer;
|
|
|
|
struct workqueue_struct *btree_update_wq;
|
|
- struct workqueue_struct *btree_io_complete_wq;
|
|
+ struct workqueue_struct *btree_write_complete_wq;
|
|
/* copygc needs its own workqueue for index updates.. */
|
|
struct workqueue_struct *copygc_wq;
|
|
/*
|
|
@@ -871,6 +947,7 @@ struct bch_fs {
|
|
struct workqueue_struct *write_ref_wq;
|
|
|
|
/* ALLOCATION */
|
|
+ struct bch_devs_mask online_devs;
|
|
struct bch_devs_mask rw_devs[BCH_DATA_NR];
|
|
unsigned long rw_devs_change_count;
|
|
|
|
@@ -965,13 +1042,16 @@ struct bch_fs {
|
|
nocow_locks;
|
|
struct rhashtable promote_table;
|
|
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR];
|
|
+#endif
|
|
+
|
|
mempool_t compression_bounce[2];
|
|
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
|
|
size_t zstd_workspace_size;
|
|
|
|
- struct crypto_shash *sha256;
|
|
- struct crypto_sync_skcipher *chacha20;
|
|
- struct crypto_shash *poly1305;
|
|
+ struct bch_key chacha20_key;
|
|
+ bool chacha20_key_set;
|
|
|
|
atomic64_t key_version;
|
|
|
|
@@ -993,15 +1073,11 @@ struct bch_fs {
|
|
wait_queue_head_t copygc_running_wq;
|
|
|
|
/* STRIPES: */
|
|
- GENRADIX(struct stripe) stripes;
|
|
GENRADIX(struct gc_stripe) gc_stripes;
|
|
|
|
struct hlist_head ec_stripes_new[32];
|
|
spinlock_t ec_stripes_new_lock;
|
|
|
|
- ec_stripes_heap ec_stripes_heap;
|
|
- struct mutex ec_stripes_heap_lock;
|
|
-
|
|
/* ERASURE CODING */
|
|
struct list_head ec_stripe_head_list;
|
|
struct mutex ec_stripe_head_lock;
|
|
@@ -1039,25 +1115,12 @@ struct bch_fs {
|
|
/* RECOVERY */
|
|
u64 journal_replay_seq_start;
|
|
u64 journal_replay_seq_end;
|
|
- /*
|
|
- * Two different uses:
|
|
- * "Has this fsck pass?" - i.e. should this type of error be an
|
|
- * emergency read-only
|
|
- * And, in certain situations fsck will rewind to an earlier pass: used
|
|
- * for signaling to the toplevel code which pass we want to run now.
|
|
- */
|
|
- enum bch_recovery_pass curr_recovery_pass;
|
|
- enum bch_recovery_pass next_recovery_pass;
|
|
- /* bitmask of recovery passes that we actually ran */
|
|
- u64 recovery_passes_complete;
|
|
- /* never rewinds version of curr_recovery_pass */
|
|
- enum bch_recovery_pass recovery_pass_done;
|
|
- spinlock_t recovery_pass_lock;
|
|
- struct semaphore online_fsck_mutex;
|
|
+ struct bch_fs_recovery recovery;
|
|
|
|
/* DEBUG JUNK */
|
|
struct dentry *fs_debug_dir;
|
|
struct dentry *btree_debug_dir;
|
|
+ struct dentry *async_obj_dir;
|
|
struct btree_debug btree_debug[BTREE_ID_NR];
|
|
struct btree *verify_data;
|
|
struct btree_node *verify_ondisk;
|
|
@@ -1099,54 +1162,6 @@ struct bch_fs {
|
|
|
|
extern struct wait_queue_head bch2_read_only_wait;
|
|
|
|
-static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
|
|
-{
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
- atomic_long_inc(&c->writes[ref]);
|
|
-#else
|
|
- percpu_ref_get(&c->writes);
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
|
|
-{
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
|
|
- atomic_long_inc_not_zero(&c->writes[ref]);
|
|
-#else
|
|
- return percpu_ref_tryget(&c->writes);
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
|
|
-{
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
|
|
- atomic_long_inc_not_zero(&c->writes[ref]);
|
|
-#else
|
|
- return percpu_ref_tryget_live(&c->writes);
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
|
|
-{
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
- long v = atomic_long_dec_return(&c->writes[ref]);
|
|
-
|
|
- BUG_ON(v < 0);
|
|
- if (v)
|
|
- return;
|
|
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
|
|
- if (atomic_long_read(&c->writes[i]))
|
|
- return;
|
|
-
|
|
- set_bit(BCH_FS_write_disable_complete, &c->flags);
|
|
- wake_up(&bch2_read_only_wait);
|
|
-#else
|
|
- percpu_ref_put(&c->writes);
|
|
-#endif
|
|
-}
|
|
-
|
|
static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
|
|
{
|
|
if (test_bit(BCH_FS_stopping, &c->flags))
|
|
@@ -1247,4 +1262,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c)
|
|
#define BKEY_PADDED_ONSTACK(key, pad) \
|
|
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
|
|
|
|
+/*
|
|
+ * This is needed because discard is both a filesystem option and a device
|
|
+ * option, and mount options are supposed to apply to that mount and not be
|
|
+ * persisted, i.e. if it's set as a mount option we can't propagate it to the
|
|
+ * device.
|
|
+ */
|
|
+static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
|
|
+ ? c->opts.discard
|
|
+ : ca->mi.discard;
|
|
+}
|
|
+
|
|
#endif /* _BCACHEFS_H */
|
|
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
|
|
index f70f0108401f..b4a04df5ea95 100644
|
|
--- a/fs/bcachefs/bcachefs_format.h
|
|
+++ b/fs/bcachefs/bcachefs_format.h
|
|
@@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k)
|
|
#define __BKEY_PADDED(key, pad) \
|
|
struct bkey_i key; __u64 key ## _pad[pad]
|
|
|
|
+enum bch_bkey_type_flags {
|
|
+ BKEY_TYPE_strict_btree_checks = BIT(0),
|
|
+};
|
|
+
|
|
/*
|
|
* - DELETED keys are used internally to mark keys that should be ignored but
|
|
* override keys in composition order. Their version number is ignored.
|
|
@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k)
|
|
*
|
|
* - WHITEOUT: for hash table btrees
|
|
*/
|
|
-#define BCH_BKEY_TYPES() \
|
|
- x(deleted, 0) \
|
|
- x(whiteout, 1) \
|
|
- x(error, 2) \
|
|
- x(cookie, 3) \
|
|
- x(hash_whiteout, 4) \
|
|
- x(btree_ptr, 5) \
|
|
- x(extent, 6) \
|
|
- x(reservation, 7) \
|
|
- x(inode, 8) \
|
|
- x(inode_generation, 9) \
|
|
- x(dirent, 10) \
|
|
- x(xattr, 11) \
|
|
- x(alloc, 12) \
|
|
- x(quota, 13) \
|
|
- x(stripe, 14) \
|
|
- x(reflink_p, 15) \
|
|
- x(reflink_v, 16) \
|
|
- x(inline_data, 17) \
|
|
- x(btree_ptr_v2, 18) \
|
|
- x(indirect_inline_data, 19) \
|
|
- x(alloc_v2, 20) \
|
|
- x(subvolume, 21) \
|
|
- x(snapshot, 22) \
|
|
- x(inode_v2, 23) \
|
|
- x(alloc_v3, 24) \
|
|
- x(set, 25) \
|
|
- x(lru, 26) \
|
|
- x(alloc_v4, 27) \
|
|
- x(backpointer, 28) \
|
|
- x(inode_v3, 29) \
|
|
- x(bucket_gens, 30) \
|
|
- x(snapshot_tree, 31) \
|
|
- x(logged_op_truncate, 32) \
|
|
- x(logged_op_finsert, 33) \
|
|
- x(accounting, 34) \
|
|
- x(inode_alloc_cursor, 35)
|
|
+#define BCH_BKEY_TYPES() \
|
|
+ x(deleted, 0, 0) \
|
|
+ x(whiteout, 1, 0) \
|
|
+ x(error, 2, 0) \
|
|
+ x(cookie, 3, 0) \
|
|
+ x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(extent, 6, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(inode, 8, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(quota, 13, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(set, 25, 0) \
|
|
+ x(lru, 26, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
|
|
+ x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
|
|
|
|
enum bch_bkey_type {
|
|
-#define x(name, nr) KEY_TYPE_##name = nr,
|
|
+#define x(name, nr, ...) KEY_TYPE_##name = nr,
|
|
BCH_BKEY_TYPES()
|
|
#undef x
|
|
KEY_TYPE_MAX,
|
|
@@ -493,7 +497,8 @@ struct bch_sb_field {
|
|
x(members_v2, 11) \
|
|
x(errors, 12) \
|
|
x(ext, 13) \
|
|
- x(downgrade, 14)
|
|
+ x(downgrade, 14) \
|
|
+ x(recovery_passes, 15)
|
|
|
|
#include "alloc_background_format.h"
|
|
#include "dirent_format.h"
|
|
@@ -506,6 +511,7 @@ struct bch_sb_field {
|
|
#include "logged_ops_format.h"
|
|
#include "lru_format.h"
|
|
#include "quota_format.h"
|
|
+#include "recovery_passes_format.h"
|
|
#include "reflink_format.h"
|
|
#include "replicas_format.h"
|
|
#include "snapshot_format.h"
|
|
@@ -686,7 +692,15 @@ struct bch_sb_field_ext {
|
|
x(inode_depth, BCH_VERSION(1, 17)) \
|
|
x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
|
|
x(autofix_errors, BCH_VERSION(1, 19)) \
|
|
- x(directory_size, BCH_VERSION(1, 20))
|
|
+ x(directory_size, BCH_VERSION(1, 20)) \
|
|
+ x(cached_backpointers, BCH_VERSION(1, 21)) \
|
|
+ x(stripe_backpointers, BCH_VERSION(1, 22)) \
|
|
+ x(stripe_lru, BCH_VERSION(1, 23)) \
|
|
+ x(casefolding, BCH_VERSION(1, 24)) \
|
|
+ x(extent_flags, BCH_VERSION(1, 25)) \
|
|
+ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
|
|
+ x(fast_device_removal, BCH_VERSION(1, 27)) \
|
|
+ x(inode_has_case_insensitive, BCH_VERSION(1, 28))
|
|
|
|
enum bcachefs_metadata_version {
|
|
bcachefs_metadata_version_min = 9,
|
|
@@ -837,6 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
|
|
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
|
|
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
|
|
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
|
|
+LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64);
|
|
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
|
|
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
|
|
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
|
|
@@ -855,6 +870,11 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
|
|
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
|
|
struct bch_sb, flags[5], 48, 64);
|
|
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
|
|
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
|
|
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
|
|
+LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22);
|
|
+LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
|
|
+LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24);
|
|
|
|
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
|
|
{
|
|
@@ -908,7 +928,10 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
|
|
x(journal_no_flush, 16) \
|
|
x(alloc_v2, 17) \
|
|
x(extents_across_btree_nodes, 18) \
|
|
- x(incompat_version_field, 19)
|
|
+ x(incompat_version_field, 19) \
|
|
+ x(casefolding, 20) \
|
|
+ x(no_alloc_info, 21) \
|
|
+ x(small_image, 22)
|
|
|
|
#define BCH_SB_FEATURES_ALWAYS \
|
|
(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
|
|
@@ -922,7 +945,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
|
|
BIT_ULL(BCH_FEATURE_new_siphash)| \
|
|
BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
|
|
BIT_ULL(BCH_FEATURE_new_varint)| \
|
|
- BIT_ULL(BCH_FEATURE_journal_no_flush))
|
|
+ BIT_ULL(BCH_FEATURE_journal_no_flush)| \
|
|
+ BIT_ULL(BCH_FEATURE_incompat_version_field))
|
|
|
|
enum bch_sb_feature {
|
|
#define x(f, n) BCH_FEATURE_##f,
|
|
@@ -974,6 +998,19 @@ enum bch_error_actions {
|
|
BCH_ON_ERROR_NR
|
|
};
|
|
|
|
+#define BCH_DEGRADED_ACTIONS() \
|
|
+ x(ask, 0) \
|
|
+ x(yes, 1) \
|
|
+ x(very, 2) \
|
|
+ x(no, 3)
|
|
+
|
|
+enum bch_degraded_actions {
|
|
+#define x(t, n) BCH_DEGRADED_##t = n,
|
|
+ BCH_DEGRADED_ACTIONS()
|
|
+#undef x
|
|
+ BCH_DEGRADED_ACTIONS_NR
|
|
+};
|
|
+
|
|
#define BCH_STR_HASH_TYPES() \
|
|
x(crc32c, 0) \
|
|
x(crc64, 1) \
|
|
@@ -1133,7 +1170,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
|
|
x(log, 9) \
|
|
x(overwrite, 10) \
|
|
x(write_buffer_keys, 11) \
|
|
- x(datetime, 12)
|
|
+ x(datetime, 12) \
|
|
+ x(log_bkey, 13)
|
|
|
|
enum bch_jset_entry_type {
|
|
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
|
|
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
|
|
index 3c23bdf788ce..52594e925eb7 100644
|
|
--- a/fs/bcachefs/bcachefs_ioctl.h
|
|
+++ b/fs/bcachefs/bcachefs_ioctl.h
|
|
@@ -87,6 +87,7 @@ struct bch_ioctl_incremental {
|
|
#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
|
|
#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
|
|
#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
|
|
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
|
|
|
|
/* ioctl below act on a particular file, not the filesystem as a whole: */
|
|
|
|
@@ -213,6 +214,10 @@ struct bch_ioctl_data {
|
|
struct bpos end_pos;
|
|
|
|
union {
|
|
+ struct {
|
|
+ __u32 dev;
|
|
+ __u32 data_types;
|
|
+ } scrub;
|
|
struct {
|
|
__u32 dev;
|
|
__u32 pad;
|
|
@@ -229,6 +234,11 @@ enum bch_data_event {
|
|
BCH_DATA_EVENT_NR = 1,
|
|
};
|
|
|
|
+enum data_progress_data_type_special {
|
|
+ DATA_PROGRESS_DATA_TYPE_phys = 254,
|
|
+ DATA_PROGRESS_DATA_TYPE_done = 255,
|
|
+};
|
|
+
|
|
struct bch_ioctl_data_progress {
|
|
__u8 data_type;
|
|
__u8 btree_id;
|
|
@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress {
|
|
|
|
__u64 sectors_done;
|
|
__u64 sectors_total;
|
|
+ __u64 sectors_error_corrected;
|
|
+ __u64 sectors_error_uncorrected;
|
|
} __packed __aligned(8);
|
|
|
|
+enum bch_ioctl_data_event_ret {
|
|
+ BCH_IOCTL_DATA_EVENT_RET_done = 1,
|
|
+ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
|
|
+};
|
|
+
|
|
struct bch_ioctl_data_event {
|
|
__u8 type;
|
|
- __u8 pad[7];
|
|
+ __u8 ret;
|
|
+ __u8 pad[6];
|
|
union {
|
|
struct bch_ioctl_data_progress p;
|
|
__u64 pad2[15];
|
|
@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting {
|
|
struct bkey_i_accounting accounting[];
|
|
};
|
|
|
|
+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
|
|
+
|
|
+struct bch_ioctl_query_counters {
|
|
+ __u16 nr;
|
|
+ __u16 flags;
|
|
+ __u32 pad;
|
|
+ __u64 d[];
|
|
+};
|
|
+
|
|
#endif /* _BCACHEFS_IOCTL_H */
|
|
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
|
|
index 995ba32e9b6e..ee823c640642 100644
|
|
--- a/fs/bcachefs/bkey.c
|
|
+++ b/fs/bcachefs/bkey.c
|
|
@@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
|
|
}
|
|
}
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-
|
|
-static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
|
|
- const struct bkey *unpacked,
|
|
- const struct bkey_format *format)
|
|
+static void __bch2_bkey_pack_verify(const struct bkey_packed *packed,
|
|
+ const struct bkey *unpacked,
|
|
+ const struct bkey_format *format)
|
|
{
|
|
struct bkey tmp;
|
|
|
|
@@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
|
|
}
|
|
}
|
|
|
|
-#else
|
|
static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
|
|
- const struct bkey *unpacked,
|
|
- const struct bkey_format *format) {}
|
|
-#endif
|
|
+ const struct bkey *unpacked,
|
|
+ const struct bkey_format *format)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack))
|
|
+ __bch2_bkey_pack_verify(packed, unpacked, format);
|
|
+}
|
|
|
|
struct pack_state {
|
|
const struct bkey_format *format;
|
|
@@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
|
|
return ret;
|
|
}
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
static bool bkey_packed_successor(struct bkey_packed *out,
|
|
const struct btree *b,
|
|
struct bkey_packed k)
|
|
@@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
|
|
|
|
return false;
|
|
}
|
|
-#endif
|
|
|
|
/*
|
|
* Returns a packed key that compares <= in
|
|
@@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
|
|
const struct bkey_format *f = &b->format;
|
|
struct pack_state state = pack_state_init(f, out);
|
|
u64 *w = out->_data;
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
struct bpos orig = in;
|
|
-#endif
|
|
bool exact = true;
|
|
unsigned i;
|
|
|
|
@@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
|
|
out->format = KEY_FORMAT_LOCAL_BTREE;
|
|
out->type = KEY_TYPE_deleted;
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
- if (exact) {
|
|
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
|
|
- } else {
|
|
- struct bkey_packed successor;
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
|
|
+ if (exact) {
|
|
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
|
|
+ } else {
|
|
+ struct bkey_packed successor;
|
|
|
|
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
|
|
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
|
|
- bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
|
|
- !bkey_format_has_too_big_fields(f));
|
|
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
|
|
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
|
|
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
|
|
+ !bkey_format_has_too_big_fields(f));
|
|
+ }
|
|
}
|
|
-#endif
|
|
|
|
return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
|
|
}
|
|
@@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
|
|
}
|
|
}
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
- {
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
|
|
printbuf_exit(&buf);
|
|
}
|
|
-#endif
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
|
|
index 054e2d5e8448..3ccd521c190a 100644
|
|
--- a/fs/bcachefs/bkey.h
|
|
+++ b/fs/bcachefs/bkey.h
|
|
@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
|
|
static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
|
|
{
|
|
return bpos_eq(l.k->p, r.k->p) &&
|
|
+ l.k->size == r.k->size &&
|
|
bkey_bytes(l.k) == bkey_bytes(r.k) &&
|
|
!memcmp(l.v, r.v, bkey_val_bytes(l.k));
|
|
}
|
|
@@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b,
|
|
compiled_unpack_fn unpack_fn = b->aux_data;
|
|
unpack_fn(dst, src);
|
|
|
|
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
|
|
- bch2_expensive_debug_checks) {
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
|
|
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
|
|
|
|
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
|
|
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
|
|
index 15c93576b5c2..fcd8c82cba4f 100644
|
|
--- a/fs/bcachefs/bkey_methods.c
|
|
+++ b/fs/bcachefs/bkey_methods.c
|
|
@@ -21,7 +21,7 @@
|
|
#include "xattr.h"
|
|
|
|
const char * const bch2_bkey_types[] = {
|
|
-#define x(name, nr) #name,
|
|
+#define x(name, nr, ...) #name,
|
|
BCH_BKEY_TYPES()
|
|
#undef x
|
|
NULL
|
|
@@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
|
|
})
|
|
|
|
const struct bkey_ops bch2_bkey_ops[] = {
|
|
-#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
|
|
+#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
|
|
BCH_BKEY_TYPES()
|
|
#undef x
|
|
};
|
|
@@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = {
|
|
#undef x
|
|
};
|
|
|
|
+static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
|
|
+#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
|
|
+ BCH_BKEY_TYPES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
const char *bch2_btree_node_type_str(enum btree_node_type type)
|
|
{
|
|
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
|
|
@@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
if (type >= BKEY_TYPE_NR)
|
|
return 0;
|
|
|
|
- bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
|
|
- (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
|
|
+ enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
|
|
+ ? bch2_bkey_type_flags[k.k->type]
|
|
+ : 0;
|
|
+
|
|
+ bool strict_key_type_allowed =
|
|
+ (from.flags & BCH_VALIDATE_commit) ||
|
|
+ type == BKEY_TYPE_btree ||
|
|
+ (from.btree < BTREE_ID_NR &&
|
|
+ (bkey_flags & BKEY_TYPE_strict_btree_checks));
|
|
+
|
|
+ bkey_fsck_err_on(strict_key_type_allowed &&
|
|
+ k.k->type < KEY_TYPE_MAX &&
|
|
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
|
|
c, bkey_invalid_type_for_btree,
|
|
"invalid key type for btree %s (%s)",
|
|
@@ -340,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
|
|
return ops->key_merge &&
|
|
bch2_bkey_maybe_mergable(l.k, r.k) &&
|
|
(u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
|
|
- !bch2_key_merging_disabled &&
|
|
+ !static_branch_unlikely(&bch2_key_merging_disabled) &&
|
|
ops->key_merge(c, l, r);
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
|
|
index 9a4a83d6fd2d..32841f762eb2 100644
|
|
--- a/fs/bcachefs/bset.c
|
|
+++ b/fs/bcachefs/bset.c
|
|
@@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
|
|
return nr;
|
|
}
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-
|
|
void __bch2_verify_btree_nr_keys(struct btree *b)
|
|
{
|
|
struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
|
|
@@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
|
|
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
|
|
}
|
|
|
|
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
|
|
+static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
|
|
struct btree *b)
|
|
{
|
|
struct btree_node_iter iter = *_iter;
|
|
@@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
|
|
}
|
|
}
|
|
|
|
-void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
|
- struct btree *b)
|
|
+void __bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
{
|
|
struct btree_node_iter_set *set, *s2;
|
|
struct bkey_packed *k, *p;
|
|
@@ -237,8 +235,8 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
|
}
|
|
}
|
|
|
|
-void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
|
|
- struct bkey_packed *insert, unsigned clobber_u64s)
|
|
+static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
|
|
+ struct bkey_packed *insert, unsigned clobber_u64s)
|
|
{
|
|
struct bset_tree *t = bch2_bkey_to_bset(b, where);
|
|
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
|
|
@@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
|
|
#endif
|
|
}
|
|
|
|
-#else
|
|
-
|
|
-static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
|
|
- struct btree *b) {}
|
|
+static inline void bch2_verify_insert_pos(struct btree *b,
|
|
+ struct bkey_packed *where,
|
|
+ struct bkey_packed *insert,
|
|
+ unsigned clobber_u64s)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
|
|
+ __bch2_verify_insert_pos(b, where, insert, clobber_u64s);
|
|
+}
|
|
|
|
-#endif
|
|
|
|
/* Auxiliary search trees */
|
|
|
|
@@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b,
|
|
return ro_aux_tree_base(b, t)->f + idx;
|
|
}
|
|
|
|
-static void bset_aux_tree_verify(struct btree *b)
|
|
+static void __bset_aux_tree_verify(struct btree *b)
|
|
{
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
for_each_bset(b, t) {
|
|
if (t->aux_data_offset == U16_MAX)
|
|
continue;
|
|
@@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b)
|
|
BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
|
|
BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
|
|
}
|
|
-#endif
|
|
+}
|
|
+
|
|
+static inline void bset_aux_tree_verify(struct btree *b)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
|
|
+ __bset_aux_tree_verify(b);
|
|
}
|
|
|
|
void bch2_btree_keys_init(struct btree *b)
|
|
@@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
|
|
};
|
|
}
|
|
|
|
-static void bch2_bset_verify_rw_aux_tree(struct btree *b,
|
|
- struct bset_tree *t)
|
|
+static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t)
|
|
{
|
|
struct bkey_packed *k = btree_bkey_first(b, t);
|
|
unsigned j = 0;
|
|
|
|
- if (!bch2_expensive_debug_checks)
|
|
- return;
|
|
-
|
|
BUG_ON(bset_has_ro_aux_tree(t));
|
|
|
|
if (!bset_has_rw_aux_tree(t))
|
|
@@ -530,6 +531,13 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
|
|
}
|
|
}
|
|
|
|
+static inline void bch2_bset_verify_rw_aux_tree(struct btree *b,
|
|
+ struct bset_tree *t)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
|
|
+ __bch2_bset_verify_rw_aux_tree(b, t);
|
|
+}
|
|
+
|
|
/* returns idx of first entry >= offset: */
|
|
static unsigned rw_aux_tree_bsearch(struct btree *b,
|
|
struct bset_tree *t,
|
|
@@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
|
|
k = p;
|
|
}
|
|
|
|
- if (bch2_expensive_debug_checks) {
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
|
|
BUG_ON(ret >= orig_k);
|
|
|
|
for (i = ret
|
|
@@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
|
|
bkey_iter_pos_cmp(b, m, search) < 0)
|
|
m = bkey_p_next(m);
|
|
|
|
- if (bch2_expensive_debug_checks) {
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
|
|
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
|
|
|
|
BUG_ON(prev &&
|
|
@@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
|
|
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
|
|
struct btree *b)
|
|
{
|
|
- if (bch2_expensive_debug_checks) {
|
|
- bch2_btree_node_iter_verify(iter, b);
|
|
- bch2_btree_node_iter_next_check(iter, b);
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
|
|
+ __bch2_btree_node_iter_verify(iter, b);
|
|
+ __bch2_btree_node_iter_next_check(iter, b);
|
|
}
|
|
|
|
__bch2_btree_node_iter_advance(iter, b);
|
|
@@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
|
|
struct btree_node_iter_set *set;
|
|
unsigned end = 0;
|
|
|
|
- if (bch2_expensive_debug_checks)
|
|
- bch2_btree_node_iter_verify(iter, b);
|
|
+ bch2_btree_node_iter_verify(iter, b);
|
|
|
|
for_each_bset(b, t) {
|
|
k = bch2_bkey_prev_all(b, t,
|
|
@@ -1489,8 +1496,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
|
|
iter->data[0].k = __btree_node_key_to_offset(b, prev);
|
|
iter->data[0].end = end;
|
|
|
|
- if (bch2_expensive_debug_checks)
|
|
- bch2_btree_node_iter_verify(iter, b);
|
|
+ bch2_btree_node_iter_verify(iter, b);
|
|
return prev;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
|
|
index 6953d55b72cc..a15ecf9d006e 100644
|
|
--- a/fs/bcachefs/bset.h
|
|
+++ b/fs/bcachefs/bset.h
|
|
@@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
|
|
void bch2_dump_btree_node(struct bch_fs *, struct btree *);
|
|
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-
|
|
void __bch2_verify_btree_nr_keys(struct btree *);
|
|
-void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
|
|
-void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
|
|
- struct bkey_packed *, unsigned);
|
|
-
|
|
-#else
|
|
+void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
|
|
|
|
-static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
|
|
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
|
- struct btree *b) {}
|
|
-static inline void bch2_verify_insert_pos(struct btree *b,
|
|
- struct bkey_packed *where,
|
|
- struct bkey_packed *insert,
|
|
- unsigned clobber_u64s) {}
|
|
-#endif
|
|
+ struct btree *b)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
|
|
+ __bch2_btree_node_iter_verify(iter, b);
|
|
+}
|
|
|
|
static inline void bch2_verify_btree_nr_keys(struct btree *b)
|
|
{
|
|
- if (bch2_debug_check_btree_accounting)
|
|
+ if (static_branch_unlikely(&bch2_debug_check_btree_accounting))
|
|
__bch2_verify_btree_nr_keys(b);
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
|
|
index 1ec1f90e0eb3..a5d983309311 100644
|
|
--- a/fs/bcachefs/btree_cache.c
|
|
+++ b/fs/bcachefs/btree_cache.c
|
|
@@ -15,14 +15,9 @@
|
|
|
|
#include <linux/prefetch.h>
|
|
#include <linux/sched/mm.h>
|
|
+#include <linux/seq_buf.h>
|
|
#include <linux/swap.h>
|
|
|
|
-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
|
|
-do { \
|
|
- if (shrinker_counter) \
|
|
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
|
|
-} while (0)
|
|
-
|
|
const char * const bch2_btree_node_flags[] = {
|
|
"typebit",
|
|
"typebit",
|
|
@@ -350,115 +345,118 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
|
|
return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
|
|
}
|
|
|
|
-/*
|
|
- * this version is for btree nodes that have already been freed (we're not
|
|
- * reaping a real btree node)
|
|
- */
|
|
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
|
|
+static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
|
|
+ bool flush, bool locked)
|
|
{
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
- int ret = 0;
|
|
|
|
lockdep_assert_held(&bc->lock);
|
|
-wait_on_io:
|
|
- if (b->flags & ((1U << BTREE_NODE_dirty)|
|
|
- (1U << BTREE_NODE_read_in_flight)|
|
|
+
|
|
+ if (btree_node_noevict(b)) {
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
|
|
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
+ }
|
|
+ if (btree_node_write_blocked(b)) {
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
|
|
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
+ }
|
|
+ if (btree_node_will_make_reachable(b)) {
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
|
|
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
+ }
|
|
+
|
|
+ if (btree_node_dirty(b)) {
|
|
+ if (!flush) {
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
|
|
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
+ }
|
|
+
|
|
+ if (locked) {
|
|
+ /*
|
|
+ * Using the underscore version because we don't want to compact
|
|
+ * bsets after the write, since this node is about to be evicted
|
|
+ * - unless btree verify mode is enabled, since it runs out of
|
|
+ * the post write cleanup:
|
|
+ */
|
|
+ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
|
|
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
|
|
+ BTREE_WRITE_cache_reclaim);
|
|
+ else
|
|
+ __bch2_btree_node_write(c, b,
|
|
+ BTREE_WRITE_cache_reclaim);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
|
|
(1U << BTREE_NODE_write_in_flight))) {
|
|
if (!flush) {
|
|
- if (btree_node_dirty(b))
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
|
|
- else if (btree_node_read_in_flight(b))
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
|
|
+ if (btree_node_read_in_flight(b))
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
|
|
else if (btree_node_write_in_flight(b))
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
|
|
return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
}
|
|
|
|
+ if (locked)
|
|
+ return -EINTR;
|
|
+
|
|
/* XXX: waiting on IO with btree cache lock held */
|
|
bch2_btree_node_wait_on_read(b);
|
|
bch2_btree_node_wait_on_write(b);
|
|
}
|
|
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * this version is for btree nodes that have already been freed (we're not
|
|
+ * reaping a real btree node)
|
|
+ */
|
|
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ int ret = 0;
|
|
+
|
|
+ lockdep_assert_held(&bc->lock);
|
|
+retry_unlocked:
|
|
+ ret = __btree_node_reclaim_checks(c, b, flush, false);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
if (!six_trylock_intent(&b->c.lock)) {
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
|
|
return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
}
|
|
|
|
if (!six_trylock_write(&b->c.lock)) {
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
|
|
- goto out_unlock_intent;
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
}
|
|
|
|
/* recheck under lock */
|
|
- if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
|
|
- (1U << BTREE_NODE_write_in_flight))) {
|
|
- if (!flush) {
|
|
- if (btree_node_read_in_flight(b))
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
|
|
- else if (btree_node_write_in_flight(b))
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
|
|
- goto out_unlock;
|
|
- }
|
|
+ ret = __btree_node_reclaim_checks(c, b, flush, true);
|
|
+ if (ret) {
|
|
six_unlock_write(&b->c.lock);
|
|
six_unlock_intent(&b->c.lock);
|
|
- goto wait_on_io;
|
|
- }
|
|
-
|
|
- if (btree_node_noevict(b)) {
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
|
|
- goto out_unlock;
|
|
- }
|
|
- if (btree_node_write_blocked(b)) {
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
|
|
- goto out_unlock;
|
|
- }
|
|
- if (btree_node_will_make_reachable(b)) {
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
|
|
- goto out_unlock;
|
|
+ if (ret == -EINTR)
|
|
+ goto retry_unlocked;
|
|
+ return ret;
|
|
}
|
|
|
|
- if (btree_node_dirty(b)) {
|
|
- if (!flush) {
|
|
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
|
|
- goto out_unlock;
|
|
- }
|
|
- /*
|
|
- * Using the underscore version because we don't want to compact
|
|
- * bsets after the write, since this node is about to be evicted
|
|
- * - unless btree verify mode is enabled, since it runs out of
|
|
- * the post write cleanup:
|
|
- */
|
|
- if (bch2_verify_btree_ondisk)
|
|
- bch2_btree_node_write(c, b, SIX_LOCK_intent,
|
|
- BTREE_WRITE_cache_reclaim);
|
|
- else
|
|
- __bch2_btree_node_write(c, b,
|
|
- BTREE_WRITE_cache_reclaim);
|
|
-
|
|
- six_unlock_write(&b->c.lock);
|
|
- six_unlock_intent(&b->c.lock);
|
|
- goto wait_on_io;
|
|
- }
|
|
-out:
|
|
if (b->hash_val && !ret)
|
|
trace_and_count(c, btree_cache_reap, c, b);
|
|
- return ret;
|
|
-out_unlock:
|
|
- six_unlock_write(&b->c.lock);
|
|
-out_unlock_intent:
|
|
- six_unlock_intent(&b->c.lock);
|
|
- ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
|
|
- goto out;
|
|
+ return 0;
|
|
}
|
|
|
|
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
|
|
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
|
|
{
|
|
- return __btree_node_reclaim(c, b, false, shrinker_counter);
|
|
+ return __btree_node_reclaim(c, b, false);
|
|
}
|
|
|
|
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
|
|
{
|
|
- return __btree_node_reclaim(c, b, true, false);
|
|
+ return __btree_node_reclaim(c, b, true);
|
|
}
|
|
|
|
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
@@ -476,7 +474,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
unsigned long ret = SHRINK_STOP;
|
|
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
|
|
|
|
- if (bch2_btree_shrinker_disabled)
|
|
+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
|
|
return SHRINK_STOP;
|
|
|
|
mutex_lock(&bc->lock);
|
|
@@ -490,7 +488,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
* IO can always make forward progress:
|
|
*/
|
|
can_free = btree_cache_can_free(list);
|
|
- nr = min_t(unsigned long, nr, can_free);
|
|
+ if (nr > can_free) {
|
|
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
|
|
+ nr = can_free;
|
|
+ }
|
|
|
|
i = 0;
|
|
list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
|
@@ -506,7 +507,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
if (touched >= nr)
|
|
goto out;
|
|
|
|
- if (!btree_node_reclaim(c, b, true)) {
|
|
+ if (!btree_node_reclaim(c, b)) {
|
|
btree_node_data_free(bc, b);
|
|
six_unlock_write(&b->c.lock);
|
|
six_unlock_intent(&b->c.lock);
|
|
@@ -522,7 +523,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
clear_btree_node_accessed(b);
|
|
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
|
|
--touched;;
|
|
- } else if (!btree_node_reclaim(c, b, true)) {
|
|
+ } else if (!btree_node_reclaim(c, b)) {
|
|
__bch2_btree_node_hash_remove(bc, b);
|
|
__btree_node_data_free(bc, b);
|
|
|
|
@@ -569,12 +570,25 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
|
|
{
|
|
struct btree_cache_list *list = shrink->private_data;
|
|
|
|
- if (bch2_btree_shrinker_disabled)
|
|
+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
|
|
return 0;
|
|
|
|
return btree_cache_can_free(list);
|
|
}
|
|
|
|
+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
|
|
+{
|
|
+ struct btree_cache_list *list = shrink->private_data;
|
|
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
|
|
+
|
|
+ char *cbuf;
|
|
+ size_t buflen = seq_buf_get_buf(s, &cbuf);
|
|
+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
|
|
+
|
|
+ bch2_btree_cache_to_text(&out, bc);
|
|
+ seq_buf_commit(s, out.pos);
|
|
+}
|
|
+
|
|
void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
|
{
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
@@ -610,6 +624,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
|
btree_node_write_in_flight(b));
|
|
|
|
btree_node_data_free(bc, b);
|
|
+ cond_resched();
|
|
}
|
|
|
|
BUG_ON(!bch2_journal_error(&c->journal) &&
|
|
@@ -665,6 +680,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
|
|
bc->live[0].shrink = shrink;
|
|
shrink->count_objects = bch2_btree_cache_count;
|
|
shrink->scan_objects = bch2_btree_cache_scan;
|
|
+ shrink->to_text = bch2_btree_cache_shrinker_to_text;
|
|
shrink->seeks = 2;
|
|
shrink->private_data = &bc->live[0];
|
|
shrinker_register(shrink);
|
|
@@ -675,6 +691,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
|
|
bc->live[1].shrink = shrink;
|
|
shrink->count_objects = bch2_btree_cache_count;
|
|
shrink->scan_objects = bch2_btree_cache_scan;
|
|
+ shrink->to_text = bch2_btree_cache_shrinker_to_text;
|
|
shrink->seeks = 8;
|
|
shrink->private_data = &bc->live[1];
|
|
shrinker_register(shrink);
|
|
@@ -754,7 +771,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
|
|
list_for_each_entry_reverse(b, &bc->live[i].list, list)
|
|
- if (!btree_node_reclaim(c, b, false))
|
|
+ if (!btree_node_reclaim(c, b))
|
|
return b;
|
|
|
|
while (1) {
|
|
@@ -789,7 +806,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
|
|
* disk node. Check the freed list before allocating a new one:
|
|
*/
|
|
list_for_each_entry(b, freed, list)
|
|
- if (!btree_node_reclaim(c, b, false)) {
|
|
+ if (!btree_node_reclaim(c, b)) {
|
|
list_del_init(&b->list);
|
|
goto got_node;
|
|
}
|
|
@@ -816,7 +833,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
|
|
* the list. Check if there's any freed nodes there:
|
|
*/
|
|
list_for_each_entry(b2, &bc->freeable, list)
|
|
- if (!btree_node_reclaim(c, b2, false)) {
|
|
+ if (!btree_node_reclaim(c, b2)) {
|
|
swap(b->data, b2->data);
|
|
swap(b->aux_data, b2->aux_data);
|
|
|
|
@@ -851,7 +868,6 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
|
|
b->sib_u64s[1] = 0;
|
|
b->whiteout_u64s = 0;
|
|
bch2_btree_keys_init(b);
|
|
- set_btree_node_accessed(b);
|
|
|
|
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
|
start_time);
|
|
@@ -977,7 +993,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
|
|
|
|
/* Unlock before doing IO: */
|
|
six_unlock_intent(&b->c.lock);
|
|
- bch2_trans_unlock_noassert(trans);
|
|
+ bch2_trans_unlock(trans);
|
|
|
|
bch2_btree_node_read(trans, b, sync);
|
|
|
|
@@ -1003,7 +1019,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
|
|
{
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
|
|
+ if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
|
|
return;
|
|
|
|
prt_printf(&buf,
|
|
@@ -1285,6 +1301,10 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
|
|
six_unlock_read(&b->c.lock);
|
|
goto retry;
|
|
}
|
|
+
|
|
+ /* avoid atomic set bit if it's not needed: */
|
|
+ if (!btree_node_accessed(b))
|
|
+ set_btree_node_accessed(b);
|
|
}
|
|
|
|
/* XXX: waiting on IO with btree locks held: */
|
|
@@ -1300,10 +1320,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
|
|
prefetch(p + L1_CACHE_BYTES * 2);
|
|
}
|
|
|
|
- /* avoid atomic set bit if it's not needed: */
|
|
- if (!btree_node_accessed(b))
|
|
- set_btree_node_accessed(b);
|
|
-
|
|
if (unlikely(btree_node_read_error(b))) {
|
|
six_unlock_read(&b->c.lock);
|
|
b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
|
|
@@ -1416,7 +1432,7 @@ void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
|
|
prt_printf(out, "%u", r->level);
|
|
else
|
|
prt_printf(out, "(unknown)");
|
|
- prt_printf(out, "\n ");
|
|
+ prt_newline(out);
|
|
|
|
bch2_bkey_val_to_text(out, c, k);
|
|
}
|
|
@@ -1492,9 +1508,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
|
|
|
|
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
|
|
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
|
|
- prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
|
|
+ prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve);
|
|
+ prt_btree_cache_line(out, c, "freed:", bc->nr_freeable);
|
|
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
|
|
- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
|
|
+ prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held");
|
|
prt_newline(out);
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
|
|
@@ -1505,6 +1522,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
|
|
}
|
|
|
|
prt_newline(out);
|
|
+ prt_printf(out, "counters since mount:\n");
|
|
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
|
|
prt_printf(out, "not freed:\n");
|
|
|
|
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
|
|
index dd1d9b74076e..91b6395421df 100644
|
|
--- a/fs/bcachefs/btree_gc.c
|
|
+++ b/fs/bcachefs/btree_gc.c
|
|
@@ -22,11 +22,13 @@
|
|
#include "debug.h"
|
|
#include "disk_accounting.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
#include "journal.h"
|
|
#include "keylist.h"
|
|
#include "move.h"
|
|
+#include "progress.h"
|
|
#include "recovery_passes.h"
|
|
#include "reflink.h"
|
|
#include "recovery.h"
|
|
@@ -46,6 +48,27 @@
|
|
#define DROP_PREV_NODE 11
|
|
#define DID_FILL_FROM_SCAN 12
|
|
|
|
+/*
|
|
+ * Returns true if it's a btree we can easily reconstruct, or otherwise won't
|
|
+ * cause data loss if it's missing:
|
|
+ */
|
|
+static bool btree_id_important(enum btree_id btree)
|
|
+{
|
|
+ if (btree_id_is_alloc(btree))
|
|
+ return false;
|
|
+
|
|
+ switch (btree) {
|
|
+ case BTREE_ID_quotas:
|
|
+ case BTREE_ID_snapshot_trees:
|
|
+ case BTREE_ID_logged_ops:
|
|
+ case BTREE_ID_rebalance_work:
|
|
+ case BTREE_ID_subvolume_children:
|
|
+ return false;
|
|
+ default:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
static const char * const bch2_gc_phase_strs[] = {
|
|
#define x(n) #n,
|
|
GC_PHASES()
|
|
@@ -212,15 +235,15 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
|
|
|
|
prt_printf(&buf, " at ");
|
|
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
|
- prt_printf(&buf, ":\n parent: ");
|
|
+ prt_printf(&buf, ":\nparent: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
|
|
if (prev) {
|
|
- prt_printf(&buf, "\n prev: ");
|
|
+ prt_printf(&buf, "\nprev: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
|
|
}
|
|
|
|
- prt_str(&buf, "\n next: ");
|
|
+ prt_str(&buf, "\nnext: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
|
|
|
|
if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
|
|
@@ -279,12 +302,12 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
|
|
if (bpos_eq(child->key.k.p, b->key.k.p))
|
|
return 0;
|
|
|
|
- prt_printf(&buf, " at ");
|
|
+ prt_printf(&buf, "\nat: ");
|
|
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
|
- prt_printf(&buf, ":\n parent: ");
|
|
+ prt_printf(&buf, "\nparent: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
|
|
- prt_str(&buf, "\n child: ");
|
|
+ prt_str(&buf, "\nchild: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
|
|
|
|
if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
|
|
@@ -348,21 +371,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
|
|
prt_char(&buf, ' ');
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
|
|
|
|
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
|
|
- trans, btree_node_read_error,
|
|
- "Topology repair: unreadable btree node at\n"
|
|
- " %s",
|
|
- buf.buf)) {
|
|
+ if (bch2_err_matches(ret, EIO)) {
|
|
bch2_btree_node_evict(trans, cur_k.k);
|
|
cur = NULL;
|
|
ret = bch2_journal_key_delete(c, b->c.btree_id,
|
|
b->c.level, cur_k.k->k.p);
|
|
if (ret)
|
|
break;
|
|
-
|
|
- ret = bch2_btree_lost_data(c, b->c.btree_id);
|
|
- if (ret)
|
|
- break;
|
|
continue;
|
|
}
|
|
|
|
@@ -524,9 +539,6 @@ int bch2_check_topology(struct bch_fs *c)
|
|
bch2_btree_id_to_text(&buf, i);
|
|
|
|
if (r->error) {
|
|
- ret = bch2_btree_lost_data(c, i);
|
|
- if (ret)
|
|
- break;
|
|
reconstruct_root:
|
|
bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
|
|
|
|
@@ -534,8 +546,10 @@ int bch2_check_topology(struct bch_fs *c)
|
|
r->error = 0;
|
|
|
|
if (!bch2_btree_has_scanned_nodes(c, i)) {
|
|
- mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
|
|
- "no nodes found for btree %s, continue?", buf.buf);
|
|
+ __fsck_err(trans,
|
|
+ FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0),
|
|
+ btree_root_unreadable_and_scan_found_nothing,
|
|
+ "no nodes found for btree %s, continue?", buf.buf);
|
|
bch2_btree_root_alloc_fake_trans(trans, i, 0);
|
|
} else {
|
|
bch2_btree_root_alloc_fake_trans(trans, i, 1);
|
|
@@ -605,13 +619,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
|
|
deleted.p = k.k->p;
|
|
|
|
if (initial) {
|
|
- BUG_ON(bch2_journal_seq_verify &&
|
|
+ BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
|
|
k.k->bversion.lo > atomic64_read(&c->journal.seq));
|
|
|
|
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
|
|
k.k->bversion.lo > atomic64_read(&c->key_version),
|
|
trans, bkey_version_in_future,
|
|
- "key version number higher than recorded %llu\n %s",
|
|
+ "key version number higher than recorded %llu\n%s",
|
|
atomic64_read(&c->key_version),
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
|
|
atomic64_set(&c->key_version, k.k->bversion.lo);
|
|
@@ -619,7 +633,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
|
|
|
|
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
|
|
trans, btree_bitmap_not_marked,
|
|
- "btree ptr not marked in member info btree allocated bitmap\n %s",
|
|
+ "btree ptr not marked in member info btree allocated bitmap\n%s",
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, k),
|
|
buf.buf))) {
|
|
@@ -656,7 +670,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
|
|
return ret;
|
|
}
|
|
|
|
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
|
|
+static int bch2_gc_btree(struct btree_trans *trans,
|
|
+ struct progress_indicator_state *progress,
|
|
+ enum btree_id btree, bool initial)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
|
|
@@ -673,6 +689,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
|
|
BTREE_ITER_prefetch);
|
|
|
|
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
|
|
+ bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
|
|
gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
|
|
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
|
|
}));
|
|
@@ -688,7 +705,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
|
|
struct btree_iter iter;
|
|
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
|
|
0, bch2_btree_id_root(c, btree)->b->c.level, 0);
|
|
- struct btree *b = bch2_btree_iter_peek_node(&iter);
|
|
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
|
|
ret = PTR_ERR_OR_ZERO(b);
|
|
if (ret)
|
|
goto err_root;
|
|
@@ -717,22 +734,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
|
|
static int bch2_gc_btrees(struct bch_fs *c)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
- enum btree_id ids[BTREE_ID_NR];
|
|
struct printbuf buf = PRINTBUF;
|
|
- unsigned i;
|
|
int ret = 0;
|
|
|
|
- for (i = 0; i < BTREE_ID_NR; i++)
|
|
+ struct progress_indicator_state progress;
|
|
+ bch2_progress_init(&progress, c, ~0ULL);
|
|
+
|
|
+ enum btree_id ids[BTREE_ID_NR];
|
|
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
|
|
ids[i] = i;
|
|
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
|
|
|
|
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
|
|
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
|
|
unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
|
|
|
|
if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
|
|
continue;
|
|
|
|
- ret = bch2_gc_btree(trans, btree, true);
|
|
+ ret = bch2_gc_btree(trans, &progress, btree, true);
|
|
}
|
|
|
|
printbuf_exit(&buf);
|
|
@@ -1015,8 +1034,7 @@ int bch2_check_allocations(struct bch_fs *c)
|
|
{
|
|
int ret;
|
|
|
|
- lockdep_assert_held(&c->state_lock);
|
|
-
|
|
+ down_read(&c->state_lock);
|
|
down_write(&c->gc_lock);
|
|
|
|
bch2_btree_interior_updates_flush(c);
|
|
@@ -1054,12 +1072,17 @@ int bch2_check_allocations(struct bch_fs *c)
|
|
percpu_up_write(&c->mark_lock);
|
|
|
|
up_write(&c->gc_lock);
|
|
+ up_read(&c->state_lock);
|
|
|
|
/*
|
|
* At startup, allocations can happen directly instead of via the
|
|
* allocator thread - issue wakeup in case they blocked on gc_lock:
|
|
*/
|
|
closure_wake_up(&c->freelist_wait);
|
|
+
|
|
+ if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
|
|
+ bch2_sb_members_clean_deleted(c);
|
|
+
|
|
bch_err_fn(c, ret);
|
|
return ret;
|
|
}
|
|
@@ -1194,7 +1217,7 @@ int bch2_gc_gens(struct bch_fs *c)
|
|
BCH_TRANS_COMMIT_no_enospc, ({
|
|
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
|
|
if (!ca) {
|
|
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
|
|
continue;
|
|
}
|
|
bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
|
|
@@ -1228,26 +1251,21 @@ static void bch2_gc_gens_work(struct work_struct *work)
|
|
{
|
|
struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
|
|
bch2_gc_gens(c);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
|
|
}
|
|
|
|
void bch2_gc_gens_async(struct bch_fs *c)
|
|
{
|
|
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
|
|
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
|
|
!queue_work(c->write_ref_wq, &c->gc_gens_work))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
|
|
-}
|
|
-
|
|
-void bch2_fs_btree_gc_exit(struct bch_fs *c)
|
|
-{
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
|
|
}
|
|
|
|
-int bch2_fs_btree_gc_init(struct bch_fs *c)
|
|
+void bch2_fs_btree_gc_init_early(struct bch_fs *c)
|
|
{
|
|
seqcount_init(&c->gc_pos_lock);
|
|
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
|
|
|
|
init_rwsem(&c->gc_lock);
|
|
mutex_init(&c->gc_gens_lock);
|
|
- return 0;
|
|
}
|
|
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
|
|
index 9693a90a48a2..ec77662369a2 100644
|
|
--- a/fs/bcachefs/btree_gc.h
|
|
+++ b/fs/bcachefs/btree_gc.h
|
|
@@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
|
|
int bch2_gc_gens(struct bch_fs *);
|
|
void bch2_gc_gens_async(struct bch_fs *);
|
|
|
|
-void bch2_fs_btree_gc_exit(struct bch_fs *);
|
|
-int bch2_fs_btree_gc_init(struct bch_fs *);
|
|
+void bch2_fs_btree_gc_init_early(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_BTREE_GC_H */
|
|
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
|
|
index 756736f9243d..34018296053a 100644
|
|
--- a/fs/bcachefs/btree_io.c
|
|
+++ b/fs/bcachefs/btree_io.c
|
|
@@ -1,6 +1,8 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include "bcachefs.h"
|
|
+#include "async_objs.h"
|
|
+#include "bkey_buf.h"
|
|
#include "bkey_methods.h"
|
|
#include "bkey_sort.h"
|
|
#include "btree_cache.h"
|
|
@@ -12,6 +14,7 @@
|
|
#include "buckets.h"
|
|
#include "checksum.h"
|
|
#include "debug.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
#include "io_write.h"
|
|
@@ -40,6 +43,7 @@ void bch2_btree_node_io_unlock(struct btree *b)
|
|
|
|
clear_btree_node_write_in_flight_inner(b);
|
|
clear_btree_node_write_in_flight(b);
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
|
|
}
|
|
|
|
@@ -512,21 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
|
|
|
|
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
|
|
struct bch_dev *ca,
|
|
+ bool print_pos,
|
|
struct btree *b, struct bset *i, struct bkey_packed *k,
|
|
- unsigned offset, int write)
|
|
+ unsigned offset, int rw)
|
|
{
|
|
- prt_printf(out, bch2_log_msg(c, "%s"),
|
|
- write == READ
|
|
- ? "error validating btree node "
|
|
- : "corrupt btree node before write ");
|
|
- if (ca)
|
|
- prt_printf(out, "on %s ", ca->name);
|
|
- prt_printf(out, "at btree ");
|
|
- bch2_btree_pos_to_text(out, c, b);
|
|
+ if (print_pos) {
|
|
+ prt_str(out, rw == READ
|
|
+ ? "error validating btree node "
|
|
+ : "corrupt btree node before write ");
|
|
+ prt_printf(out, "at btree ");
|
|
+ bch2_btree_pos_to_text(out, c, b);
|
|
+ prt_newline(out);
|
|
+ }
|
|
|
|
- printbuf_indent_add(out, 2);
|
|
+ if (ca)
|
|
+ prt_printf(out, "%s ", ca->name);
|
|
|
|
- prt_printf(out, "\nnode offset %u/%u",
|
|
+ prt_printf(out, "node offset %u/%u",
|
|
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
|
|
if (i)
|
|
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
|
|
@@ -537,34 +543,32 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
|
|
prt_str(out, ": ");
|
|
}
|
|
|
|
-__printf(10, 11)
|
|
+__printf(11, 12)
|
|
static int __btree_err(int ret,
|
|
struct bch_fs *c,
|
|
struct bch_dev *ca,
|
|
struct btree *b,
|
|
struct bset *i,
|
|
struct bkey_packed *k,
|
|
- int write,
|
|
- bool have_retry,
|
|
+ int rw,
|
|
enum bch_sb_error_id err_type,
|
|
+ struct bch_io_failures *failed,
|
|
+ struct printbuf *err_msg,
|
|
const char *fmt, ...)
|
|
{
|
|
- struct printbuf out = PRINTBUF;
|
|
- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
|
|
- va_list args;
|
|
+ if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
|
|
+ return -BCH_ERR_fsck_fix;
|
|
|
|
- btree_err_msg(&out, c, ca, b, i, k, b->written, write);
|
|
+ bool have_retry = false;
|
|
+ int ret2;
|
|
|
|
- va_start(args, fmt);
|
|
- prt_vprintf(&out, fmt, args);
|
|
- va_end(args);
|
|
+ if (ca) {
|
|
+ bch2_mark_btree_validate_failure(failed, ca->dev_idx);
|
|
|
|
- if (write == WRITE) {
|
|
- bch2_print_string_as_lines(KERN_ERR, out.buf);
|
|
- ret = c->opts.errors == BCH_ON_ERROR_continue
|
|
- ? 0
|
|
- : -BCH_ERR_fsck_errors_not_fixed;
|
|
- goto out;
|
|
+ struct extent_ptr_decoded pick;
|
|
+ have_retry = !bch2_bkey_pick_read_device(c,
|
|
+ bkey_i_to_s_c(&b->key),
|
|
+ failed, &pick, -1);
|
|
}
|
|
|
|
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
|
|
@@ -572,37 +576,77 @@ static int __btree_err(int ret,
|
|
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
|
|
ret = -BCH_ERR_btree_node_read_err_bad_node;
|
|
|
|
- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
|
|
- bch2_sb_error_count(c, err_type);
|
|
+ bch2_sb_error_count(c, err_type);
|
|
+
|
|
+ bool print_deferred = err_msg &&
|
|
+ rw == READ &&
|
|
+ !(test_bit(BCH_FS_in_fsck, &c->flags) &&
|
|
+ c->opts.fix_errors == FSCK_FIX_ask);
|
|
+
|
|
+ struct printbuf out = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &out);
|
|
+
|
|
+ if (!print_deferred)
|
|
+ err_msg = &out;
|
|
+
|
|
+ btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
|
|
+
|
|
+ va_list args;
|
|
+ va_start(args, fmt);
|
|
+ prt_vprintf(err_msg, fmt, args);
|
|
+ va_end(args);
|
|
+
|
|
+ if (print_deferred) {
|
|
+ prt_newline(err_msg);
|
|
+
|
|
+ switch (ret) {
|
|
+ case -BCH_ERR_btree_node_read_err_fixable:
|
|
+ ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
|
|
+ if (ret2 != -BCH_ERR_fsck_fix &&
|
|
+ ret2 != -BCH_ERR_fsck_ignore) {
|
|
+ ret = ret2;
|
|
+ goto fsck_err;
|
|
+ }
|
|
+
|
|
+ if (!have_retry)
|
|
+ ret = -BCH_ERR_fsck_fix;
|
|
+ goto out;
|
|
+ case -BCH_ERR_btree_node_read_err_bad_node:
|
|
+ prt_str(&out, ", ");
|
|
+ ret = __bch2_topology_error(c, &out);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (rw == WRITE) {
|
|
+ prt_str(&out, ", ");
|
|
+ ret = __bch2_inconsistent_error(c, &out)
|
|
+ ? -BCH_ERR_fsck_errors_not_fixed
|
|
+ : 0;
|
|
+ goto print;
|
|
+ }
|
|
|
|
switch (ret) {
|
|
case -BCH_ERR_btree_node_read_err_fixable:
|
|
- ret = !silent
|
|
- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
|
|
- : -BCH_ERR_fsck_fix;
|
|
- if (ret != -BCH_ERR_fsck_fix &&
|
|
- ret != -BCH_ERR_fsck_ignore)
|
|
+ ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
|
|
+ if (ret2 != -BCH_ERR_fsck_fix &&
|
|
+ ret2 != -BCH_ERR_fsck_ignore) {
|
|
+ ret = ret2;
|
|
goto fsck_err;
|
|
- ret = -BCH_ERR_fsck_fix;
|
|
- break;
|
|
- case -BCH_ERR_btree_node_read_err_want_retry:
|
|
- case -BCH_ERR_btree_node_read_err_must_retry:
|
|
- if (!silent)
|
|
- bch2_print_string_as_lines(KERN_ERR, out.buf);
|
|
- break;
|
|
+ }
|
|
+
|
|
+ if (!have_retry)
|
|
+ ret = -BCH_ERR_fsck_fix;
|
|
+ goto out;
|
|
case -BCH_ERR_btree_node_read_err_bad_node:
|
|
- if (!silent)
|
|
- bch2_print_string_as_lines(KERN_ERR, out.buf);
|
|
- ret = bch2_topology_error(c);
|
|
- break;
|
|
- case -BCH_ERR_btree_node_read_err_incompatible:
|
|
- if (!silent)
|
|
- bch2_print_string_as_lines(KERN_ERR, out.buf);
|
|
- ret = -BCH_ERR_fsck_errors_not_fixed;
|
|
+ prt_str(&out, ", ");
|
|
+ ret = __bch2_topology_error(c, &out);
|
|
break;
|
|
- default:
|
|
- BUG();
|
|
}
|
|
+print:
|
|
+ bch2_print_str(c, KERN_ERR, out.buf);
|
|
out:
|
|
fsck_err:
|
|
printbuf_exit(&out);
|
|
@@ -611,8 +655,9 @@ static int __btree_err(int ret,
|
|
|
|
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
|
|
({ \
|
|
- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
|
|
+ int _ret = __btree_err(type, c, ca, b, i, k, write, \
|
|
BCH_FSCK_ERR_##_err_type, \
|
|
+ failed, err_msg, \
|
|
msg, ##__VA_ARGS__); \
|
|
\
|
|
if (_ret != -BCH_ERR_fsck_fix) { \
|
|
@@ -620,7 +665,7 @@ static int __btree_err(int ret,
|
|
goto fsck_err; \
|
|
} \
|
|
\
|
|
- *saw_error = true; \
|
|
+ true; \
|
|
})
|
|
|
|
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
|
|
@@ -678,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
|
|
|
|
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
|
struct btree *b, struct bset *i,
|
|
- unsigned offset, unsigned sectors,
|
|
- int write, bool have_retry, bool *saw_error)
|
|
+ unsigned offset, unsigned sectors, int write,
|
|
+ struct bch_io_failures *failed,
|
|
+ struct printbuf *err_msg)
|
|
{
|
|
unsigned version = le16_to_cpu(i->version);
|
|
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
|
|
@@ -816,7 +862,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
|
-BCH_ERR_btree_node_read_err_bad_node,
|
|
c, ca, b, i, NULL,
|
|
btree_node_bad_format,
|
|
- "invalid bkey format: %s\n %s", buf1.buf,
|
|
+ "invalid bkey format: %s\n%s", buf1.buf,
|
|
(printbuf_reset(&buf2),
|
|
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
|
|
printbuf_reset(&buf1);
|
|
@@ -892,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b,
|
|
|
|
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
|
|
struct bset *i, int write,
|
|
- bool have_retry, bool *saw_error)
|
|
+ struct bch_io_failures *failed,
|
|
+ struct printbuf *err_msg)
|
|
{
|
|
unsigned version = le16_to_cpu(i->version);
|
|
struct bkey_packed *k, *prev = NULL;
|
|
@@ -1005,7 +1052,9 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
|
|
}
|
|
|
|
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
- struct btree *b, bool have_retry, bool *saw_error)
|
|
+ struct btree *b,
|
|
+ struct bch_io_failures *failed,
|
|
+ struct printbuf *err_msg)
|
|
{
|
|
struct btree_node_entry *bne;
|
|
struct sort_iter *iter;
|
|
@@ -1015,11 +1064,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
bool used_mempool, blacklisted;
|
|
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
|
|
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
|
|
- unsigned u64s;
|
|
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
|
|
u64 max_journal_seq = 0;
|
|
struct printbuf buf = PRINTBUF;
|
|
- int ret = 0, retry_read = 0, write = READ;
|
|
+ int ret = 0, write = READ;
|
|
u64 start_time = local_clock();
|
|
|
|
b->version_ondisk = U16_MAX;
|
|
@@ -1153,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
b->version_ondisk = min(b->version_ondisk,
|
|
le16_to_cpu(i->version));
|
|
|
|
- ret = validate_bset(c, ca, b, i, b->written, sectors,
|
|
- READ, have_retry, saw_error);
|
|
+ ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
|
|
if (ret)
|
|
goto fsck_err;
|
|
|
|
if (!b->written)
|
|
btree_node_set_format(b, b->data->format);
|
|
|
|
- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
|
|
+ ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
|
|
if (ret)
|
|
goto fsck_err;
|
|
|
|
@@ -1222,23 +1269,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
|
|
sorted->keys.u64s = 0;
|
|
|
|
- set_btree_bset(b, b->set, &b->data->keys);
|
|
-
|
|
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
|
|
memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
|
|
btree_buf_bytes(b) -
|
|
sizeof(struct btree_node) -
|
|
b->nr.live_u64s * sizeof(u64));
|
|
|
|
- u64s = le16_to_cpu(sorted->keys.u64s);
|
|
+ b->data->keys.u64s = sorted->keys.u64s;
|
|
*sorted = *b->data;
|
|
- sorted->keys.u64s = cpu_to_le16(u64s);
|
|
swap(sorted, b->data);
|
|
set_btree_bset(b, b->set, &b->data->keys);
|
|
b->nsets = 1;
|
|
b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
|
|
|
|
- BUG_ON(b->nr.live_u64s != u64s);
|
|
+ BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
|
|
|
|
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
|
|
|
|
@@ -1252,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
|
|
ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
|
|
if (ret == -BCH_ERR_fsck_delete_bkey ||
|
|
- (bch2_inject_invalid_keys &&
|
|
+ (static_branch_unlikely(&bch2_inject_invalid_keys) &&
|
|
!bversion_cmp(u.k->bversion, MAX_VERSION))) {
|
|
btree_keys_account_key_drop(&b->nr, 0, k);
|
|
|
|
@@ -1292,20 +1336,11 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
|
|
if (!ptr_written)
|
|
set_btree_node_need_rewrite(b);
|
|
-out:
|
|
+fsck_err:
|
|
mempool_free(iter, &c->fill_iter);
|
|
printbuf_exit(&buf);
|
|
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
|
|
- return retry_read;
|
|
-fsck_err:
|
|
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
|
|
- ret == -BCH_ERR_btree_node_read_err_must_retry) {
|
|
- retry_read = 1;
|
|
- } else {
|
|
- set_btree_node_read_error(b);
|
|
- bch2_btree_lost_data(c, b->c.btree_id);
|
|
- }
|
|
- goto out;
|
|
+ return ret;
|
|
}
|
|
|
|
static void btree_node_read_work(struct work_struct *work)
|
|
@@ -1317,17 +1352,28 @@ static void btree_node_read_work(struct work_struct *work)
|
|
struct btree *b = rb->b;
|
|
struct bio *bio = &rb->bio;
|
|
struct bch_io_failures failed = { .nr = 0 };
|
|
+ int ret = 0;
|
|
+
|
|
struct printbuf buf = PRINTBUF;
|
|
- bool saw_error = false;
|
|
- bool retry = false;
|
|
- bool can_retry;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "btree node read error at btree ");
|
|
+ bch2_btree_pos_to_text(&buf, c, b);
|
|
+ prt_newline(&buf);
|
|
|
|
goto start;
|
|
while (1) {
|
|
- retry = true;
|
|
- bch_info(c, "retrying read");
|
|
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
|
|
+ ret = bch2_bkey_pick_read_device(c,
|
|
+ bkey_i_to_s_c(&b->key),
|
|
+ &failed, &rb->pick, -1);
|
|
+ if (ret) {
|
|
+ set_btree_node_read_error(b);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
|
|
rb->have_ioref = ca != NULL;
|
|
+ rb->start_time = local_clock();
|
|
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
|
|
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
|
|
bio->bi_iter.bi_size = btree_buf_bytes(b);
|
|
@@ -1338,60 +1384,66 @@ static void btree_node_read_work(struct work_struct *work)
|
|
} else {
|
|
bio->bi_status = BLK_STS_REMOVED;
|
|
}
|
|
+
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
|
+ rb->start_time, !bio->bi_status);
|
|
start:
|
|
- printbuf_reset(&buf);
|
|
- bch2_btree_pos_to_text(&buf, c, b);
|
|
- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
|
|
- "btree read error %s for %s",
|
|
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
|
|
if (rb->have_ioref)
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
|
|
rb->have_ioref = false;
|
|
|
|
- bch2_mark_io_failure(&failed, &rb->pick);
|
|
-
|
|
- can_retry = bch2_bkey_pick_read_device(c,
|
|
- bkey_i_to_s_c(&b->key),
|
|
- &failed, &rb->pick) > 0;
|
|
-
|
|
- if (!bio->bi_status &&
|
|
- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
|
|
- if (retry)
|
|
- bch_info(c, "retry success");
|
|
- break;
|
|
+ if (bio->bi_status) {
|
|
+ bch2_mark_io_failure(&failed, &rb->pick, false);
|
|
+ continue;
|
|
}
|
|
|
|
- saw_error = true;
|
|
+ ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
|
|
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
|
|
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
|
|
+ continue;
|
|
|
|
- if (!can_retry) {
|
|
+ if (ret)
|
|
set_btree_node_read_error(b);
|
|
- bch2_btree_lost_data(c, b->c.btree_id);
|
|
- break;
|
|
- }
|
|
+
|
|
+ break;
|
|
}
|
|
|
|
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
|
|
- rb->start_time);
|
|
- bio_put(&rb->bio);
|
|
+ bch2_io_failures_to_text(&buf, c, &failed);
|
|
|
|
- if ((saw_error ||
|
|
+ if (btree_node_read_error(b))
|
|
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
|
|
+
|
|
+ /*
|
|
+ * only print retry success if we read from a replica with no errors
|
|
+ */
|
|
+ if (btree_node_read_error(b))
|
|
+ prt_printf(&buf, "ret %s", bch2_err_str(ret));
|
|
+ else if (failed.nr) {
|
|
+ if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
|
|
+ prt_printf(&buf, "retry success");
|
|
+ else
|
|
+ prt_printf(&buf, "repair success");
|
|
+ }
|
|
+
|
|
+ if ((failed.nr ||
|
|
btree_node_need_rewrite(b)) &&
|
|
!btree_node_read_error(b) &&
|
|
- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
|
|
- if (saw_error) {
|
|
- printbuf_reset(&buf);
|
|
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
|
- prt_str(&buf, " ");
|
|
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
- bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
|
|
- __func__, buf.buf);
|
|
- }
|
|
-
|
|
+ c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
|
|
+ prt_printf(&buf, " (rewriting node)");
|
|
bch2_btree_node_rewrite_async(c, b);
|
|
}
|
|
+ prt_newline(&buf);
|
|
|
|
+ if (failed.nr)
|
|
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
|
|
+
|
|
+ async_object_list_del(c, btree_read_bio, rb->list_idx);
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
|
|
+ rb->start_time);
|
|
+ bio_put(&rb->bio);
|
|
printbuf_exit(&buf);
|
|
clear_btree_node_read_in_flight(b);
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
|
|
}
|
|
|
|
@@ -1400,16 +1452,20 @@ static void btree_node_read_endio(struct bio *bio)
|
|
struct btree_read_bio *rb =
|
|
container_of(bio, struct btree_read_bio, bio);
|
|
struct bch_fs *c = rb->c;
|
|
+ struct bch_dev *ca = rb->have_ioref
|
|
+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
|
|
|
|
- if (rb->have_ioref) {
|
|
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
|
|
-
|
|
- bch2_latency_acct(ca, rb->start_time, READ);
|
|
- }
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
|
+ rb->start_time, !bio->bi_status);
|
|
|
|
queue_work(c->btree_read_complete_wq, &rb->work);
|
|
}
|
|
|
|
+void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
|
|
+{
|
|
+ bch2_bio_to_text(out, &rbio->bio);
|
|
+}
|
|
+
|
|
struct btree_node_read_all {
|
|
struct closure cl;
|
|
struct bch_fs *c;
|
|
@@ -1469,12 +1525,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
|
|
struct btree *b = ra->b;
|
|
struct printbuf buf = PRINTBUF;
|
|
bool dump_bset_maps = false;
|
|
- bool have_retry = false;
|
|
int ret = 0, best = -1, write = READ;
|
|
unsigned i, written = 0, written2 = 0;
|
|
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
|
|
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
|
|
bool _saw_error = false, *saw_error = &_saw_error;
|
|
+ struct printbuf *err_msg = NULL;
|
|
+ struct bch_io_failures *failed = NULL;
|
|
|
|
for (i = 0; i < ra->nr; i++) {
|
|
struct btree_node *bn = ra->buf[i];
|
|
@@ -1567,14 +1624,19 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
|
|
|
|
if (best >= 0) {
|
|
memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
|
|
- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
|
|
+ ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
|
|
} else {
|
|
ret = -1;
|
|
}
|
|
|
|
if (ret) {
|
|
set_btree_node_read_error(b);
|
|
- bch2_btree_lost_data(c, b->c.btree_id);
|
|
+
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
|
|
+ if (buf.pos)
|
|
+ bch_err(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
} else if (*saw_error)
|
|
bch2_btree_node_rewrite_async(c, b);
|
|
|
|
@@ -1588,6 +1650,7 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
|
|
printbuf_exit(&buf);
|
|
|
|
clear_btree_node_read_in_flight(b);
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
|
|
}
|
|
|
|
@@ -1602,6 +1665,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
|
|
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
|
|
|
|
bch2_latency_acct(ca, rb->start_time, READ);
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
|
|
}
|
|
|
|
ra->err[rb->idx] = bio->bi_status;
|
|
@@ -1641,7 +1706,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
|
|
|
|
i = 0;
|
|
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
|
|
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
|
|
struct btree_read_bio *rb =
|
|
container_of(ra->bio[i], struct btree_read_bio, bio);
|
|
rb->c = c;
|
|
@@ -1692,33 +1758,42 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
|
|
|
|
trace_and_count(c, btree_node_read, trans, b);
|
|
|
|
- if (bch2_verify_all_btree_replicas &&
|
|
+ if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
|
|
!btree_node_read_all_replicas(c, b, sync))
|
|
return;
|
|
|
|
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
|
|
- NULL, &pick);
|
|
+ NULL, &pick, -1);
|
|
|
|
if (ret <= 0) {
|
|
+ bool ratelimit = true;
|
|
struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
|
|
prt_str(&buf, "btree node read error: no device to read from\n at ");
|
|
bch2_btree_pos_to_text(&buf, c, b);
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
-
|
|
- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
|
|
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
|
|
- bch2_fatal_error(c);
|
|
+ prt_newline(&buf);
|
|
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
|
|
+
|
|
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
|
|
+ bch2_fs_emergency_read_only2(c, &buf))
|
|
+ ratelimit = false;
|
|
+
|
|
+ static DEFINE_RATELIMIT_STATE(rs,
|
|
+ DEFAULT_RATELIMIT_INTERVAL,
|
|
+ DEFAULT_RATELIMIT_BURST);
|
|
+ if (!ratelimit || __ratelimit(&rs))
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
|
|
set_btree_node_read_error(b);
|
|
- bch2_btree_lost_data(c, b->c.btree_id);
|
|
clear_btree_node_read_in_flight(b);
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
|
|
- printbuf_exit(&buf);
|
|
return;
|
|
}
|
|
|
|
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
|
|
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
|
|
|
|
bio = bio_alloc_bioset(NULL,
|
|
buf_pages(b->data, btree_buf_bytes(b)),
|
|
@@ -1737,6 +1812,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
|
|
bio->bi_end_io = btree_node_read_endio;
|
|
bch2_bio_map(bio, b->data, btree_buf_bytes(b));
|
|
|
|
+ async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
|
|
+
|
|
if (rb->have_ioref) {
|
|
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
|
|
bio_sectors(bio));
|
|
@@ -1811,6 +1888,192 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
|
|
return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
|
|
}
|
|
|
|
+struct btree_node_scrub {
|
|
+ struct bch_fs *c;
|
|
+ struct bch_dev *ca;
|
|
+ void *buf;
|
|
+ bool used_mempool;
|
|
+ unsigned written;
|
|
+
|
|
+ enum btree_id btree;
|
|
+ unsigned level;
|
|
+ struct bkey_buf key;
|
|
+ __le64 seq;
|
|
+
|
|
+ struct work_struct work;
|
|
+ struct bio bio;
|
|
+};
|
|
+
|
|
+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
|
|
+ struct printbuf *err)
|
|
+{
|
|
+ unsigned written = 0;
|
|
+
|
|
+ if (le64_to_cpu(data->magic) != bset_magic(c)) {
|
|
+ prt_printf(err, "bad magic: want %llx, got %llx",
|
|
+ bset_magic(c), le64_to_cpu(data->magic));
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ while (written < (ptr_written ?: btree_sectors(c))) {
|
|
+ struct btree_node_entry *bne;
|
|
+ struct bset *i;
|
|
+ bool first = !written;
|
|
+
|
|
+ if (first) {
|
|
+ bne = NULL;
|
|
+ i = &data->keys;
|
|
+ } else {
|
|
+ bne = (void *) data + (written << 9);
|
|
+ i = &bne->keys;
|
|
+
|
|
+ if (!ptr_written && i->seq != data->keys.seq)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ struct nonce nonce = btree_nonce(i, written << 9);
|
|
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
|
|
+
|
|
+ if (first) {
|
|
+ if (good_csum_type) {
|
|
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
|
|
+ if (bch2_crc_cmp(data->csum, csum)) {
|
|
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ written += vstruct_sectors(data, c->block_bits);
|
|
+ } else {
|
|
+ if (good_csum_type) {
|
|
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
|
|
+ if (bch2_crc_cmp(bne->csum, csum)) {
|
|
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ written += vstruct_sectors(bne, c->block_bits);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void btree_node_scrub_work(struct work_struct *work)
|
|
+{
|
|
+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
|
|
+ struct bch_fs *c = scrub->c;
|
|
+ struct printbuf err = PRINTBUF;
|
|
+
|
|
+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
|
|
+ bkey_i_to_s_c(scrub->key.k));
|
|
+ prt_newline(&err);
|
|
+
|
|
+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
|
|
+ struct btree_trans *trans = bch2_trans_get(c);
|
|
+
|
|
+ struct btree_iter iter;
|
|
+ bch2_trans_node_iter_init(trans, &iter, scrub->btree,
|
|
+ scrub->key.k->k.p, 0, scrub->level - 1, 0);
|
|
+
|
|
+ struct btree *b;
|
|
+ int ret = lockrestart_do(trans,
|
|
+ PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
|
|
+ bch_err(c, "error validating btree node during scrub on %s at btree %s",
|
|
+ scrub->ca->name, err.buf);
|
|
+
|
|
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0);
|
|
+ }
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ bch2_trans_begin(trans);
|
|
+ bch2_trans_put(trans);
|
|
+ }
|
|
+
|
|
+ printbuf_exit(&err);
|
|
+ bch2_bkey_buf_exit(&scrub->key, c);;
|
|
+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
|
|
+ enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
|
|
+ kfree(scrub);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
|
|
+}
|
|
+
|
|
+static void btree_node_scrub_endio(struct bio *bio)
|
|
+{
|
|
+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
|
|
+
|
|
+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
|
|
+}
|
|
+
|
|
+int bch2_btree_node_scrub(struct btree_trans *trans,
|
|
+ enum btree_id btree, unsigned level,
|
|
+ struct bkey_s_c k, unsigned dev)
|
|
+{
|
|
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
|
|
+ return 0;
|
|
+
|
|
+ struct bch_fs *c = trans->c;
|
|
+
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
|
|
+ return -BCH_ERR_erofs_no_writes;
|
|
+
|
|
+ struct extent_ptr_decoded pick;
|
|
+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
|
|
+ if (ret <= 0)
|
|
+ goto err;
|
|
+
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
|
|
+ BCH_DEV_READ_REF_btree_node_scrub);
|
|
+ if (!ca) {
|
|
+ ret = -BCH_ERR_device_offline;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bool used_mempool = false;
|
|
+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
|
|
+
|
|
+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
|
|
+
|
|
+ struct btree_node_scrub *scrub =
|
|
+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
|
|
+ if (!scrub) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err_free;
|
|
+ }
|
|
+
|
|
+ scrub->c = c;
|
|
+ scrub->ca = ca;
|
|
+ scrub->buf = buf;
|
|
+ scrub->used_mempool = used_mempool;
|
|
+ scrub->written = btree_ptr_sectors_written(k);
|
|
+
|
|
+ scrub->btree = btree;
|
|
+ scrub->level = level;
|
|
+ bch2_bkey_buf_init(&scrub->key);
|
|
+ bch2_bkey_buf_reassemble(&scrub->key, c, k);
|
|
+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
|
|
+
|
|
+ INIT_WORK(&scrub->work, btree_node_scrub_work);
|
|
+
|
|
+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
|
|
+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
|
|
+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
|
|
+ scrub->bio.bi_end_io = btree_node_scrub_endio;
|
|
+ submit_bio(&scrub->bio);
|
|
+ return 0;
|
|
+err_free:
|
|
+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
|
|
+err:
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
|
|
struct btree_write *w)
|
|
{
|
|
@@ -1831,7 +2094,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
|
|
bch2_journal_pin_drop(&c->journal, &w->journal);
|
|
}
|
|
|
|
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
|
|
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
|
|
{
|
|
struct btree_write *w = btree_prev_write(b);
|
|
unsigned long old, new;
|
|
@@ -1839,6 +2102,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
|
|
|
|
bch2_btree_complete_write(c, b, w);
|
|
|
|
+ if (start_time)
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
|
|
+
|
|
old = READ_ONCE(b->flags);
|
|
do {
|
|
new = old;
|
|
@@ -1865,11 +2131,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
|
|
|
|
if (new & (1U << BTREE_NODE_write_in_flight))
|
|
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
|
|
- else
|
|
+ else {
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
|
|
+ }
|
|
}
|
|
|
|
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
|
|
+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
|
|
@@ -1877,7 +2145,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
|
|
|
|
/* we don't need transaction context anymore after we got the lock. */
|
|
bch2_trans_put(trans);
|
|
- __btree_node_write_done(c, b);
|
|
+ __btree_node_write_done(c, b, start_time);
|
|
six_unlock_read(&b->c.lock);
|
|
}
|
|
|
|
@@ -1887,6 +2155,7 @@ static void btree_node_write_work(struct work_struct *work)
|
|
container_of(work, struct btree_write_bio, work);
|
|
struct bch_fs *c = wbio->wbio.c;
|
|
struct btree *b = wbio->wbio.bio.bi_private;
|
|
+ u64 start_time = wbio->start_time;
|
|
int ret = 0;
|
|
|
|
btree_bounce_free(c,
|
|
@@ -1918,13 +2187,20 @@ static void btree_node_write_work(struct work_struct *work)
|
|
goto err;
|
|
}
|
|
out:
|
|
+ async_object_list_del(c, btree_write_bio, wbio->list_idx);
|
|
bio_put(&wbio->wbio.bio);
|
|
- btree_node_write_done(c, b);
|
|
+ btree_node_write_done(c, b, start_time);
|
|
return;
|
|
err:
|
|
set_btree_node_noevict(b);
|
|
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
|
|
- "writing btree node: %s", bch2_err_str(ret));
|
|
+
|
|
+ if (!bch2_err_matches(ret, EROFS)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
|
|
+ bch2_btree_pos_to_text(&buf, c, b);
|
|
+ bch2_fs_fatal_error(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
goto out;
|
|
}
|
|
|
|
@@ -1937,23 +2213,34 @@ static void btree_node_write_endio(struct bio *bio)
|
|
struct bch_fs *c = wbio->c;
|
|
struct btree *b = wbio->bio.bi_private;
|
|
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
|
|
- unsigned long flags;
|
|
|
|
- if (wbio->have_ioref)
|
|
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
|
|
+ wbio->submit_time, !bio->bi_status);
|
|
+
|
|
+ if (ca && bio->bi_status) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ buf.atomic++;
|
|
+ prt_printf(&buf, "btree write error: %s\n ",
|
|
+ bch2_blk_status_to_str(bio->bi_status));
|
|
+ bch2_btree_pos_to_text(&buf, c, b);
|
|
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
|
|
- if (!ca ||
|
|
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
|
|
- "btree write error: %s",
|
|
- bch2_blk_status_to_str(bio->bi_status)) ||
|
|
- bch2_meta_write_fault("btree")) {
|
|
+ if (bio->bi_status) {
|
|
+ unsigned long flags;
|
|
spin_lock_irqsave(&c->btree_write_error_lock, flags);
|
|
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
|
|
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
|
|
}
|
|
|
|
+ /*
|
|
+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
|
|
+ * btree writes yet (due to device removal/ro):
|
|
+ */
|
|
if (wbio->have_ioref)
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_btree_node_write);
|
|
|
|
if (parent) {
|
|
bio_put(bio);
|
|
@@ -1962,16 +2249,15 @@ static void btree_node_write_endio(struct bio *bio)
|
|
}
|
|
|
|
clear_btree_node_write_in_flight_inner(b);
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
|
|
INIT_WORK(&wb->work, btree_node_write_work);
|
|
- queue_work(c->btree_io_complete_wq, &wb->work);
|
|
+ queue_work(c->btree_write_complete_wq, &wb->work);
|
|
}
|
|
|
|
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
|
|
struct bset *i, unsigned sectors)
|
|
{
|
|
- bool saw_error;
|
|
-
|
|
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
|
|
(struct bkey_validate_context) {
|
|
.from = BKEY_VALIDATE_btree_node,
|
|
@@ -1984,8 +2270,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
|
|
return ret;
|
|
}
|
|
|
|
- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
|
|
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
|
|
+ ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
|
|
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
|
|
if (ret) {
|
|
bch2_inconsistent_error(c);
|
|
dump_stack();
|
|
@@ -2023,6 +2309,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
|
|
bool validate_before_checksum = false;
|
|
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
|
|
void *data;
|
|
+ u64 start_time = local_clock();
|
|
int ret;
|
|
|
|
if (flags & BTREE_WRITE_ALREADY_STARTED)
|
|
@@ -2231,6 +2518,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
|
|
wbio->data = data;
|
|
wbio->data_bytes = bytes;
|
|
wbio->sector_offset = b->written;
|
|
+ wbio->start_time = start_time;
|
|
wbio->wbio.c = c;
|
|
wbio->wbio.used_mempool = used_mempool;
|
|
wbio->wbio.first_btree_write = !b->written;
|
|
@@ -2250,6 +2538,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
|
|
atomic64_inc(&c->btree_write_stats[type].nr);
|
|
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
|
|
|
|
+ async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
|
|
+
|
|
INIT_WORK(&wbio->work, btree_write_submit);
|
|
queue_work(c->btree_write_submit_wq, &wbio->work);
|
|
return;
|
|
@@ -2258,7 +2548,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
|
|
b->written += sectors_to_write;
|
|
nowrite:
|
|
btree_bounce_free(c, bytes, used_mempool, data);
|
|
- __btree_node_write_done(c, b);
|
|
+ __btree_node_write_done(c, b, 0);
|
|
}
|
|
|
|
/*
|
|
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
|
|
index 6f9e4a6dacf7..30a5180532c8 100644
|
|
--- a/fs/bcachefs/btree_io.h
|
|
+++ b/fs/bcachefs/btree_io.h
|
|
@@ -41,6 +41,9 @@ struct btree_read_bio {
|
|
u64 start_time;
|
|
unsigned have_ioref:1;
|
|
unsigned idx:7;
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ unsigned list_idx;
|
|
+#endif
|
|
struct extent_ptr_decoded pick;
|
|
struct work_struct work;
|
|
struct bio bio;
|
|
@@ -52,6 +55,10 @@ struct btree_write_bio {
|
|
void *data;
|
|
unsigned data_bytes;
|
|
unsigned sector_offset;
|
|
+ u64 start_time;
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ unsigned list_idx;
|
|
+#endif
|
|
struct bch_write_bio wbio;
|
|
};
|
|
|
|
@@ -127,11 +134,18 @@ void bch2_btree_build_aux_trees(struct btree *);
|
|
void bch2_btree_init_next(struct btree_trans *, struct btree *);
|
|
|
|
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
|
|
- struct btree *, bool, bool *);
|
|
+ struct btree *,
|
|
+ struct bch_io_failures *,
|
|
+ struct printbuf *);
|
|
void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
|
|
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
|
|
const struct bkey_i *, unsigned);
|
|
|
|
+void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
|
|
+
|
|
+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
|
|
+ struct bkey_s_c, unsigned);
|
|
+
|
|
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
|
|
|
|
enum btree_write_flags {
|
|
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
|
|
index e32fce4fd258..0f0b80c8c29a 100644
|
|
--- a/fs/bcachefs/btree_iter.c
|
|
+++ b/fs/bcachefs/btree_iter.c
|
|
@@ -16,6 +16,7 @@
|
|
#include "journal_io.h"
|
|
#include "replicas.h"
|
|
#include "snapshot.h"
|
|
+#include "super.h"
|
|
#include "trace.h"
|
|
|
|
#include <linux/random.h>
|
|
@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
|
|
!btree_path_pos_after_node(path, b);
|
|
}
|
|
|
|
-/* Btree iterator: */
|
|
+/* Debug: */
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-
|
|
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
|
|
+static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
|
|
struct btree_path *path)
|
|
{
|
|
struct bkey_cached *ck;
|
|
@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
|
|
btree_node_unlock(trans, path, 0);
|
|
}
|
|
|
|
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
|
|
+static void __bch2_btree_path_verify_level(struct btree_trans *trans,
|
|
struct btree_path *path, unsigned level)
|
|
{
|
|
struct btree_path_level *l;
|
|
@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
|
|
struct printbuf buf3 = PRINTBUF;
|
|
const char *msg;
|
|
|
|
- if (!bch2_debug_check_iterators)
|
|
- return;
|
|
-
|
|
l = &path->l[level];
|
|
tmp = l->iter;
|
|
locked = btree_node_locked(path, level);
|
|
|
|
if (path->cached) {
|
|
if (!level)
|
|
- bch2_btree_path_verify_cached(trans, path);
|
|
+ __bch2_btree_path_verify_cached(trans, path);
|
|
return;
|
|
}
|
|
|
|
@@ -217,7 +213,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
|
|
msg, level, buf1.buf, buf2.buf, buf3.buf);
|
|
}
|
|
|
|
-static void bch2_btree_path_verify(struct btree_trans *trans,
|
|
+static void __bch2_btree_path_verify(struct btree_trans *trans,
|
|
struct btree_path *path)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
@@ -229,25 +225,23 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
|
|
break;
|
|
}
|
|
|
|
- bch2_btree_path_verify_level(trans, path, i);
|
|
+ __bch2_btree_path_verify_level(trans, path, i);
|
|
}
|
|
|
|
bch2_btree_path_verify_locks(path);
|
|
}
|
|
|
|
-void bch2_trans_verify_paths(struct btree_trans *trans)
|
|
+void __bch2_trans_verify_paths(struct btree_trans *trans)
|
|
{
|
|
struct btree_path *path;
|
|
unsigned iter;
|
|
|
|
trans_for_each_path(trans, path, iter)
|
|
- bch2_btree_path_verify(trans, path);
|
|
+ __bch2_btree_path_verify(trans, path);
|
|
}
|
|
|
|
-static void bch2_btree_iter_verify(struct btree_iter *iter)
|
|
+static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
-
|
|
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
|
|
|
|
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
|
|
@@ -258,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
|
|
!btree_type_has_snapshot_field(iter->btree_id));
|
|
|
|
if (iter->update_path)
|
|
- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
|
|
- bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
|
|
+ __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
|
|
+ __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
|
|
}
|
|
|
|
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
|
|
+static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
|
|
{
|
|
BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
|
|
!iter->pos.snapshot);
|
|
@@ -276,16 +270,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
|
|
bkey_gt(iter->pos, iter->k.p)));
|
|
}
|
|
|
|
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
|
|
+static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
|
|
+ struct btree_iter *iter, struct bkey_s_c k)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct btree_iter copy;
|
|
struct bkey_s_c prev;
|
|
int ret = 0;
|
|
|
|
- if (!bch2_debug_check_iterators)
|
|
- return 0;
|
|
-
|
|
if (!(iter->flags & BTREE_ITER_filter_snapshots))
|
|
return 0;
|
|
|
|
@@ -299,7 +290,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
|
|
bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos,
|
|
BTREE_ITER_nopreserve|
|
|
BTREE_ITER_all_snapshots);
|
|
- prev = bch2_btree_iter_prev(©);
|
|
+ prev = bch2_btree_iter_prev(trans, ©);
|
|
if (!prev.k)
|
|
goto out;
|
|
|
|
@@ -326,7 +317,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
|
|
return ret;
|
|
}
|
|
|
|
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
|
|
+void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
|
|
struct bpos pos)
|
|
{
|
|
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
|
@@ -359,17 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
|
|
panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
|
|
}
|
|
|
|
-#else
|
|
-
|
|
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
|
|
- struct btree_path *path, unsigned l) {}
|
|
+ struct btree_path *path, unsigned l)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
+ __bch2_btree_path_verify_level(trans, path, l);
|
|
+}
|
|
+
|
|
static inline void bch2_btree_path_verify(struct btree_trans *trans,
|
|
- struct btree_path *path) {}
|
|
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
|
|
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
|
|
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
|
|
+ struct btree_path *path)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
+ __bch2_btree_path_verify(trans, path);
|
|
+}
|
|
|
|
-#endif
|
|
+static inline void bch2_btree_iter_verify(struct btree_trans *trans,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
+ __bch2_btree_iter_verify(trans, iter);
|
|
+}
|
|
+
|
|
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
+ __bch2_btree_iter_verify_entry_exit(iter);
|
|
+}
|
|
+
|
|
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ return static_branch_unlikely(&bch2_debug_check_iterators)
|
|
+ ? __bch2_btree_iter_verify_ret(trans, iter, k)
|
|
+ : 0;
|
|
+}
|
|
|
|
/* Btree path: fixups after btree updates */
|
|
|
|
@@ -523,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
|
|
__bch2_btree_node_iter_fix(path, b, node_iter, t,
|
|
where, clobber_u64s, new_u64s);
|
|
|
|
- if (bch2_debug_check_iterators)
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
bch2_btree_node_iter_verify(node_iter, b);
|
|
}
|
|
|
|
@@ -562,20 +576,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
|
|
bch2_btree_node_iter_peek_all(&l->iter, l->b));
|
|
}
|
|
|
|
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
|
|
- struct btree_path *path,
|
|
- struct btree_path_level *l,
|
|
- struct bkey *u)
|
|
-{
|
|
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
|
|
- bch2_btree_node_iter_peek(&l->iter, l->b));
|
|
-
|
|
- path->pos = k.k ? k.k->p : l->b->key.k.p;
|
|
- trans->paths_sorted = false;
|
|
- bch2_btree_path_verify_level(trans, path, l - path->l);
|
|
- return k;
|
|
-}
|
|
-
|
|
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
|
|
struct btree_path *path,
|
|
struct btree_path_level *l,
|
|
@@ -1176,7 +1176,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
|
|
}
|
|
|
|
if (path->cached) {
|
|
- ret = bch2_btree_path_traverse_cached(trans, path, flags);
|
|
+ ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
|
|
goto out;
|
|
}
|
|
|
|
@@ -1499,24 +1499,16 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
|
|
prt_newline(buf);
|
|
}
|
|
|
|
- for (struct jset_entry *e = trans->journal_entries;
|
|
+ for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
|
|
e != btree_trans_journal_entries_top(trans);
|
|
- e = vstruct_next(e))
|
|
+ e = vstruct_next(e)) {
|
|
bch2_journal_entry_to_text(buf, trans->c, e);
|
|
+ prt_newline(buf);
|
|
+ }
|
|
|
|
printbuf_indent_sub(buf, 2);
|
|
}
|
|
|
|
-noinline __cold
|
|
-void bch2_dump_trans_updates(struct btree_trans *trans)
|
|
-{
|
|
- struct printbuf buf = PRINTBUF;
|
|
-
|
|
- bch2_trans_updates_to_text(&buf, trans);
|
|
- bch2_print_str(trans->c, buf.buf);
|
|
- printbuf_exit(&buf);
|
|
-}
|
|
-
|
|
static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
|
|
{
|
|
struct btree_path *path = trans->paths + path_idx;
|
|
@@ -1613,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
|
|
__bch2_trans_paths_to_text(&buf, trans, nosort);
|
|
bch2_trans_updates_to_text(&buf, trans);
|
|
|
|
- bch2_print_str(trans->c, buf.buf);
|
|
+ bch2_print_str(trans->c, KERN_ERR, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
@@ -1877,10 +1869,8 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
|
|
return (struct bkey_s_c) { u, NULL };
|
|
}
|
|
|
|
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
|
|
+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
-
|
|
if (!iter->path || trans->restarted)
|
|
return;
|
|
|
|
@@ -1892,17 +1882,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
|
|
/* Btree iterators: */
|
|
|
|
int __must_check
|
|
-__bch2_btree_iter_traverse(struct btree_iter *iter)
|
|
+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
|
|
+ return bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
}
|
|
|
|
int __must_check
|
|
-bch2_btree_iter_traverse(struct btree_iter *iter)
|
|
+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
- int ret;
|
|
-
|
|
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
|
|
|
iter->path = bch2_btree_path_set_pos(trans, iter->path,
|
|
@@ -1910,7 +1897,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
|
|
iter->flags & BTREE_ITER_intent,
|
|
btree_iter_ip_allocated(iter));
|
|
|
|
- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
|
|
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -1922,14 +1909,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
|
|
|
|
/* Iterate across nodes (leaf and interior nodes) */
|
|
|
|
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
|
|
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
|
|
+ struct btree_iter *iter)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct btree *b = NULL;
|
|
int ret;
|
|
|
|
EBUG_ON(trans->paths[iter->path].cached);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
|
|
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
if (ret)
|
|
@@ -1951,7 +1938,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
|
|
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
|
|
out:
|
|
bch2_btree_iter_verify_entry_exit(iter);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
|
|
return b;
|
|
err:
|
|
@@ -1960,26 +1947,26 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
|
|
}
|
|
|
|
/* Only kept for -tools */
|
|
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
|
|
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
|
|
+ struct btree_iter *iter)
|
|
{
|
|
struct btree *b;
|
|
|
|
- while (b = bch2_btree_iter_peek_node(iter),
|
|
+ while (b = bch2_btree_iter_peek_node(trans, iter),
|
|
bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
|
|
- bch2_trans_begin(iter->trans);
|
|
+ bch2_trans_begin(trans);
|
|
|
|
return b;
|
|
}
|
|
|
|
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
|
|
+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct btree *b = NULL;
|
|
int ret;
|
|
|
|
EBUG_ON(trans->paths[iter->path].cached);
|
|
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
|
|
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
if (ret)
|
|
@@ -1998,6 +1985,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
|
|
return NULL;
|
|
}
|
|
|
|
+ /*
|
|
+ * We don't correctly handle nodes with extra intent locks here:
|
|
+ * downgrade so we don't violate locking invariants
|
|
+ */
|
|
+ bch2_btree_path_downgrade(trans, path);
|
|
+
|
|
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
|
|
__bch2_btree_path_unlock(trans, path);
|
|
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
|
|
@@ -2046,7 +2039,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
|
|
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
|
|
out:
|
|
bch2_btree_iter_verify_entry_exit(iter);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
|
|
return b;
|
|
err:
|
|
@@ -2056,7 +2049,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
|
|
|
|
/* Iterate across keys (in leaf nodes only) */
|
|
|
|
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
|
|
+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
struct bpos pos = iter->k.p;
|
|
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
|
|
@@ -2065,11 +2058,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
|
|
|
|
if (ret && !(iter->flags & BTREE_ITER_is_extents))
|
|
pos = bkey_successor(iter, pos);
|
|
- bch2_btree_iter_set_pos(iter, pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, pos);
|
|
return ret;
|
|
}
|
|
|
|
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
|
|
+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
struct bpos pos = bkey_start_pos(&iter->k);
|
|
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
|
|
@@ -2078,7 +2071,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
|
|
|
|
if (ret && !(iter->flags & BTREE_ITER_is_extents))
|
|
pos = bkey_predecessor(iter, pos);
|
|
- bch2_btree_iter_set_pos(iter, pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, pos);
|
|
return ret;
|
|
}
|
|
|
|
@@ -2205,9 +2198,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
|
|
* bkey_s_c_null:
|
|
*/
|
|
static noinline
|
|
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
|
|
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos pos)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct bch_fs *c = trans->c;
|
|
struct bkey u;
|
|
struct bkey_s_c k;
|
|
@@ -2253,14 +2246,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
|
|
return k;
|
|
}
|
|
|
|
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
|
|
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos search_key)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct bkey_s_c k, k2;
|
|
int ret;
|
|
|
|
EBUG_ON(btree_iter_path(trans, iter)->cached);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
|
|
while (1) {
|
|
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
|
|
@@ -2270,7 +2263,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
|
|
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
if (unlikely(ret)) {
|
|
/* ensure that iter->k is consistent with iter->pos: */
|
|
- bch2_btree_iter_set_pos(iter, iter->pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
|
|
k = bkey_s_c_err(ret);
|
|
break;
|
|
}
|
|
@@ -2280,7 +2273,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
|
|
|
|
if (unlikely(!l->b)) {
|
|
/* No btree nodes at requested level: */
|
|
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
|
|
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
|
|
k = bkey_s_c_null;
|
|
break;
|
|
}
|
|
@@ -2291,10 +2284,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
|
|
|
|
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
|
|
k.k &&
|
|
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
|
|
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
|
|
k = k2;
|
|
if (bkey_err(k)) {
|
|
- bch2_btree_iter_set_pos(iter, iter->pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
|
|
break;
|
|
}
|
|
}
|
|
@@ -2327,27 +2320,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
|
|
search_key = bpos_successor(l->b->key.k.p);
|
|
} else {
|
|
/* End of btree: */
|
|
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
|
|
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
|
|
k = bkey_s_c_null;
|
|
break;
|
|
}
|
|
}
|
|
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
return k;
|
|
}
|
|
|
|
/**
|
|
* bch2_btree_iter_peek_max() - returns first key greater than or equal to
|
|
* iterator's current position
|
|
+ * @trans: btree transaction object
|
|
* @iter: iterator to peek from
|
|
* @end: search limit: returns keys less than or equal to @end
|
|
*
|
|
* Returns: key if found, or an error extractable with bkey_err().
|
|
*/
|
|
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
|
|
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos end)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct bpos search_key = btree_iter_search_key(iter);
|
|
struct bkey_s_c k;
|
|
struct bpos iter_pos = iter->pos;
|
|
@@ -2370,7 +2364,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
|
|
}
|
|
|
|
while (1) {
|
|
- k = __bch2_btree_iter_peek(iter, search_key);
|
|
+ k = __bch2_btree_iter_peek(trans, iter, search_key);
|
|
if (unlikely(!k.k))
|
|
goto end;
|
|
if (unlikely(bkey_err(k)))
|
|
@@ -2484,9 +2478,9 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
|
|
if (!(iter->flags & BTREE_ITER_all_snapshots))
|
|
iter->pos.snapshot = iter->snapshot;
|
|
|
|
- ret = bch2_btree_iter_verify_ret(iter, k);
|
|
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
|
|
if (unlikely(ret)) {
|
|
- bch2_btree_iter_set_pos(iter, iter->pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
|
|
k = bkey_s_c_err(ret);
|
|
}
|
|
|
|
@@ -2494,7 +2488,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
|
|
|
|
return k;
|
|
end:
|
|
- bch2_btree_iter_set_pos(iter, end);
|
|
+ bch2_btree_iter_set_pos(trans, iter, end);
|
|
k = bkey_s_c_null;
|
|
goto out_no_locked;
|
|
}
|
|
@@ -2502,24 +2496,25 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
|
|
/**
|
|
* bch2_btree_iter_next() - returns first key greater than iterator's current
|
|
* position
|
|
+ * @trans: btree transaction object
|
|
* @iter: iterator to peek from
|
|
*
|
|
* Returns: key if found, or an error extractable with bkey_err().
|
|
*/
|
|
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
|
|
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- if (!bch2_btree_iter_advance(iter))
|
|
+ if (!bch2_btree_iter_advance(trans, iter))
|
|
return bkey_s_c_null;
|
|
|
|
- return bch2_btree_iter_peek(iter);
|
|
+ return bch2_btree_iter_peek(trans, iter);
|
|
}
|
|
|
|
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
|
|
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos search_key)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct bkey_s_c k, k2;
|
|
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
|
|
while (1) {
|
|
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
|
|
@@ -2529,7 +2524,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
|
|
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
if (unlikely(ret)) {
|
|
/* ensure that iter->k is consistent with iter->pos: */
|
|
- bch2_btree_iter_set_pos(iter, iter->pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
|
|
k = bkey_s_c_err(ret);
|
|
break;
|
|
}
|
|
@@ -2539,7 +2534,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
|
|
|
|
if (unlikely(!l->b)) {
|
|
/* No btree nodes at requested level: */
|
|
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
|
|
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
|
|
k = bkey_s_c_null;
|
|
break;
|
|
}
|
|
@@ -2555,10 +2550,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
|
|
|
|
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
|
|
k.k &&
|
|
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
|
|
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
|
|
k = k2;
|
|
if (bkey_err(k2)) {
|
|
- bch2_btree_iter_set_pos(iter, iter->pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
|
|
break;
|
|
}
|
|
}
|
|
@@ -2579,28 +2574,33 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
|
|
search_key = bpos_predecessor(path->l[0].b->data->min_key);
|
|
} else {
|
|
/* Start of btree: */
|
|
- bch2_btree_iter_set_pos(iter, POS_MIN);
|
|
+ bch2_btree_iter_set_pos(trans, iter, POS_MIN);
|
|
k = bkey_s_c_null;
|
|
break;
|
|
}
|
|
}
|
|
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
return k;
|
|
}
|
|
|
|
/**
|
|
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
|
|
* iterator's current position
|
|
+ * @trans: btree transaction object
|
|
* @iter: iterator to peek from
|
|
* @end: search limit: returns keys greater than or equal to @end
|
|
*
|
|
* Returns: key if found, or an error extractable with bkey_err().
|
|
*/
|
|
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
|
|
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos end)
|
|
{
|
|
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
|
|
- !bkey_eq(iter->pos, POS_MAX)) {
|
|
+ !bkey_eq(iter->pos, POS_MAX) &&
|
|
+ !((iter->flags & BTREE_ITER_is_extents) &&
|
|
+ iter->pos.offset == U64_MAX)) {
|
|
+
|
|
/*
|
|
* bkey_start_pos(), for extents, is not monotonically
|
|
* increasing until after filtering for snapshots:
|
|
@@ -2609,7 +2609,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
|
|
* real visible extents - easiest to just use peek_slot() (which
|
|
* internally uses peek() for extents)
|
|
*/
|
|
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
|
|
if (bkey_err(k))
|
|
return k;
|
|
|
|
@@ -2619,14 +2619,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
|
|
return k;
|
|
}
|
|
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct bpos search_key = iter->pos;
|
|
struct bkey_s_c k;
|
|
btree_path_idx_t saved_path = 0;
|
|
|
|
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
|
bch2_btree_iter_verify_entry_exit(iter);
|
|
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
|
|
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
|
|
|
|
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
|
|
if (unlikely(ret)) {
|
|
@@ -2635,7 +2634,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
|
|
}
|
|
|
|
while (1) {
|
|
- k = __bch2_btree_iter_peek_prev(iter, search_key);
|
|
+ k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
|
|
if (unlikely(!k.k))
|
|
goto end;
|
|
if (unlikely(bkey_err(k)))
|
|
@@ -2726,10 +2725,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
|
|
bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
|
|
|
|
bch2_btree_iter_verify_entry_exit(iter);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
return k;
|
|
end:
|
|
- bch2_btree_iter_set_pos(iter, end);
|
|
+ bch2_btree_iter_set_pos(trans, iter, end);
|
|
k = bkey_s_c_null;
|
|
goto out_no_locked;
|
|
}
|
|
@@ -2737,34 +2736,34 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
|
|
/**
|
|
* bch2_btree_iter_prev() - returns first key less than iterator's current
|
|
* position
|
|
+ * @trans: btree transaction object
|
|
* @iter: iterator to peek from
|
|
*
|
|
* Returns: key if found, or an error extractable with bkey_err().
|
|
*/
|
|
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
|
|
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- if (!bch2_btree_iter_rewind(iter))
|
|
+ if (!bch2_btree_iter_rewind(trans, iter))
|
|
return bkey_s_c_null;
|
|
|
|
- return bch2_btree_iter_peek_prev(iter);
|
|
+ return bch2_btree_iter_peek_prev(trans, iter);
|
|
}
|
|
|
|
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
struct bpos search_key;
|
|
struct bkey_s_c k;
|
|
int ret;
|
|
|
|
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
|
- bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
bch2_btree_iter_verify_entry_exit(iter);
|
|
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
|
|
|
|
ret = trans_maybe_inject_restart(trans, _RET_IP_);
|
|
if (unlikely(ret)) {
|
|
k = bkey_s_c_err(ret);
|
|
- goto out_no_locked;
|
|
+ goto out;
|
|
}
|
|
|
|
/* extents can't span inode numbers: */
|
|
@@ -2773,7 +2772,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
if (iter->pos.inode == KEY_INODE_MAX)
|
|
return bkey_s_c_null;
|
|
|
|
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
|
|
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
|
|
}
|
|
|
|
search_key = btree_iter_search_key(iter);
|
|
@@ -2784,13 +2783,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
|
if (unlikely(ret)) {
|
|
k = bkey_s_c_err(ret);
|
|
- goto out_no_locked;
|
|
+ goto out;
|
|
}
|
|
|
|
struct btree_path *path = btree_iter_path(trans, iter);
|
|
if (unlikely(!btree_path_node(path, path->level)))
|
|
return bkey_s_c_null;
|
|
|
|
+ btree_path_set_should_be_locked(trans, path);
|
|
+
|
|
if ((iter->flags & BTREE_ITER_cached) ||
|
|
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
|
|
k = bkey_s_c_null;
|
|
@@ -2807,16 +2808,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
goto out;
|
|
|
|
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
|
|
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
|
|
+ (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
|
|
if (!bkey_err(k))
|
|
iter->k = *k.k;
|
|
/* We're not returning a key from iter->path: */
|
|
- goto out_no_locked;
|
|
+ goto out;
|
|
}
|
|
|
|
- k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
|
|
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
|
|
if (unlikely(!k.k))
|
|
- goto out_no_locked;
|
|
+ goto out;
|
|
|
|
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
|
|
(iter->flags & BTREE_ITER_filter_snapshots) &&
|
|
@@ -2834,8 +2835,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
if (iter->flags & BTREE_ITER_intent) {
|
|
struct btree_iter iter2;
|
|
|
|
- bch2_trans_copy_iter(&iter2, iter);
|
|
- k = bch2_btree_iter_peek_max(&iter2, end);
|
|
+ bch2_trans_copy_iter(trans, &iter2, iter);
|
|
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
|
|
|
|
if (k.k && !bkey_err(k)) {
|
|
swap(iter->key_cache_path, iter2.key_cache_path);
|
|
@@ -2846,15 +2847,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
} else {
|
|
struct bpos pos = iter->pos;
|
|
|
|
- k = bch2_btree_iter_peek_max(iter, end);
|
|
+ k = bch2_btree_iter_peek_max(trans, iter, end);
|
|
if (unlikely(bkey_err(k)))
|
|
- bch2_btree_iter_set_pos(iter, pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, pos);
|
|
else
|
|
iter->pos = pos;
|
|
}
|
|
|
|
if (unlikely(bkey_err(k)))
|
|
- goto out_no_locked;
|
|
+ goto out;
|
|
|
|
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
|
|
|
|
@@ -2876,42 +2877,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
}
|
|
}
|
|
out:
|
|
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
|
|
-out_no_locked:
|
|
bch2_btree_iter_verify_entry_exit(iter);
|
|
- bch2_btree_iter_verify(iter);
|
|
- ret = bch2_btree_iter_verify_ret(iter, k);
|
|
+ bch2_btree_iter_verify(trans, iter);
|
|
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
|
|
if (unlikely(ret))
|
|
return bkey_s_c_err(ret);
|
|
|
|
return k;
|
|
}
|
|
|
|
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
|
|
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- if (!bch2_btree_iter_advance(iter))
|
|
+ if (!bch2_btree_iter_advance(trans, iter))
|
|
return bkey_s_c_null;
|
|
|
|
- return bch2_btree_iter_peek_slot(iter);
|
|
+ return bch2_btree_iter_peek_slot(trans, iter);
|
|
}
|
|
|
|
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
|
|
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- if (!bch2_btree_iter_rewind(iter))
|
|
+ if (!bch2_btree_iter_rewind(trans, iter))
|
|
return bkey_s_c_null;
|
|
|
|
- return bch2_btree_iter_peek_slot(iter);
|
|
+ return bch2_btree_iter_peek_slot(trans, iter);
|
|
}
|
|
|
|
/* Obsolete, but still used by rust wrapper in -tools */
|
|
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
|
|
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
struct bkey_s_c k;
|
|
|
|
- while (btree_trans_too_many_iters(iter->trans) ||
|
|
- (k = bch2_btree_iter_peek_type(iter, iter->flags),
|
|
+ while (btree_trans_too_many_iters(trans) ||
|
|
+ (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
|
|
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
|
|
- bch2_trans_begin(iter->trans);
|
|
+ bch2_trans_begin(trans);
|
|
|
|
return k;
|
|
}
|
|
@@ -2944,7 +2943,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
|
|
struct btree_path *path, *prev = NULL;
|
|
struct trans_for_each_path_inorder_iter iter;
|
|
|
|
- if (!bch2_debug_check_iterators)
|
|
+ if (!static_branch_unlikely(&bch2_debug_check_iterators))
|
|
return;
|
|
|
|
trans_for_each_path_inorder(trans, path, iter) {
|
|
@@ -3057,7 +3056,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
|
|
iter->path = 0;
|
|
iter->update_path = 0;
|
|
iter->key_cache_path = 0;
|
|
- iter->trans = NULL;
|
|
}
|
|
|
|
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
|
|
@@ -3097,10 +3095,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
|
|
BUG_ON(iter->min_depth != depth);
|
|
}
|
|
|
|
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
|
|
+void bch2_trans_copy_iter(struct btree_trans *trans,
|
|
+ struct btree_iter *dst, struct btree_iter *src)
|
|
{
|
|
- struct btree_trans *trans = src->trans;
|
|
-
|
|
*dst = *src;
|
|
#ifdef TRACK_PATH_ALLOCATED
|
|
dst->ip_allocated = _RET_IP_;
|
|
@@ -3112,7 +3109,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
|
|
dst->key_cache_path = 0;
|
|
}
|
|
|
|
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
|
|
+ darray_trans_kmalloc_trace *trace)
|
|
+{
|
|
+ printbuf_tabstops_reset(out);
|
|
+ printbuf_tabstop_push(out, 60);
|
|
+
|
|
+ darray_for_each(*trace, i)
|
|
+ prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
|
|
+}
|
|
+#endif
|
|
+
|
|
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
unsigned new_top = trans->mem_top + size;
|
|
@@ -3122,14 +3131,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
|
|
void *new_mem;
|
|
void *p;
|
|
|
|
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
|
|
+ if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+#endif
|
|
+ }
|
|
|
|
ret = trans_maybe_inject_restart(trans, _RET_IP_);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
|
|
struct btree_transaction_stats *s = btree_trans_stats(trans);
|
|
- s->max_mem = max(s->max_mem, new_bytes);
|
|
+ if (new_bytes > s->max_mem) {
|
|
+ mutex_lock(&s->lock);
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
|
|
+ s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
|
|
+ trans->trans_kmalloc_trace.nr);
|
|
+
|
|
+ memcpy(s->trans_kmalloc_trace.data,
|
|
+ trans->trans_kmalloc_trace.data,
|
|
+ sizeof(s->trans_kmalloc_trace.data[0]) *
|
|
+ s->trans_kmalloc_trace.nr);
|
|
+#endif
|
|
+ s->max_mem = new_bytes;
|
|
+ mutex_unlock(&s->lock);
|
|
+ }
|
|
|
|
if (trans->used_mempool) {
|
|
if (trans->mem_bytes >= new_bytes)
|
|
@@ -3189,6 +3219,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
|
|
BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
|
|
}
|
|
out_change_top:
|
|
+ bch2_trans_kmalloc_trace(trans, size, ip);
|
|
+
|
|
p = trans->mem + trans->mem_top;
|
|
trans->mem_top += size;
|
|
memset(p, 0, size);
|
|
@@ -3248,7 +3280,6 @@ u32 bch2_trans_begin(struct btree_trans *trans)
|
|
|
|
trans->restart_count++;
|
|
trans->mem_top = 0;
|
|
- trans->journal_entries = NULL;
|
|
|
|
trans_for_each_path(trans, path, i) {
|
|
path->should_be_locked = false;
|
|
@@ -3302,6 +3333,10 @@ u32 bch2_trans_begin(struct btree_trans *trans)
|
|
}
|
|
#endif
|
|
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ trans->trans_kmalloc_trace.nr = 0;
|
|
+#endif
|
|
+
|
|
trans_set_locked(trans, false);
|
|
|
|
if (trans->restarted) {
|
|
@@ -3402,7 +3437,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
|
|
}
|
|
|
|
trans->nr_paths_max = s->nr_max_paths;
|
|
- trans->journal_entries_size = s->journal_entries_size;
|
|
}
|
|
|
|
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
|
|
@@ -3414,29 +3448,45 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
|
|
return trans;
|
|
}
|
|
|
|
-static void check_btree_paths_leaked(struct btree_trans *trans)
|
|
-{
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
- struct bch_fs *c = trans->c;
|
|
+
|
|
+static bool btree_paths_leaked(struct btree_trans *trans)
|
|
+{
|
|
struct btree_path *path;
|
|
unsigned i;
|
|
|
|
trans_for_each_path(trans, path, i)
|
|
if (path->ref)
|
|
- goto leaked;
|
|
- return;
|
|
-leaked:
|
|
- bch_err(c, "btree paths leaked from %s!", trans->fn);
|
|
- trans_for_each_path(trans, path, i)
|
|
- if (path->ref)
|
|
- printk(KERN_ERR " btree %s %pS\n",
|
|
- bch2_btree_id_str(path->btree_id),
|
|
- (void *) path->ip_allocated);
|
|
- /* Be noisy about this: */
|
|
- bch2_fatal_error(c);
|
|
-#endif
|
|
+ return true;
|
|
+ return false;
|
|
}
|
|
|
|
+static void check_btree_paths_leaked(struct btree_trans *trans)
|
|
+{
|
|
+ if (btree_paths_leaked(trans)) {
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_path *path;
|
|
+ unsigned i;
|
|
+
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
|
|
+ trans_for_each_path(trans, path, i)
|
|
+ if (path->ref)
|
|
+ prt_printf(&buf, "btree %s %pS\n",
|
|
+ bch2_btree_id_str(path->btree_id),
|
|
+ (void *) path->ip_allocated);
|
|
+
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+}
|
|
+#else
|
|
+static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
|
|
+#endif
|
|
+
|
|
void bch2_trans_put(struct btree_trans *trans)
|
|
__releases(&c->btree_trans_barrier)
|
|
{
|
|
@@ -3471,6 +3521,9 @@ void bch2_trans_put(struct btree_trans *trans)
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
darray_exit(&trans->last_restarted_trace);
|
|
#endif
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ darray_exit(&trans->trans_kmalloc_trace);
|
|
+#endif
|
|
|
|
unsigned long *paths_allocated = trans->paths_allocated;
|
|
trans->paths_allocated = NULL;
|
|
@@ -3625,6 +3678,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
|
|
for (s = c->btree_transaction_stats;
|
|
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
|
|
s++) {
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ darray_exit(&s->trans_kmalloc_trace);
|
|
+#endif
|
|
kfree(s->max_paths_text);
|
|
bch2_time_stats_exit(&s->lock_hold_times);
|
|
}
|
|
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
|
|
index b96157f3dc9c..cafd35a5e7a3 100644
|
|
--- a/fs/bcachefs/btree_iter.h
|
|
+++ b/fs/bcachefs/btree_iter.h
|
|
@@ -9,7 +9,6 @@
|
|
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
|
|
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
|
|
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
|
|
-void bch2_dump_trans_updates(struct btree_trans *);
|
|
void bch2_dump_trans_paths_updates(struct btree_trans *);
|
|
|
|
static inline int __bkey_err(const struct bkey *k)
|
|
@@ -286,14 +285,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex
|
|
: __bch2_trans_mutex_lock(trans, lock);
|
|
}
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-void bch2_trans_verify_paths(struct btree_trans *);
|
|
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
|
|
-#else
|
|
-static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
|
|
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
|
|
- struct bpos pos) {}
|
|
-#endif
|
|
+/* Debug: */
|
|
+
|
|
+void __bch2_trans_verify_paths(struct btree_trans *);
|
|
+void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
|
|
+
|
|
+static inline void bch2_trans_verify_paths(struct btree_trans *trans)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
+ __bch2_trans_verify_paths(trans);
|
|
+}
|
|
+
|
|
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
|
|
+ struct bpos pos)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
|
|
+ __bch2_assert_pos_locked(trans, btree, pos);
|
|
+}
|
|
|
|
void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
|
|
struct btree *, struct bkey_packed *);
|
|
@@ -335,13 +343,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra
|
|
}
|
|
|
|
__always_inline
|
|
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
|
|
+static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
|
|
{
|
|
BUG_ON(err <= 0);
|
|
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
|
|
|
|
trans->restarted = err;
|
|
trans->last_restarted_ip = ip;
|
|
+ return -err;
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
|
|
+{
|
|
+ btree_trans_restart_foreign_task(trans, err, ip);
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
darray_exit(&trans->last_restarted_trace);
|
|
bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
|
|
@@ -387,36 +402,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct
|
|
void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
|
|
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
|
|
|
|
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
|
|
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
|
|
+int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
|
|
+int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
|
|
|
|
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
|
|
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
|
|
-struct btree *bch2_btree_iter_next_node(struct btree_iter *);
|
|
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
|
|
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
|
|
+struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
|
|
|
|
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
|
|
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
|
|
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
|
|
|
|
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
|
|
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
|
|
+ struct btree_iter *iter)
|
|
{
|
|
- return bch2_btree_iter_peek_max(iter, SPOS_MAX);
|
|
+ return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
|
|
}
|
|
|
|
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
|
|
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
|
|
|
|
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
|
|
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
|
|
{
|
|
- return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
|
|
+ return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
|
|
}
|
|
|
|
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
|
|
|
|
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
|
|
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
|
|
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
|
|
|
|
-bool bch2_btree_iter_advance(struct btree_iter *);
|
|
-bool bch2_btree_iter_rewind(struct btree_iter *);
|
|
+bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
|
|
+bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
|
|
|
|
static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
|
|
{
|
|
@@ -427,10 +443,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
|
|
iter->k.size = 0;
|
|
}
|
|
|
|
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
|
|
+static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
|
|
+ struct btree_iter *iter, struct bpos new_pos)
|
|
{
|
|
- struct btree_trans *trans = iter->trans;
|
|
-
|
|
if (unlikely(iter->update_path))
|
|
bch2_path_put(trans, iter->update_path,
|
|
iter->flags & BTREE_ITER_intent);
|
|
@@ -448,13 +463,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
|
|
iter->pos = bkey_start_pos(&iter->k);
|
|
}
|
|
|
|
-static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
|
|
+static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
|
|
+ struct btree_iter *iter, u32 snapshot)
|
|
{
|
|
struct bpos pos = iter->pos;
|
|
|
|
iter->snapshot = snapshot;
|
|
pos.snapshot = snapshot;
|
|
- bch2_btree_iter_set_pos(iter, pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, pos);
|
|
}
|
|
|
|
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
|
|
@@ -496,7 +512,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
|
|
unsigned flags,
|
|
unsigned long ip)
|
|
{
|
|
- iter->trans = trans;
|
|
iter->update_path = 0;
|
|
iter->key_cache_path = 0;
|
|
iter->btree_id = btree_id;
|
|
@@ -533,47 +548,77 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
|
|
void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
|
|
enum btree_id, struct bpos,
|
|
unsigned, unsigned, unsigned);
|
|
-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
|
|
+void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
|
|
|
|
-void bch2_set_btree_iter_dontneed(struct btree_iter *);
|
|
+void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
|
|
|
|
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
|
|
+ darray_trans_kmalloc_trace *);
|
|
+#endif
|
|
|
|
-/**
|
|
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
|
|
- *
|
|
- * Must be called after bch2_trans_begin, which on second and further calls
|
|
- * frees all memory allocated in this transaction
|
|
- */
|
|
-static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
|
|
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
|
|
+
|
|
+static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
|
|
+ unsigned long ip)
|
|
+{
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ darray_push(&trans->trans_kmalloc_trace,
|
|
+ ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
|
|
+#endif
|
|
+}
|
|
+
|
|
+static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
|
|
+ unsigned long ip)
|
|
{
|
|
size = roundup(size, 8);
|
|
|
|
+ bch2_trans_kmalloc_trace(trans, size, ip);
|
|
+
|
|
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
|
|
void *p = trans->mem + trans->mem_top;
|
|
|
|
trans->mem_top += size;
|
|
- memset(p, 0, size);
|
|
return p;
|
|
} else {
|
|
- return __bch2_trans_kmalloc(trans, size);
|
|
+ return __bch2_trans_kmalloc(trans, size, ip);
|
|
}
|
|
}
|
|
|
|
-static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
|
|
+static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
|
|
+ unsigned long ip)
|
|
{
|
|
- size = round_up(size, 8);
|
|
+ size = roundup(size, 8);
|
|
+
|
|
+ bch2_trans_kmalloc_trace(trans, size, ip);
|
|
|
|
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
|
|
void *p = trans->mem + trans->mem_top;
|
|
|
|
trans->mem_top += size;
|
|
+ memset(p, 0, size);
|
|
return p;
|
|
} else {
|
|
- return __bch2_trans_kmalloc(trans, size);
|
|
+ return __bch2_trans_kmalloc(trans, size, ip);
|
|
}
|
|
}
|
|
|
|
+/**
|
|
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
|
|
+ *
|
|
+ * Must be called after bch2_trans_begin, which on second and further calls
|
|
+ * frees all memory allocated in this transaction
|
|
+ */
|
|
+static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
|
|
+{
|
|
+ return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
|
|
+}
|
|
+
|
|
+static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
|
|
+{
|
|
+ return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
|
|
+}
|
|
+
|
|
static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
unsigned btree_id, struct bpos pos,
|
|
@@ -582,7 +627,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
|
|
struct bkey_s_c k;
|
|
|
|
bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
|
|
- k = bch2_btree_iter_peek_slot(iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, iter);
|
|
|
|
if (!bkey_err(k) && type && k.k->type != type)
|
|
k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
|
|
@@ -652,14 +697,14 @@ u32 bch2_trans_begin(struct btree_trans *);
|
|
int _ret3 = 0; \
|
|
do { \
|
|
_ret3 = lockrestart_do((_trans), ({ \
|
|
- struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
|
|
+ struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
|
|
if (!_b) \
|
|
break; \
|
|
\
|
|
PTR_ERR_OR_ZERO(_b) ?: (_do); \
|
|
})) ?: \
|
|
lockrestart_do((_trans), \
|
|
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
|
|
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
|
|
} while (!_ret3); \
|
|
\
|
|
bch2_trans_iter_exit((_trans), &(_iter)); \
|
|
@@ -671,31 +716,34 @@ u32 bch2_trans_begin(struct btree_trans *);
|
|
__for_each_btree_node(_trans, _iter, _btree_id, _start, \
|
|
0, 0, _flags, _b, _do)
|
|
|
|
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
|
|
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
unsigned flags)
|
|
{
|
|
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
|
|
- bch2_btree_iter_peek_prev(iter);
|
|
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
|
|
+ bch2_btree_iter_peek_prev(trans, iter);
|
|
}
|
|
|
|
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
|
|
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
unsigned flags)
|
|
{
|
|
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
|
|
- bch2_btree_iter_peek(iter);
|
|
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
|
|
+ bch2_btree_iter_peek(trans, iter);
|
|
}
|
|
|
|
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
|
|
- struct bpos end,
|
|
- unsigned flags)
|
|
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bpos end,
|
|
+ unsigned flags)
|
|
{
|
|
if (!(flags & BTREE_ITER_slots))
|
|
- return bch2_btree_iter_peek_max(iter, end);
|
|
+ return bch2_btree_iter_peek_max(trans, iter, end);
|
|
|
|
if (bkey_gt(iter->pos, end))
|
|
return bkey_s_c_null;
|
|
|
|
- return bch2_btree_iter_peek_slot(iter);
|
|
+ return bch2_btree_iter_peek_slot(trans, iter);
|
|
}
|
|
|
|
int __bch2_btree_trans_too_many_iters(struct btree_trans *);
|
|
@@ -762,14 +810,14 @@ transaction_restart: \
|
|
\
|
|
do { \
|
|
_ret3 = lockrestart_do(_trans, ({ \
|
|
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
|
|
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \
|
|
_end, (_flags)); \
|
|
if (!(_k).k) \
|
|
break; \
|
|
\
|
|
bkey_err(_k) ?: (_do); \
|
|
})); \
|
|
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
|
|
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
|
|
\
|
|
bch2_trans_iter_exit((_trans), &(_iter)); \
|
|
_ret3; \
|
|
@@ -807,14 +855,14 @@ transaction_restart: \
|
|
\
|
|
do { \
|
|
_ret3 = lockrestart_do(_trans, ({ \
|
|
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
|
|
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
|
|
(_flags)); \
|
|
if (!(_k).k) \
|
|
break; \
|
|
\
|
|
bkey_err(_k) ?: (_do); \
|
|
})); \
|
|
- } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
|
|
+ } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \
|
|
\
|
|
bch2_trans_iter_exit((_trans), &(_iter)); \
|
|
_ret3; \
|
|
@@ -844,37 +892,38 @@ transaction_restart: \
|
|
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
|
|
(_journal_seq), (_commit_flags)))
|
|
|
|
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
|
|
+ struct btree_iter *);
|
|
|
|
#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
|
|
_start, _end, _flags, _k, _ret) \
|
|
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
|
|
(_start), (_flags)); \
|
|
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
|
|
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
|
|
!((_ret) = bkey_err(_k)) && (_k).k; \
|
|
- bch2_btree_iter_advance(&(_iter)))
|
|
+ bch2_btree_iter_advance(_trans, &(_iter)))
|
|
|
|
-#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
|
|
+#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
|
|
for (; \
|
|
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
|
|
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \
|
|
!((_ret) = bkey_err(_k)) && (_k).k; \
|
|
- bch2_btree_iter_advance(&(_iter)))
|
|
+ bch2_btree_iter_advance(_trans, &(_iter)))
|
|
|
|
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
|
|
_start, _flags, _k, _ret) \
|
|
for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
|
|
SPOS_MAX, _flags, _k, _ret)
|
|
|
|
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
|
|
- _start, _flags, _k, _ret) \
|
|
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
|
|
- (_start), (_flags)); \
|
|
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
|
|
- !((_ret) = bkey_err(_k)) && (_k).k; \
|
|
- bch2_btree_iter_rewind(&(_iter)))
|
|
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
|
|
+ _start, _flags, _k, _ret) \
|
|
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
|
|
+ (_start), (_flags)); \
|
|
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \
|
|
+ !((_ret) = bkey_err(_k)) && (_k).k; \
|
|
+ bch2_btree_iter_rewind(_trans, &(_iter)))
|
|
|
|
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
|
|
- for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
|
|
+#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
|
|
+ for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
|
|
|
|
/*
|
|
* This should not be used in a fastpath, without first trying _do in
|
|
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
|
|
index 6d25e3f85ce8..ade3b5addd75 100644
|
|
--- a/fs/bcachefs/btree_journal_iter.c
|
|
+++ b/fs/bcachefs/btree_journal_iter.c
|
|
@@ -288,7 +288,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
|
|
.size = max_t(size_t, keys->size, 8) * 2,
|
|
};
|
|
|
|
- new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
|
|
+ new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);
|
|
if (!new_keys.data) {
|
|
bch_err(c, "%s: error allocating new key array (size %zu)",
|
|
__func__, new_keys.size);
|
|
@@ -687,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c)
|
|
|
|
static void __journal_keys_sort(struct journal_keys *keys)
|
|
{
|
|
- sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
|
|
+ sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
|
|
+ journal_sort_key_cmp, NULL);
|
|
|
|
cond_resched();
|
|
|
|
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
|
|
index edce59433375..741329f1400a 100644
|
|
--- a/fs/bcachefs/btree_key_cache.c
|
|
+++ b/fs/bcachefs/btree_key_cache.c
|
|
@@ -13,6 +13,7 @@
|
|
#include "trace.h"
|
|
|
|
#include <linux/sched/mm.h>
|
|
+#include <linux/seq_buf.h>
|
|
|
|
static inline bool btree_uses_pcpu_readers(enum btree_id id)
|
|
{
|
|
@@ -101,8 +102,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu
|
|
kmem_cache_free(bch2_key_cache, ck);
|
|
}
|
|
|
|
-static void bkey_cached_free(struct btree_key_cache *bc,
|
|
- struct bkey_cached *ck)
|
|
+static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
|
|
+ struct bkey_cached *ck)
|
|
{
|
|
kfree(ck->k);
|
|
ck->k = NULL;
|
|
@@ -116,6 +117,19 @@ static void bkey_cached_free(struct btree_key_cache *bc,
|
|
this_cpu_inc(*bc->nr_pending);
|
|
}
|
|
|
|
+static void bkey_cached_free(struct btree_trans *trans,
|
|
+ struct btree_key_cache *bc,
|
|
+ struct bkey_cached *ck)
|
|
+{
|
|
+ /*
|
|
+ * we'll hit strange issues in the SRCU code if we aren't holding an
|
|
+ * SRCU read lock...
|
|
+ */
|
|
+ EBUG_ON(!trans->srcu_held);
|
|
+
|
|
+ bkey_cached_free_noassert(bc, ck);
|
|
+}
|
|
+
|
|
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
|
|
{
|
|
gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
|
|
@@ -281,16 +295,31 @@ static int btree_key_cache_create(struct btree_trans *trans,
|
|
ck_path->uptodate = BTREE_ITER_UPTODATE;
|
|
return 0;
|
|
err:
|
|
- bkey_cached_free(bc, ck);
|
|
+ bkey_cached_free(trans, bc, ck);
|
|
mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
|
|
|
|
return ret;
|
|
}
|
|
|
|
+static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
|
|
+ struct btree_path *ck_path,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_bpos_to_text(&buf, ck_path->pos);
|
|
+ prt_char(&buf, ' ');
|
|
+ bch2_bkey_val_to_text(&buf, trans->c, k);
|
|
+ trace_key_cache_fill(trans, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+}
|
|
+
|
|
static noinline int btree_key_cache_fill(struct btree_trans *trans,
|
|
- struct btree_path *ck_path,
|
|
+ btree_path_idx_t ck_path_idx,
|
|
unsigned flags)
|
|
{
|
|
+ struct btree_path *ck_path = trans->paths + ck_path_idx;
|
|
+
|
|
if (flags & BTREE_ITER_cached_nofill) {
|
|
ck_path->l[0].b = NULL;
|
|
return 0;
|
|
@@ -306,12 +335,13 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
|
|
BTREE_ITER_key_cache_fill|
|
|
BTREE_ITER_cached_nofill);
|
|
iter.flags &= ~BTREE_ITER_with_journal;
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
/* Recheck after btree lookup, before allocating: */
|
|
+ ck_path = trans->paths + ck_path_idx;
|
|
ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
|
|
if (unlikely(ret))
|
|
goto out;
|
|
@@ -320,28 +350,22 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (trace_key_cache_fill_enabled()) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
-
|
|
- bch2_bpos_to_text(&buf, ck_path->pos);
|
|
- prt_char(&buf, ' ');
|
|
- bch2_bkey_val_to_text(&buf, trans->c, k);
|
|
- trace_key_cache_fill(trans, buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
+ if (trace_key_cache_fill_enabled())
|
|
+ do_trace_key_cache_fill(trans, ck_path, k);
|
|
out:
|
|
/* We're not likely to need this iterator again: */
|
|
- bch2_set_btree_iter_dontneed(&iter);
|
|
+ bch2_set_btree_iter_dontneed(trans, &iter);
|
|
err:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
|
|
- struct btree_path *path)
|
|
+ btree_path_idx_t path_idx)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct bkey_cached *ck;
|
|
+ struct btree_path *path = trans->paths + path_idx;
|
|
retry:
|
|
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
|
|
if (!ck)
|
|
@@ -367,27 +391,32 @@ static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
|
|
+int bch2_btree_path_traverse_cached(struct btree_trans *trans,
|
|
+ btree_path_idx_t path_idx,
|
|
unsigned flags)
|
|
{
|
|
- EBUG_ON(path->level);
|
|
-
|
|
- path->l[1].b = NULL;
|
|
+ EBUG_ON(trans->paths[path_idx].level);
|
|
|
|
int ret;
|
|
do {
|
|
- ret = btree_path_traverse_cached_fast(trans, path);
|
|
+ ret = btree_path_traverse_cached_fast(trans, path_idx);
|
|
if (unlikely(ret == -ENOENT))
|
|
- ret = btree_key_cache_fill(trans, path, flags);
|
|
+ ret = btree_key_cache_fill(trans, path_idx, flags);
|
|
} while (ret == -EEXIST);
|
|
|
|
+ struct btree_path *path = trans->paths + path_idx;
|
|
+
|
|
if (unlikely(ret)) {
|
|
path->uptodate = BTREE_ITER_NEED_TRAVERSE;
|
|
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
|
|
btree_node_unlock(trans, path, 0);
|
|
path->l[0].b = ERR_PTR(ret);
|
|
}
|
|
+ } else {
|
|
+ BUG_ON(path->uptodate);
|
|
+ BUG_ON(!path->nodes_locked);
|
|
}
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
@@ -412,7 +441,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
|
|
BTREE_ITER_intent);
|
|
b_iter.flags &= ~BTREE_ITER_with_key_cache;
|
|
|
|
- ret = bch2_btree_iter_traverse(&c_iter);
|
|
+ ret = bch2_btree_iter_traverse(trans, &c_iter);
|
|
if (ret)
|
|
goto out;
|
|
|
|
@@ -444,7 +473,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
|
|
!test_bit(JOURNAL_space_low, &c->journal.flags))
|
|
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
|
|
|
|
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
|
|
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
|
|
ret = bkey_err(btree_k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -496,7 +525,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
|
|
|
|
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
|
|
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
|
|
- bkey_cached_free(&c->btree_key_cache, ck);
|
|
+ bkey_cached_free(trans, &c->btree_key_cache, ck);
|
|
} else {
|
|
six_unlock_write(&ck->c.lock);
|
|
six_unlock_intent(&ck->c.lock);
|
|
@@ -610,7 +639,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
|
|
}
|
|
|
|
bkey_cached_evict(bc, ck);
|
|
- bkey_cached_free(bc, ck);
|
|
+ bkey_cached_free(trans, bc, ck);
|
|
|
|
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
|
|
|
|
@@ -678,7 +707,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
|
|
} else if (!bkey_cached_lock_for_evict(ck)) {
|
|
bc->skipped_lock_fail++;
|
|
} else if (bkey_cached_evict(bc, ck)) {
|
|
- bkey_cached_free(bc, ck);
|
|
+ bkey_cached_free_noassert(bc, ck);
|
|
bc->freed++;
|
|
freed++;
|
|
} else {
|
|
@@ -784,6 +813,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
|
|
{
|
|
}
|
|
|
|
+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
|
|
+{
|
|
+ struct bch_fs *c = shrink->private_data;
|
|
+ struct btree_key_cache *bc = &c->btree_key_cache;
|
|
+ char *cbuf;
|
|
+ size_t buflen = seq_buf_get_buf(s, &cbuf);
|
|
+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
|
|
+
|
|
+ bch2_btree_key_cache_to_text(&out, bc);
|
|
+ seq_buf_commit(s, out.pos);
|
|
+}
|
|
+
|
|
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
|
|
{
|
|
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
|
@@ -808,6 +849,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
|
|
bc->shrink = shrink;
|
|
shrink->count_objects = bch2_btree_key_cache_count;
|
|
shrink->scan_objects = bch2_btree_key_cache_scan;
|
|
+ shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
|
|
shrink->batch = 1 << 14;
|
|
shrink->seeks = 0;
|
|
shrink->private_data = c;
|
|
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
|
|
index 51d6289b8dee..82d8c72512a9 100644
|
|
--- a/fs/bcachefs/btree_key_cache.h
|
|
+++ b/fs/bcachefs/btree_key_cache.h
|
|
@@ -40,8 +40,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *,
|
|
struct bkey_cached *
|
|
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
|
|
|
|
-int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
|
|
- unsigned);
|
|
+int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned);
|
|
|
|
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
|
|
struct btree_insert_entry *);
|
|
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
|
|
index caef65adeae4..59a366fdd24c 100644
|
|
--- a/fs/bcachefs/btree_locking.c
|
|
+++ b/fs/bcachefs/btree_locking.c
|
|
@@ -1,6 +1,7 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include "bcachefs.h"
|
|
+#include "btree_cache.h"
|
|
#include "btree_locking.h"
|
|
#include "btree_types.h"
|
|
|
|
@@ -91,10 +92,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
|
|
struct trans_waiting_for_lock *i;
|
|
|
|
for (i = g->g; i != g->g + g->nr; i++) {
|
|
- struct task_struct *task = i->trans->locking_wait.task;
|
|
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
|
|
if (i != g->g)
|
|
prt_str(out, "<- ");
|
|
- prt_printf(out, "%u ", task ?task->pid : 0);
|
|
+ prt_printf(out, "%u ", task ? task->pid : 0);
|
|
}
|
|
prt_newline(out);
|
|
}
|
|
@@ -172,7 +173,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
|
|
{
|
|
if (i == g->g) {
|
|
trace_would_deadlock(g, i->trans);
|
|
- return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
|
|
+ return btree_trans_restart_foreign_task(i->trans,
|
|
+ BCH_ERR_transaction_restart_would_deadlock,
|
|
+ _THIS_IP_);
|
|
} else {
|
|
i->trans->lock_must_abort = true;
|
|
wake_up_process(i->trans->locking_wait.task);
|
|
@@ -234,7 +237,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
|
|
prt_newline(&buf);
|
|
}
|
|
|
|
- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
|
|
+ bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf);
|
|
printbuf_exit(&buf);
|
|
BUG();
|
|
}
|
|
@@ -616,22 +619,23 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
|
|
unsigned new_locks_want,
|
|
struct get_locks_fail *f)
|
|
{
|
|
- EBUG_ON(path->locks_want >= new_locks_want);
|
|
-
|
|
- path->locks_want = new_locks_want;
|
|
+ path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
|
|
|
|
bool ret = btree_path_get_locks(trans, path, true, f);
|
|
bch2_trans_verify_locks(trans);
|
|
return ret;
|
|
}
|
|
|
|
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
|
|
- struct btree_path *path,
|
|
- unsigned new_locks_want,
|
|
- struct get_locks_fail *f)
|
|
+int __bch2_btree_path_upgrade(struct btree_trans *trans,
|
|
+ struct btree_path *path,
|
|
+ unsigned new_locks_want)
|
|
{
|
|
- bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
|
|
- if (ret)
|
|
+ struct get_locks_fail f = {};
|
|
+ unsigned old_locks = path->nodes_locked;
|
|
+ unsigned old_locks_want = path->locks_want;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, &f))
|
|
goto out;
|
|
|
|
/*
|
|
@@ -666,6 +670,28 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
|
|
btree_path_get_locks(trans, linked, true, NULL);
|
|
}
|
|
}
|
|
+
|
|
+ count_event(trans->c, trans_restart_upgrade);
|
|
+ if (trace_trans_restart_upgrade_enabled()) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
|
|
+ prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
|
|
+ bch2_bpos_to_text(&buf, path->pos);
|
|
+ prt_printf(&buf, "locks want %u -> %u level %u\n",
|
|
+ old_locks_want, new_locks_want, f.l);
|
|
+ prt_printf(&buf, "nodes_locked %x -> %x\n",
|
|
+ old_locks, path->nodes_locked);
|
|
+ prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
|
|
+ !f.b ? "(null)" : "(node)");
|
|
+ prt_printf(&buf, "path seq %u node seq %u\n",
|
|
+ IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
|
|
+ path->l[f.l].lock_seq);
|
|
+
|
|
+ trace_trans_restart_upgrade(trans->c, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
|
|
out:
|
|
bch2_trans_verify_locks(trans);
|
|
return ret;
|
|
@@ -736,7 +762,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bpos_to_text(&buf, path->pos);
|
|
- prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
|
|
+ prt_printf(&buf, " %s l=%u seq=%u node seq=",
|
|
+ bch2_btree_id_str(path->btree_id),
|
|
+ f->l, path->l[f->l].lock_seq);
|
|
if (IS_ERR_OR_NULL(f->b)) {
|
|
prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
|
|
} else {
|
|
@@ -797,13 +825,6 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
|
|
return __bch2_trans_relock(trans, false);
|
|
}
|
|
|
|
-void bch2_trans_unlock_noassert(struct btree_trans *trans)
|
|
-{
|
|
- __bch2_trans_unlock(trans);
|
|
-
|
|
- trans_set_unlocked(trans);
|
|
-}
|
|
-
|
|
void bch2_trans_unlock(struct btree_trans *trans)
|
|
{
|
|
__bch2_trans_unlock(trans);
|
|
@@ -840,9 +861,7 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
|
|
|
|
/* Debug */
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-
|
|
-void bch2_btree_path_verify_locks(struct btree_path *path)
|
|
+void __bch2_btree_path_verify_locks(struct btree_path *path)
|
|
{
|
|
/*
|
|
* A path may be uptodate and yet have nothing locked if and only if
|
|
@@ -883,7 +902,7 @@ static bool bch2_trans_locked(struct btree_trans *trans)
|
|
return false;
|
|
}
|
|
|
|
-void bch2_trans_verify_locks(struct btree_trans *trans)
|
|
+void __bch2_trans_verify_locks(struct btree_trans *trans)
|
|
{
|
|
if (!trans->locked) {
|
|
BUG_ON(bch2_trans_locked(trans));
|
|
@@ -894,7 +913,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans)
|
|
unsigned i;
|
|
|
|
trans_for_each_path(trans, path, i)
|
|
- bch2_btree_path_verify_locks(path);
|
|
+ __bch2_btree_path_verify_locks(path);
|
|
}
|
|
-
|
|
-#endif
|
|
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
|
|
index b33ab7af8440..1bb28e21d021 100644
|
|
--- a/fs/bcachefs/btree_locking.h
|
|
+++ b/fs/bcachefs/btree_locking.h
|
|
@@ -15,7 +15,6 @@
|
|
|
|
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
|
|
|
|
-void bch2_trans_unlock_noassert(struct btree_trans *);
|
|
void bch2_trans_unlock_write(struct btree_trans *);
|
|
|
|
static inline bool is_btree_node(struct btree_path *path, unsigned l)
|
|
@@ -381,27 +380,18 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
|
|
struct btree_path *, unsigned,
|
|
struct get_locks_fail *);
|
|
|
|
-bool __bch2_btree_path_upgrade(struct btree_trans *,
|
|
- struct btree_path *, unsigned,
|
|
- struct get_locks_fail *);
|
|
+int __bch2_btree_path_upgrade(struct btree_trans *,
|
|
+ struct btree_path *, unsigned);
|
|
|
|
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
|
|
struct btree_path *path,
|
|
unsigned new_locks_want)
|
|
{
|
|
- struct get_locks_fail f = {};
|
|
- unsigned old_locks_want = path->locks_want;
|
|
-
|
|
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
|
|
|
|
- if (path->locks_want < new_locks_want
|
|
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
|
|
- : path->nodes_locked)
|
|
- return 0;
|
|
-
|
|
- trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
|
|
- old_locks_want, new_locks_want, &f);
|
|
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
|
|
+ return likely(path->locks_want >= new_locks_want && path->nodes_locked)
|
|
+ ? 0
|
|
+ : __bch2_btree_path_upgrade(trans, path, new_locks_want);
|
|
}
|
|
|
|
/* misc: */
|
|
@@ -439,12 +429,19 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
|
|
|
|
int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
|
|
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-void bch2_btree_path_verify_locks(struct btree_path *);
|
|
-void bch2_trans_verify_locks(struct btree_trans *);
|
|
-#else
|
|
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
|
|
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
|
|
-#endif
|
|
+void __bch2_btree_path_verify_locks(struct btree_path *);
|
|
+void __bch2_trans_verify_locks(struct btree_trans *);
|
|
+
|
|
+static inline void bch2_btree_path_verify_locks(struct btree_path *path)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
|
|
+ __bch2_btree_path_verify_locks(path);
|
|
+}
|
|
+
|
|
+static inline void bch2_trans_verify_locks(struct btree_trans *trans)
|
|
+{
|
|
+ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
|
|
+ __bch2_trans_verify_locks(trans);
|
|
+}
|
|
|
|
#endif /* _BCACHEFS_BTREE_LOCKING_H */
|
|
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
|
|
index a7f06deee13c..5a97a6b8a757 100644
|
|
--- a/fs/bcachefs/btree_node_scan.c
|
|
+++ b/fs/bcachefs/btree_node_scan.c
|
|
@@ -13,6 +13,7 @@
|
|
|
|
#include <linux/kthread.h>
|
|
#include <linux/min_heap.h>
|
|
+#include <linux/sched/sysctl.h>
|
|
#include <linux/sort.h>
|
|
|
|
struct find_btree_nodes_worker {
|
|
@@ -166,17 +167,23 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
|
|
bio->bi_iter.bi_sector = offset;
|
|
bch2_bio_map(bio, bn, PAGE_SIZE);
|
|
|
|
+ u64 submit_time = local_clock();
|
|
submit_bio_wait(bio);
|
|
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
|
|
- "IO error in try_read_btree_node() at %llu: %s",
|
|
- offset, bch2_blk_status_to_str(bio->bi_status)))
|
|
+
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
|
|
+
|
|
+ if (bio->bi_status) {
|
|
+ bch_err_dev_ratelimited(ca,
|
|
+ "IO error in try_read_btree_node() at %llu: %s",
|
|
+ offset, bch2_blk_status_to_str(bio->bi_status));
|
|
return;
|
|
+ }
|
|
|
|
if (le64_to_cpu(bn->magic) != bset_magic(c))
|
|
return;
|
|
|
|
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
|
|
- if (!c->chacha20)
|
|
+ if (!c->chacha20_key_set)
|
|
return;
|
|
|
|
struct nonce nonce = btree_nonce(&bn->keys, 0);
|
|
@@ -264,7 +271,7 @@ static int read_btree_nodes_worker(void *p)
|
|
err:
|
|
bio_put(bio);
|
|
free_page((unsigned long) buf);
|
|
- percpu_ref_get(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
|
|
closure_put(w->cl);
|
|
kfree(w);
|
|
return 0;
|
|
@@ -278,37 +285,37 @@ static int read_btree_nodes(struct find_btree_nodes *f)
|
|
|
|
closure_init_stack(&cl);
|
|
|
|
- for_each_online_member(c, ca) {
|
|
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
|
|
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
|
|
continue;
|
|
|
|
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
|
|
- struct task_struct *t;
|
|
-
|
|
if (!w) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
|
|
ret = -ENOMEM;
|
|
goto err;
|
|
}
|
|
|
|
- percpu_ref_get(&ca->io_ref);
|
|
- closure_get(&cl);
|
|
w->cl = &cl;
|
|
w->f = f;
|
|
w->ca = ca;
|
|
|
|
- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
|
|
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
|
|
ret = PTR_ERR_OR_ZERO(t);
|
|
if (ret) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- closure_put(&cl);
|
|
- f->ret = ret;
|
|
- bch_err(c, "error starting kthread: %i", ret);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
|
|
+ kfree(w);
|
|
+ bch_err_msg(c, ret, "starting kthread");
|
|
break;
|
|
}
|
|
+
|
|
+ closure_get(&cl);
|
|
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
|
|
+ wake_up_process(t);
|
|
}
|
|
err:
|
|
- closure_sync(&cl);
|
|
+ while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
|
|
+ ;
|
|
return f->ret ?: ret;
|
|
}
|
|
|
|
@@ -388,10 +395,10 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
|
|
printbuf_reset(&buf);
|
|
prt_printf(&buf, "%s: nodes found:\n", __func__);
|
|
found_btree_nodes_to_text(&buf, c, f->nodes);
|
|
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
|
|
+ bch2_print_str(c, KERN_INFO, buf.buf);
|
|
}
|
|
|
|
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
|
|
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
|
|
|
|
dst = 0;
|
|
darray_for_each(f->nodes, i) {
|
|
@@ -411,13 +418,13 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
|
|
}
|
|
f->nodes.nr = dst;
|
|
|
|
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
|
|
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
|
|
|
|
if (0 && c->opts.verbose) {
|
|
printbuf_reset(&buf);
|
|
prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
|
|
found_btree_nodes_to_text(&buf, c, f->nodes);
|
|
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
|
|
+ bch2_print_str(c, KERN_INFO, buf.buf);
|
|
}
|
|
|
|
swap(nodes_heap, f->nodes);
|
|
@@ -463,7 +470,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
|
|
printbuf_reset(&buf);
|
|
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
|
|
found_btree_nodes_to_text(&buf, c, f->nodes);
|
|
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
|
|
+ bch2_print_str(c, KERN_INFO, buf.buf);
|
|
} else {
|
|
bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
|
|
}
|
|
@@ -534,7 +541,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
|
|
|
|
struct find_btree_nodes *f = &c->found_btree_nodes;
|
|
|
|
- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
|
|
+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -572,10 +579,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
|
|
|
|
found_btree_node_to_key(&tmp.k, &n);
|
|
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
|
|
- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
|
|
- printbuf_exit(&buf);
|
|
+ if (c->opts.verbose) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
|
|
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
|
|
BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
|
|
(struct bkey_validate_context) {
|
|
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
|
|
index 2811b6857c97..422d49a5c57c 100644
|
|
--- a/fs/bcachefs/btree_node_scan_types.h
|
|
+++ b/fs/bcachefs/btree_node_scan_types.h
|
|
@@ -2,7 +2,7 @@
|
|
#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
|
|
#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
|
|
|
|
-#include "darray.h"
|
|
+#include <linux/darray.h>
|
|
|
|
struct found_btree_node {
|
|
bool range_updated:1;
|
|
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
|
|
index c4f524b2ca9a..1c03c965d836 100644
|
|
--- a/fs/bcachefs/btree_trans_commit.c
|
|
+++ b/fs/bcachefs/btree_trans_commit.c
|
|
@@ -11,6 +11,7 @@
|
|
#include "btree_write_buffer.h"
|
|
#include "buckets.h"
|
|
#include "disk_accounting.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "errcode.h"
|
|
#include "error.h"
|
|
#include "journal.h"
|
|
@@ -20,6 +21,7 @@
|
|
#include "snapshot.h"
|
|
|
|
#include <linux/prefetch.h>
|
|
+#include <linux/string_helpers.h>
|
|
|
|
static const char * const trans_commit_flags_strs[] = {
|
|
#define x(n, ...) #n,
|
|
@@ -164,6 +166,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
|
|
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
|
|
EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
|
|
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
|
|
+ kmsan_check_memory(insert, bkey_bytes(&insert->k));
|
|
|
|
k = bch2_btree_node_iter_peek_all(node_iter, b);
|
|
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
|
|
@@ -336,6 +339,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
|
|
BUG_ON(i->cached != path->cached);
|
|
BUG_ON(i->level != path->level);
|
|
BUG_ON(i->btree_id != path->btree_id);
|
|
+ BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
|
|
EBUG_ON(!i->level &&
|
|
btree_type_has_snapshots(i->btree_id) &&
|
|
!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
|
|
@@ -364,7 +368,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
|
|
struct jset_entry_log *l =
|
|
container_of(entry, struct jset_entry_log, entry);
|
|
|
|
- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
|
|
+ memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
|
|
+ trans->fn, strlen(trans->fn), 0);
|
|
}
|
|
|
|
static inline int btree_key_can_insert(struct btree_trans *trans,
|
|
@@ -517,69 +522,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
|
|
}
|
|
}
|
|
|
|
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
|
|
- unsigned *btree_id_updates_start)
|
|
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
|
|
{
|
|
- bool trans_trigger_run;
|
|
+ unsigned sort_id_start = 0;
|
|
|
|
- /*
|
|
- * Running triggers will append more updates to the list of updates as
|
|
- * we're walking it:
|
|
- */
|
|
- do {
|
|
- trans_trigger_run = false;
|
|
+ while (sort_id_start < trans->nr_updates) {
|
|
+ unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
|
|
+ bool trans_trigger_run;
|
|
|
|
- for (unsigned i = *btree_id_updates_start;
|
|
- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
|
|
- i++) {
|
|
- if (trans->updates[i].btree_id < btree_id) {
|
|
- *btree_id_updates_start = i;
|
|
- continue;
|
|
+ /*
|
|
+ * For a given btree, this algorithm runs insert triggers before
|
|
+ * overwrite triggers: this is so that when extents are being
|
|
+ * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
|
|
+ * references before they are re-added.
|
|
+ *
|
|
+ * Running triggers will append more updates to the list of
|
|
+ * updates as we're walking it:
|
|
+ */
|
|
+ do {
|
|
+ trans_trigger_run = false;
|
|
+
|
|
+ for (i = sort_id_start;
|
|
+ i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
|
|
+ i++) {
|
|
+ if (trans->updates[i].sort_order < sort_id) {
|
|
+ sort_id_start = i;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+ if (ret)
|
|
+ trans_trigger_run = true;
|
|
}
|
|
+ } while (trans_trigger_run);
|
|
|
|
- int ret = run_one_trans_trigger(trans, trans->updates + i);
|
|
- if (ret < 0)
|
|
- return ret;
|
|
- if (ret)
|
|
- trans_trigger_run = true;
|
|
- }
|
|
- } while (trans_trigger_run);
|
|
-
|
|
- trans_for_each_update(trans, i)
|
|
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
|
|
- i->btree_id == btree_id &&
|
|
- btree_node_type_has_trans_triggers(i->bkey_type) &&
|
|
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
|
|
-{
|
|
- unsigned btree_id = 0, btree_id_updates_start = 0;
|
|
- int ret = 0;
|
|
-
|
|
- /*
|
|
- *
|
|
- * For a given btree, this algorithm runs insert triggers before
|
|
- * overwrite triggers: this is so that when extents are being moved
|
|
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
|
|
- * they are re-added.
|
|
- */
|
|
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
|
|
- if (btree_id == BTREE_ID_alloc)
|
|
- continue;
|
|
-
|
|
- ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
|
|
- if (ret)
|
|
- return ret;
|
|
+ sort_id_start = i;
|
|
}
|
|
|
|
- btree_id_updates_start = 0;
|
|
- ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
|
|
- if (ret)
|
|
- return ret;
|
|
-
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
trans_for_each_update(trans, i)
|
|
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
|
|
@@ -666,10 +647,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
|
|
|
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
|
|
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
|
|
- if (bch2_journal_seq_verify)
|
|
+ if (static_branch_unlikely(&bch2_journal_seq_verify))
|
|
trans_for_each_update(trans, i)
|
|
i->k->k.bversion.lo = trans->journal_res.seq;
|
|
- else if (bch2_inject_invalid_keys)
|
|
+ else if (static_branch_unlikely(&bch2_inject_invalid_keys))
|
|
trans_for_each_update(trans, i)
|
|
i->k->k.bversion = MAX_VERSION;
|
|
}
|
|
@@ -682,18 +663,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
|
h = h->next;
|
|
}
|
|
|
|
- struct jset_entry *entry = trans->journal_entries;
|
|
+ struct bkey_i *accounting;
|
|
|
|
percpu_down_read(&c->mark_lock);
|
|
- for (entry = trans->journal_entries;
|
|
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
|
- entry = vstruct_next(entry))
|
|
- if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
|
|
- entry->start->k.type == KEY_TYPE_accounting) {
|
|
- ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
|
|
- if (ret)
|
|
- goto revert_fs_usage;
|
|
- }
|
|
+ for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+ accounting != btree_trans_subbuf_top(trans, &trans->accounting);
|
|
+ accounting = bkey_next(accounting)) {
|
|
+ ret = bch2_accounting_trans_commit_hook(trans,
|
|
+ bkey_i_to_accounting(accounting), flags);
|
|
+ if (ret)
|
|
+ goto revert_fs_usage;
|
|
+ }
|
|
percpu_up_read(&c->mark_lock);
|
|
|
|
/* XXX: we only want to run this if deltas are nonzero */
|
|
@@ -717,8 +697,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
|
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
|
|
validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
|
|
|
|
- for (struct jset_entry *i = trans->journal_entries;
|
|
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
|
+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
|
|
+ i != btree_trans_journal_entries_top(trans);
|
|
i = vstruct_next(i)) {
|
|
ret = bch2_journal_entry_validate(c, NULL, i,
|
|
bcachefs_metadata_version_current,
|
|
@@ -773,11 +753,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
|
}
|
|
|
|
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
|
|
- trans->journal_entries,
|
|
- trans->journal_entries_u64s);
|
|
+ btree_trans_journal_entries_start(trans),
|
|
+ trans->journal_entries.u64s);
|
|
+
|
|
+ trans->journal_res.offset += trans->journal_entries.u64s;
|
|
+ trans->journal_res.u64s -= trans->journal_entries.u64s;
|
|
|
|
- trans->journal_res.offset += trans->journal_entries_u64s;
|
|
- trans->journal_res.u64s -= trans->journal_entries_u64s;
|
|
+ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
|
|
+ BCH_JSET_ENTRY_write_buffer_keys,
|
|
+ BTREE_ID_accounting, 0,
|
|
+ trans->accounting.u64s)->_data,
|
|
+ btree_trans_subbuf_base(trans, &trans->accounting),
|
|
+ trans->accounting.u64s);
|
|
|
|
if (trans->journal_seq)
|
|
*trans->journal_seq = trans->journal_res.seq;
|
|
@@ -799,13 +786,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
|
bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
|
|
percpu_down_read(&c->mark_lock);
|
|
revert_fs_usage:
|
|
- for (struct jset_entry *entry2 = trans->journal_entries;
|
|
- entry2 != entry;
|
|
- entry2 = vstruct_next(entry2))
|
|
- if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
|
|
- entry2->start->k.type == KEY_TYPE_accounting)
|
|
- bch2_accounting_trans_commit_revert(trans,
|
|
- bkey_i_to_accounting(entry2->start), flags);
|
|
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+ i != accounting;
|
|
+ i = bkey_next(i))
|
|
+ bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
|
|
percpu_up_read(&c->mark_lock);
|
|
return ret;
|
|
}
|
|
@@ -903,18 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
|
|
struct bch_fs *c = trans->c;
|
|
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
|
|
|
|
- switch (ret) {
|
|
- case -BCH_ERR_btree_insert_btree_node_full:
|
|
- ret = bch2_btree_split_leaf(trans, i->path, flags);
|
|
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
- trace_and_count(c, trans_restart_btree_node_split, trans,
|
|
- trace_ip, trans->paths + i->path);
|
|
- break;
|
|
- case -BCH_ERR_btree_insert_need_mark_replicas:
|
|
- ret = drop_locks_do(trans,
|
|
- bch2_accounting_update_sb(trans));
|
|
- break;
|
|
- case -BCH_ERR_journal_res_get_blocked:
|
|
+ if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
|
|
/*
|
|
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
|
|
* flag
|
|
@@ -922,13 +895,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
|
|
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
|
|
watermark < BCH_WATERMARK_reclaim) {
|
|
ret = -BCH_ERR_journal_reclaim_would_deadlock;
|
|
- break;
|
|
+ goto out;
|
|
}
|
|
|
|
ret = drop_locks_do(trans,
|
|
bch2_trans_journal_res_get(trans,
|
|
(flags & BCH_WATERMARK_MASK)|
|
|
JOURNAL_RES_GET_CHECK));
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ switch (ret) {
|
|
+ case -BCH_ERR_btree_insert_btree_node_full:
|
|
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
+ trace_and_count(c, trans_restart_btree_node_split, trans,
|
|
+ trace_ip, trans->paths + i->path);
|
|
+ break;
|
|
+ case -BCH_ERR_btree_insert_need_mark_replicas:
|
|
+ ret = drop_locks_do(trans,
|
|
+ bch2_accounting_update_sb(trans));
|
|
break;
|
|
case -BCH_ERR_btree_insert_need_journal_reclaim:
|
|
bch2_trans_unlock(trans);
|
|
@@ -950,7 +936,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
|
|
BUG_ON(ret >= 0);
|
|
break;
|
|
}
|
|
-
|
|
+out:
|
|
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
|
|
|
|
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
|
|
@@ -978,8 +964,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
|
|
return ret;
|
|
}
|
|
|
|
- for (struct jset_entry *i = trans->journal_entries;
|
|
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
|
+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
|
|
+ i != btree_trans_journal_entries_top(trans);
|
|
i = vstruct_next(i))
|
|
if (i->type == BCH_JSET_ENTRY_btree_keys ||
|
|
i->type == BCH_JSET_ENTRY_write_buffer_keys) {
|
|
@@ -988,6 +974,14 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
|
|
return ret;
|
|
}
|
|
|
|
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+ i != btree_trans_subbuf_top(trans, &trans->accounting);
|
|
+ i = bkey_next(i)) {
|
|
+ int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
@@ -1004,7 +998,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
|
goto out_reset;
|
|
|
|
if (!trans->nr_updates &&
|
|
- !trans->journal_entries_u64s)
|
|
+ !trans->journal_entries.u64s &&
|
|
+ !trans->accounting.u64s)
|
|
goto out_reset;
|
|
|
|
ret = bch2_trans_commit_run_triggers(trans);
|
|
@@ -1012,7 +1007,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
|
goto out_reset;
|
|
|
|
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
|
|
- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
|
|
+ unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
|
|
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
|
|
ret = do_bch2_trans_commit_to_journal_replay(trans);
|
|
else
|
|
@@ -1022,7 +1017,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
|
|
|
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
|
|
|
|
- trans->journal_u64s = trans->journal_entries_u64s;
|
|
+ trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s);
|
|
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
|
|
if (trans->journal_transaction_names)
|
|
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
|
|
@@ -1078,7 +1073,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
|
trace_and_count(c, transaction_commit, trans, _RET_IP_);
|
|
out:
|
|
if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_trans);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
|
|
out_reset:
|
|
if (!ret)
|
|
bch2_trans_downgrade(trans);
|
|
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
|
|
index a09cbe9cd94f..1cec08467e17 100644
|
|
--- a/fs/bcachefs/btree_types.h
|
|
+++ b/fs/bcachefs/btree_types.h
|
|
@@ -2,13 +2,13 @@
|
|
#ifndef _BCACHEFS_BTREE_TYPES_H
|
|
#define _BCACHEFS_BTREE_TYPES_H
|
|
|
|
+#include <linux/darray_types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/rhashtable.h>
|
|
|
|
#include "bbpos_types.h"
|
|
#include "btree_key_cache_types.h"
|
|
#include "buckets_types.h"
|
|
-#include "darray.h"
|
|
#include "errcode.h"
|
|
#include "journal_types.h"
|
|
#include "replicas_types.h"
|
|
@@ -139,6 +139,7 @@ struct btree {
|
|
};
|
|
|
|
#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
|
|
+ x(cache_reserve) \
|
|
x(lock_intent) \
|
|
x(lock_write) \
|
|
x(dirty) \
|
|
@@ -257,9 +258,6 @@ struct btree_node_iter {
|
|
*
|
|
* BTREE_TRIGGER_insert - @new is entering the btree
|
|
* BTREE_TRIGGER_overwrite - @old is leaving the btree
|
|
- *
|
|
- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
|
|
- * trigger
|
|
*/
|
|
#define BTREE_TRIGGER_FLAGS() \
|
|
x(norun) \
|
|
@@ -269,8 +267,7 @@ struct btree_node_iter {
|
|
x(gc) \
|
|
x(insert) \
|
|
x(overwrite) \
|
|
- x(is_root) \
|
|
- x(bucket_invalidate)
|
|
+ x(is_root)
|
|
|
|
enum {
|
|
#define x(n) BTREE_ITER_FLAG_BIT_##n,
|
|
@@ -367,7 +364,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
|
|
* @nodes_intent_locked - bitmask indicating which locks are intent locks
|
|
*/
|
|
struct btree_iter {
|
|
- struct btree_trans *trans;
|
|
btree_path_idx_t path;
|
|
btree_path_idx_t update_path;
|
|
btree_path_idx_t key_cache_path;
|
|
@@ -423,6 +419,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
|
|
|
|
struct btree_insert_entry {
|
|
unsigned flags;
|
|
+ u8 sort_order;
|
|
u8 bkey_type;
|
|
enum btree_id btree_id:8;
|
|
u8 level:4;
|
|
@@ -477,6 +474,18 @@ struct btree_trans_paths {
|
|
struct btree_path paths[];
|
|
};
|
|
|
|
+struct trans_kmalloc_trace {
|
|
+ unsigned long ip;
|
|
+ size_t bytes;
|
|
+};
|
|
+typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
|
|
+
|
|
+struct btree_trans_subbuf {
|
|
+ u16 base;
|
|
+ u16 u64s;
|
|
+	u16 size;
|
|
+};
|
|
+
|
|
struct btree_trans {
|
|
struct bch_fs *c;
|
|
|
|
@@ -488,6 +497,9 @@ struct btree_trans {
|
|
void *mem;
|
|
unsigned mem_top;
|
|
unsigned mem_bytes;
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ darray_trans_kmalloc_trace trans_kmalloc_trace;
|
|
+#endif
|
|
|
|
btree_path_idx_t nr_sorted;
|
|
btree_path_idx_t nr_paths;
|
|
@@ -528,9 +540,8 @@ struct btree_trans {
|
|
int srcu_idx;
|
|
|
|
/* update path: */
|
|
- u16 journal_entries_u64s;
|
|
- u16 journal_entries_size;
|
|
- struct jset_entry *journal_entries;
|
|
+ struct btree_trans_subbuf journal_entries;
|
|
+ struct btree_trans_subbuf accounting;
|
|
|
|
struct btree_trans_commit_hook *hooks;
|
|
struct journal_entry_pin *journal_pin;
|
|
@@ -647,13 +658,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b)
|
|
static inline void *
|
|
__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
|
|
{
|
|
- return (void *) ((u64 *) b->data + 1 + offset);
|
|
+ return (void *) ((u64 *) b->data + offset);
|
|
}
|
|
|
|
static inline u16
|
|
__btree_node_ptr_to_offset(const struct btree *b, const void *p)
|
|
{
|
|
- u16 ret = (u64 *) p - 1 - (u64 *) b->data;
|
|
+ u16 ret = (u64 *) p - (u64 *) b->data;
|
|
|
|
EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
|
|
return ret;
|
|
@@ -853,6 +864,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree)
|
|
return BIT_ULL(btree) & mask;
|
|
}
|
|
|
|
+static inline u8 btree_trigger_order(enum btree_id btree)
|
|
+{
|
|
+ switch (btree) {
|
|
+ case BTREE_ID_alloc:
|
|
+ return U8_MAX;
|
|
+ case BTREE_ID_stripes:
|
|
+ return U8_MAX - 1;
|
|
+ default:
|
|
+ return btree;
|
|
+ }
|
|
+}
|
|
+
|
|
struct btree_root {
|
|
struct btree *b;
|
|
|
|
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
|
|
index 13d794f201a5..afd05c3dfd03 100644
|
|
--- a/fs/bcachefs/btree_update.c
|
|
+++ b/fs/bcachefs/btree_update.c
|
|
@@ -14,10 +14,13 @@
|
|
#include "snapshot.h"
|
|
#include "trace.h"
|
|
|
|
+#include <linux/darray.h>
|
|
+#include <linux/string_helpers.h>
|
|
+
|
|
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
|
|
const struct btree_insert_entry *r)
|
|
{
|
|
- return cmp_int(l->btree_id, r->btree_id) ?:
|
|
+ return cmp_int(l->sort_order, r->sort_order) ?:
|
|
cmp_int(l->cached, r->cached) ?:
|
|
-cmp_int(l->level, r->level) ?:
|
|
bpos_cmp(l->k->k.p, r->k->k.p);
|
|
@@ -126,7 +129,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
|
|
struct bpos new_pos)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter old_iter, new_iter = { NULL };
|
|
+ struct btree_iter old_iter, new_iter = {};
|
|
struct bkey_s_c old_k, new_k;
|
|
snapshot_id_list s;
|
|
struct bkey_i *update;
|
|
@@ -140,7 +143,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
|
|
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
|
|
BTREE_ITER_not_extents|
|
|
BTREE_ITER_all_snapshots);
|
|
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
|
|
+ while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
|
|
!(ret = bkey_err(old_k)) &&
|
|
bkey_eq(old_pos, old_k.k->p)) {
|
|
struct bpos whiteout_pos =
|
|
@@ -296,7 +299,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
|
|
BTREE_ITER_intent|
|
|
BTREE_ITER_with_updates|
|
|
BTREE_ITER_not_extents);
|
|
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
|
|
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
|
|
if ((ret = bkey_err(k)))
|
|
goto err;
|
|
if (!k.k)
|
|
@@ -322,8 +325,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
|
|
if (done)
|
|
goto out;
|
|
next:
|
|
- bch2_btree_iter_advance(&iter);
|
|
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
|
|
if ((ret = bkey_err(k)))
|
|
goto err;
|
|
if (!k.k)
|
|
@@ -397,6 +400,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
|
|
|
|
n = (struct btree_insert_entry) {
|
|
.flags = flags,
|
|
+ .sort_order = btree_trigger_order(path->btree_id),
|
|
.bkey_type = __btree_node_type(path->level, path->btree_id),
|
|
.btree_id = path->btree_id,
|
|
.level = path->level,
|
|
@@ -508,9 +512,12 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
|
|
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
|
|
+int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
|
|
+ unsigned long ip)
|
|
{
|
|
+ kmsan_check_memory(k, bkey_bytes(&k->k));
|
|
+
|
|
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
|
|
int ret;
|
|
|
|
@@ -543,7 +550,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
|
|
path_idx = iter->key_cache_path;
|
|
}
|
|
|
|
- return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
|
|
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
|
|
}
|
|
|
|
int bch2_btree_insert_clone_trans(struct btree_trans *trans,
|
|
@@ -559,43 +566,42 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans,
|
|
return bch2_btree_insert_trans(trans, btree, n, 0);
|
|
}
|
|
|
|
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
|
|
+void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
|
|
+ struct btree_trans_subbuf *buf,
|
|
+ unsigned u64s)
|
|
{
|
|
- unsigned new_top = trans->journal_entries_u64s + u64s;
|
|
- unsigned old_size = trans->journal_entries_size;
|
|
-
|
|
- if (new_top > trans->journal_entries_size) {
|
|
- trans->journal_entries_size = roundup_pow_of_two(new_top);
|
|
+ unsigned new_top = buf->u64s + u64s;
|
|
+ unsigned old_size = buf->size;
|
|
|
|
- btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
|
|
- }
|
|
+ if (new_top > buf->size)
|
|
+ buf->size = roundup_pow_of_two(new_top);
|
|
|
|
- struct jset_entry *n =
|
|
- bch2_trans_kmalloc_nomemzero(trans,
|
|
- trans->journal_entries_size * sizeof(u64));
|
|
+ void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64));
|
|
if (IS_ERR(n))
|
|
- return ERR_CAST(n);
|
|
+ return n;
|
|
|
|
- if (trans->journal_entries)
|
|
- memcpy(n, trans->journal_entries, old_size * sizeof(u64));
|
|
- trans->journal_entries = n;
|
|
+ if (buf->u64s)
|
|
+ memcpy(n,
|
|
+ btree_trans_subbuf_base(trans, buf),
|
|
+ old_size * sizeof(u64));
|
|
+ buf->base = (u64 *) n - (u64 *) trans->mem;
|
|
|
|
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
|
|
- trans->journal_entries_u64s = new_top;
|
|
- return e;
|
|
+ void *p = btree_trans_subbuf_top(trans, buf);
|
|
+ buf->u64s = new_top;
|
|
+ return p;
|
|
}
|
|
|
|
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
|
|
enum btree_id btree, struct bpos end)
|
|
{
|
|
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
|
|
- struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
|
|
int ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_advance(iter);
|
|
- k = bch2_btree_iter_peek_slot(iter);
|
|
+ bch2_btree_iter_advance(trans, iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -631,7 +637,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
|
|
BTREE_ITER_cached|
|
|
BTREE_ITER_not_extents|
|
|
BTREE_ITER_intent);
|
|
- ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_trans_update(trans, &iter, k, flags);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
@@ -643,7 +649,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
|
|
struct btree_iter iter;
|
|
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
|
|
BTREE_ITER_intent|flags);
|
|
- int ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_trans_update(trans, &iter, k, flags);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
@@ -692,7 +698,7 @@ int bch2_btree_delete(struct btree_trans *trans,
|
|
bch2_trans_iter_init(trans, &iter, btree, pos,
|
|
BTREE_ITER_cached|
|
|
BTREE_ITER_intent);
|
|
- ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_btree_delete_at(trans, &iter, update_flags);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
@@ -710,7 +716,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
|
|
int ret = 0;
|
|
|
|
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
|
|
- while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
|
|
+ while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
|
|
struct disk_reservation disk_res =
|
|
bch2_disk_reservation_init(trans->c, 0);
|
|
struct bkey_i delete;
|
|
@@ -805,7 +811,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
|
|
struct btree_iter iter;
|
|
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
|
|
|
|
- int ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_btree_bit_mod_iter(trans, &iter, set);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
@@ -826,7 +832,6 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
|
|
int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
|
|
{
|
|
unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
|
|
- prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
|
|
|
|
int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
|
|
if (ret)
|
|
@@ -839,7 +844,20 @@ int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
|
|
|
|
struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
|
|
journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
|
|
- memcpy(l->d, buf->buf, buf->pos);
|
|
+ memcpy_and_pad(l->d, u64s * sizeof(u64), buf->buf, buf->pos, 0);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree,
|
|
+ unsigned level, struct bkey_i *k)
|
|
+{
|
|
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
|
|
+ int ret = PTR_ERR_OR_ZERO(e);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s);
|
|
+ bkey_copy(e->start, k);
|
|
return 0;
|
|
}
|
|
|
|
@@ -852,7 +870,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
|
|
prt_vprintf(&buf, fmt, args);
|
|
|
|
unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
|
|
- prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
|
|
|
|
int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
|
|
if (ret)
|
|
@@ -865,7 +882,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
|
|
|
|
struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
|
|
journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
|
|
- memcpy(l->d, buf.buf, buf.pos);
|
|
+ memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
|
|
c->journal.early_journal_entries.nr += jset_u64s(u64s);
|
|
} else {
|
|
ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
|
|
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
|
|
index 47d8690f01bf..a54dc7277177 100644
|
|
--- a/fs/bcachefs/btree_update.h
|
|
+++ b/fs/bcachefs/btree_update.h
|
|
@@ -102,26 +102,60 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *
|
|
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
|
|
enum btree_id, struct bpos);
|
|
|
|
-int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
|
|
- struct bkey_i *, enum btree_iter_update_trigger_flags);
|
|
+int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
|
|
+ unsigned long);
|
|
|
|
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
|
|
+static inline int __must_check
|
|
+bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
|
|
+{
|
|
+ return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
|
|
+}
|
|
+
|
|
+static inline void *btree_trans_subbuf_base(struct btree_trans *trans,
|
|
+ struct btree_trans_subbuf *buf)
|
|
+{
|
|
+ return (u64 *) trans->mem + buf->base;
|
|
+}
|
|
+
|
|
+static inline void *btree_trans_subbuf_top(struct btree_trans *trans,
|
|
+ struct btree_trans_subbuf *buf)
|
|
+{
|
|
+ return (u64 *) trans->mem + buf->base + buf->u64s;
|
|
+}
|
|
+
|
|
+void *__bch2_trans_subbuf_alloc(struct btree_trans *,
|
|
+ struct btree_trans_subbuf *,
|
|
+ unsigned);
|
|
+
|
|
+static inline void *
|
|
+bch2_trans_subbuf_alloc(struct btree_trans *trans,
|
|
+ struct btree_trans_subbuf *buf,
|
|
+ unsigned u64s)
|
|
+{
|
|
+ if (buf->u64s + u64s > buf->size)
|
|
+ return __bch2_trans_subbuf_alloc(trans, buf, u64s);
|
|
+
|
|
+ void *p = btree_trans_subbuf_top(trans, buf);
|
|
+ buf->u64s += u64s;
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans)
|
|
+{
|
|
+ return btree_trans_subbuf_base(trans, &trans->journal_entries);
|
|
+}
|
|
|
|
static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
|
|
{
|
|
- return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
|
+ return btree_trans_subbuf_top(trans, &trans->journal_entries);
|
|
}
|
|
|
|
static inline struct jset_entry *
|
|
bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
|
|
{
|
|
- if (!trans->journal_entries ||
|
|
- trans->journal_entries_u64s + u64s > trans->journal_entries_size)
|
|
- return __bch2_trans_jset_entry_alloc(trans, u64s);
|
|
-
|
|
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
|
|
- trans->journal_entries_u64s += u64s;
|
|
- return e;
|
|
+ return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s);
|
|
}
|
|
|
|
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
|
|
@@ -133,6 +167,10 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
|
|
enum btree_id btree,
|
|
struct bkey_i *k)
|
|
{
|
|
+ kmsan_check_memory(k, bkey_bytes(&k->k));
|
|
+
|
|
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
|
|
+
|
|
if (unlikely(!btree_type_uses_write_buffer(btree))) {
|
|
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
|
|
dump_stack();
|
|
@@ -168,6 +206,8 @@ void bch2_trans_commit_hook(struct btree_trans *,
|
|
int __bch2_trans_commit(struct btree_trans *, unsigned);
|
|
|
|
int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
|
|
+int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *);
|
|
+
|
|
__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
|
|
__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
|
|
|
|
@@ -213,12 +253,15 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
|
|
bch2_path_put(trans, i->path, true);
|
|
|
|
trans->nr_updates = 0;
|
|
- trans->journal_entries_u64s = 0;
|
|
+ trans->journal_entries.u64s = 0;
|
|
+ trans->journal_entries.size = 0;
|
|
+ trans->accounting.u64s = 0;
|
|
+ trans->accounting.size = 0;
|
|
trans->hooks = NULL;
|
|
trans->extra_disk_res = 0;
|
|
}
|
|
|
|
-static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
|
|
+static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
|
|
unsigned type, unsigned min_bytes)
|
|
{
|
|
unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
|
|
@@ -241,7 +284,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
|
|
return mut;
|
|
}
|
|
|
|
-static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
|
|
+static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
|
|
{
|
|
return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
|
|
}
|
|
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
|
|
index e4e7c804625e..74e65714fecd 100644
|
|
--- a/fs/bcachefs/btree_update_interior.c
|
|
+++ b/fs/bcachefs/btree_update_interior.c
|
|
@@ -14,6 +14,7 @@
|
|
#include "btree_locking.h"
|
|
#include "buckets.h"
|
|
#include "clock.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
#include "io_write.h"
|
|
@@ -35,6 +36,8 @@ static const char * const bch2_btree_update_modes[] = {
|
|
NULL
|
|
};
|
|
|
|
+static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);
|
|
+
|
|
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
|
|
btree_path_idx_t, struct btree *, struct keylist *);
|
|
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
|
|
@@ -54,6 +57,8 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
|
|
struct bkey_buf prev;
|
|
int ret = 0;
|
|
|
|
+ printbuf_indent_add_nextline(&buf, 2);
|
|
+
|
|
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
|
|
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
|
|
b->data->min_key));
|
|
@@ -64,19 +69,20 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
|
|
|
|
if (b == btree_node_root(c, b)) {
|
|
if (!bpos_eq(b->data->min_key, POS_MIN)) {
|
|
- printbuf_reset(&buf);
|
|
+ ret = __bch2_topology_error(c, &buf);
|
|
+
|
|
bch2_bpos_to_text(&buf, b->data->min_key);
|
|
log_fsck_err(trans, btree_root_bad_min_key,
|
|
"btree root with incorrect min_key: %s", buf.buf);
|
|
- goto topology_repair;
|
|
+ goto out;
|
|
}
|
|
|
|
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
|
|
- printbuf_reset(&buf);
|
|
+ ret = __bch2_topology_error(c, &buf);
|
|
bch2_bpos_to_text(&buf, b->data->max_key);
|
|
log_fsck_err(trans, btree_root_bad_max_key,
|
|
"btree root with incorrect max_key: %s", buf.buf);
|
|
- goto topology_repair;
|
|
+ goto out;
|
|
}
|
|
}
|
|
|
|
@@ -94,20 +100,19 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
|
|
: bpos_successor(prev.k->k.p);
|
|
|
|
if (!bpos_eq(expected_min, bp.v->min_key)) {
|
|
- bch2_topology_error(c);
|
|
+ ret = __bch2_topology_error(c, &buf);
|
|
|
|
- printbuf_reset(&buf);
|
|
- prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
|
|
+ prt_str(&buf, "end of prev node doesn't match start of next node\nin ");
|
|
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
|
prt_str(&buf, " node ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
- prt_str(&buf, "\n prev ");
|
|
+ prt_str(&buf, "\nprev ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
|
|
- prt_str(&buf, "\n next ");
|
|
+ prt_str(&buf, "\nnext ");
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
|
|
log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
|
|
- goto topology_repair;
|
|
+ goto out;
|
|
}
|
|
|
|
bch2_bkey_buf_reassemble(&prev, c, k);
|
|
@@ -115,29 +120,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
|
|
}
|
|
|
|
if (bkey_deleted(&prev.k->k)) {
|
|
- bch2_topology_error(c);
|
|
+ ret = __bch2_topology_error(c, &buf);
|
|
|
|
- printbuf_reset(&buf);
|
|
- prt_str(&buf, "empty interior node\n in ");
|
|
+ prt_str(&buf, "empty interior node\nin ");
|
|
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
|
prt_str(&buf, " node ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
|
|
log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
|
|
- goto topology_repair;
|
|
} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
|
|
- bch2_topology_error(c);
|
|
+ ret = __bch2_topology_error(c, &buf);
|
|
|
|
- printbuf_reset(&buf);
|
|
- prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
|
|
+ prt_str(&buf, "last child node doesn't end at end of parent node\nin ");
|
|
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
|
prt_str(&buf, " node ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
- prt_str(&buf, "\n last key ");
|
|
+ prt_str(&buf, "\nlast key ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
|
|
|
|
log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
|
|
- goto topology_repair;
|
|
}
|
|
out:
|
|
fsck_err:
|
|
@@ -145,9 +146,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
|
|
bch2_bkey_buf_exit(&prev, c);
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
-topology_repair:
|
|
- ret = bch2_topology_error(c);
|
|
- goto out;
|
|
}
|
|
|
|
/* Calculate ideal packed bkey format for new btree nodes: */
|
|
@@ -287,6 +285,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
|
|
struct disk_reservation *res,
|
|
struct closure *cl,
|
|
bool interior_node,
|
|
+ unsigned target,
|
|
unsigned flags)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
@@ -320,6 +319,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
|
|
mutex_unlock(&c->btree_reserve_cache_lock);
|
|
retry:
|
|
ret = bch2_alloc_sectors_start_trans(trans,
|
|
+ target ?:
|
|
c->opts.metadata_target ?:
|
|
c->opts.foreground_target,
|
|
0,
|
|
@@ -328,7 +328,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
|
|
res->nr_replicas,
|
|
min(res->nr_replicas,
|
|
c->opts.metadata_replicas_required),
|
|
- watermark, 0, cl, &wp);
|
|
+ watermark,
|
|
+ target ? BCH_WRITE_only_specified_devs : 0,
|
|
+ cl, &wp);
|
|
if (unlikely(ret))
|
|
goto err;
|
|
|
|
@@ -508,6 +510,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
|
|
static int bch2_btree_reserve_get(struct btree_trans *trans,
|
|
struct btree_update *as,
|
|
unsigned nr_nodes[2],
|
|
+ unsigned target,
|
|
unsigned flags,
|
|
struct closure *cl)
|
|
{
|
|
@@ -530,7 +533,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
|
|
|
|
while (p->nr < nr_nodes[interior]) {
|
|
b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
|
|
- interior, flags);
|
|
+ interior, target, flags);
|
|
if (IS_ERR(b)) {
|
|
ret = PTR_ERR(b);
|
|
goto err;
|
|
@@ -649,6 +652,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
+/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
|
|
+static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
|
|
+{
|
|
+ struct btree_node *b_data = READ_ONCE(b->data);
|
|
+
|
|
+ return (b_data ? b_data->keys.seq : 0) == seq;
|
|
+}
|
|
+
|
|
static void btree_update_nodes_written(struct btree_update *as)
|
|
{
|
|
struct bch_fs *c = as->c;
|
|
@@ -677,17 +688,9 @@ static void btree_update_nodes_written(struct btree_update *as)
|
|
* on disk:
|
|
*/
|
|
for (i = 0; i < as->nr_old_nodes; i++) {
|
|
- __le64 seq;
|
|
-
|
|
b = as->old_nodes[i];
|
|
|
|
- bch2_trans_begin(trans);
|
|
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
|
|
- seq = b->data ? b->data->keys.seq : 0;
|
|
- six_unlock_read(&b->c.lock);
|
|
- bch2_trans_unlock_long(trans);
|
|
-
|
|
- if (seq == as->old_nodes_seq[i])
|
|
+ if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
|
|
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
|
|
TASK_UNINTERRUPTIBLE);
|
|
}
|
|
@@ -1119,7 +1122,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
|
|
|
|
static struct btree_update *
|
|
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
|
|
- unsigned level_start, bool split, unsigned flags)
|
|
+ unsigned level_start, bool split,
|
|
+ unsigned target, unsigned flags)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct btree_update *as;
|
|
@@ -1224,12 +1228,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
|
|
|
|
ret = bch2_disk_reservation_get(c, &as->disk_res,
|
|
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
|
|
- c->opts.metadata_replicas,
|
|
+ READ_ONCE(c->opts.metadata_replicas),
|
|
disk_res_flags);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
|
|
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL);
|
|
if (bch2_err_matches(ret, ENOSPC) ||
|
|
bch2_err_matches(ret, ENOMEM)) {
|
|
struct closure cl;
|
|
@@ -1248,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
|
|
closure_init_stack(&cl);
|
|
|
|
do {
|
|
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
|
|
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
|
|
|
|
bch2_trans_unlock(trans);
|
|
bch2_wait_on_allocator(c, &cl);
|
|
@@ -1271,7 +1275,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
|
|
bch2_btree_update_free(as, trans);
|
|
if (!bch2_err_matches(ret, ENOSPC) &&
|
|
!bch2_err_matches(ret, EROFS) &&
|
|
- ret != -BCH_ERR_journal_reclaim_would_deadlock)
|
|
+ ret != -BCH_ERR_journal_reclaim_would_deadlock &&
|
|
+ ret != -BCH_ERR_journal_shutdown)
|
|
bch_err_fn_ratelimited(c, ret);
|
|
return ERR_PTR(ret);
|
|
}
|
|
@@ -1391,7 +1396,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
-static void
|
|
+static int
|
|
bch2_btree_insert_keys_interior(struct btree_update *as,
|
|
struct btree_trans *trans,
|
|
struct btree_path *path,
|
|
@@ -1413,7 +1418,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
|
|
insert = bkey_next(insert))
|
|
bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
|
|
|
|
- if (bch2_btree_node_check_topology(trans, b)) {
|
|
+ int ret = bch2_btree_node_check_topology(trans, b);
|
|
+ if (ret) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
for (struct bkey_i *k = keys->keys;
|
|
@@ -1423,11 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
|
|
prt_newline(&buf);
|
|
}
|
|
|
|
- panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
|
|
+ bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
|
|
+ (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
|
|
+ dump_stack();
|
|
+ return ret;
|
|
}
|
|
|
|
memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
|
|
keys->top_p -= insert->_data - keys->keys_p;
|
|
+ return 0;
|
|
}
|
|
|
|
static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
|
|
@@ -1561,11 +1571,11 @@ static void __btree_split_node(struct btree_update *as,
|
|
* nodes that were coalesced, and thus in the middle of a child node post
|
|
* coalescing:
|
|
*/
|
|
-static void btree_split_insert_keys(struct btree_update *as,
|
|
- struct btree_trans *trans,
|
|
- btree_path_idx_t path_idx,
|
|
- struct btree *b,
|
|
- struct keylist *keys)
|
|
+static int btree_split_insert_keys(struct btree_update *as,
|
|
+ struct btree_trans *trans,
|
|
+ btree_path_idx_t path_idx,
|
|
+ struct btree *b,
|
|
+ struct keylist *keys)
|
|
{
|
|
struct btree_path *path = trans->paths + path_idx;
|
|
|
|
@@ -1575,8 +1585,12 @@ static void btree_split_insert_keys(struct btree_update *as,
|
|
|
|
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
|
|
|
|
- bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
|
|
+ int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
|
|
+ if (ret)
|
|
+ return ret;
|
|
}
|
|
+
|
|
+ return 0;
|
|
}
|
|
|
|
static int btree_split(struct btree_update *as, struct btree_trans *trans,
|
|
@@ -1609,8 +1623,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
|
|
__btree_split_node(as, trans, b, n, keys);
|
|
|
|
if (keys) {
|
|
- btree_split_insert_keys(as, trans, path, n1, keys);
|
|
- btree_split_insert_keys(as, trans, path, n2, keys);
|
|
+ ret = btree_split_insert_keys(as, trans, path, n1, keys) ?:
|
|
+ btree_split_insert_keys(as, trans, path, n2, keys);
|
|
+ if (ret)
|
|
+ goto err;
|
|
BUG_ON(!bch2_keylist_empty(keys));
|
|
}
|
|
|
|
@@ -1656,7 +1672,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
|
|
n3->sib_u64s[0] = U16_MAX;
|
|
n3->sib_u64s[1] = U16_MAX;
|
|
|
|
- btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
|
|
+ ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
|
|
+ if (ret)
|
|
+ goto err;
|
|
}
|
|
} else {
|
|
trace_and_count(c, btree_node_compact, trans, b);
|
|
@@ -1664,7 +1682,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
|
|
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
|
|
|
|
if (keys) {
|
|
- btree_split_insert_keys(as, trans, path, n1, keys);
|
|
+ ret = btree_split_insert_keys(as, trans, path, n1, keys);
|
|
+ if (ret)
|
|
+ goto err;
|
|
BUG_ON(!bch2_keylist_empty(keys));
|
|
}
|
|
|
|
@@ -1782,11 +1802,24 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
|
|
int ret;
|
|
|
|
lockdep_assert_held(&c->gc_lock);
|
|
- BUG_ON(!btree_node_intent_locked(path, b->c.level));
|
|
BUG_ON(!b->c.level);
|
|
BUG_ON(!as || as->b);
|
|
bch2_verify_keylist_sorted(keys);
|
|
|
|
+ if (!btree_node_intent_locked(path, b->c.level)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf, "%s(): node not locked at level %u\n",
|
|
+ __func__, b->c.level);
|
|
+ bch2_btree_update_to_text(&buf, as);
|
|
+ bch2_btree_path_to_text(&buf, trans, path_idx);
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return -EIO;
|
|
+ }
|
|
+
|
|
ret = bch2_btree_node_lock_write(trans, path, &b->c);
|
|
if (ret)
|
|
return ret;
|
|
@@ -1798,15 +1831,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
|
|
goto split;
|
|
}
|
|
|
|
- ret = bch2_btree_node_check_topology(trans, b);
|
|
+
|
|
+ ret = bch2_btree_node_check_topology(trans, b) ?:
|
|
+ bch2_btree_insert_keys_interior(as, trans, path, b,
|
|
+ path->l[b->c.level].iter, keys);
|
|
if (ret) {
|
|
bch2_btree_node_unlock_write(trans, path, b);
|
|
return ret;
|
|
}
|
|
|
|
- bch2_btree_insert_keys_interior(as, trans, path, b,
|
|
- path->l[b->c.level].iter, keys);
|
|
-
|
|
trans_for_each_path_with_node(trans, b, linked, i)
|
|
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
|
|
|
|
@@ -1852,7 +1885,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
|
|
|
|
as = bch2_btree_update_start(trans, trans->paths + path,
|
|
trans->paths[path].level,
|
|
- true, flags);
|
|
+ true, 0, flags);
|
|
if (IS_ERR(as))
|
|
return PTR_ERR(as);
|
|
|
|
@@ -1922,7 +1955,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path,
|
|
return bch2_btree_split_leaf(trans, path, flags);
|
|
|
|
struct btree_update *as =
|
|
- bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
|
|
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level,
|
|
+ true, 0, flags);
|
|
if (IS_ERR(as))
|
|
return PTR_ERR(as);
|
|
|
|
@@ -2007,18 +2041,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
|
|
}
|
|
|
|
if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
|
|
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
|
|
-
|
|
- bch2_bpos_to_text(&buf1, prev->data->max_key);
|
|
- bch2_bpos_to_text(&buf2, next->data->min_key);
|
|
- bch_err(c,
|
|
- "%s(): btree topology error:\n"
|
|
- " prev ends at %s\n"
|
|
- " next starts at %s",
|
|
- __func__, buf1.buf, buf2.buf);
|
|
- printbuf_exit(&buf1);
|
|
- printbuf_exit(&buf2);
|
|
- ret = bch2_topology_error(c);
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ printbuf_indent_add_nextline(&buf, 2);
|
|
+ prt_printf(&buf, "%s(): ", __func__);
|
|
+ ret = __bch2_topology_error(c, &buf);
|
|
+ prt_newline(&buf);
|
|
+
|
|
+ prt_printf(&buf, "prev ends at ");
|
|
+ bch2_bpos_to_text(&buf, prev->data->max_key);
|
|
+ prt_newline(&buf);
|
|
+
|
|
+ prt_printf(&buf, "next starts at ");
|
|
+ bch2_bpos_to_text(&buf, next->data->min_key);
|
|
+
|
|
+ bch_err(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
goto err;
|
|
}
|
|
|
|
@@ -2047,7 +2085,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
|
|
|
|
parent = btree_node_parent(trans->paths + path, b);
|
|
as = bch2_btree_update_start(trans, trans->paths + path, level, false,
|
|
- BCH_TRANS_COMMIT_no_enospc|flags);
|
|
+ 0, BCH_TRANS_COMMIT_no_enospc|flags);
|
|
ret = PTR_ERR_OR_ZERO(as);
|
|
if (ret)
|
|
goto err;
|
|
@@ -2126,9 +2164,35 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
|
|
goto out;
|
|
}
|
|
|
|
+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
|
|
+ BTREE_MAX_DEPTH, b->c.level,
|
|
+ BTREE_ITER_intent);
|
|
+ int ret = bch2_btree_iter_traverse(trans, iter);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /* has node been freed? */
|
|
+ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
|
|
+ /* node has been freed: */
|
|
+ BUG_ON(!btree_node_dying(b));
|
|
+ ret = -BCH_ERR_btree_node_dying;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!btree_node_hashed(b));
|
|
+ return 0;
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
int bch2_btree_node_rewrite(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct btree *b,
|
|
+ unsigned target,
|
|
unsigned flags)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
@@ -2141,7 +2205,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
|
|
|
|
struct btree_path *path = btree_iter_path(trans, iter);
|
|
parent = btree_node_parent(path, b);
|
|
- as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
|
|
+ as = bch2_btree_update_start(trans, path, b->c.level,
|
|
+ false, target, flags);
|
|
ret = PTR_ERR_OR_ZERO(as);
|
|
if (ret)
|
|
goto out;
|
|
@@ -2191,67 +2256,83 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
|
|
goto out;
|
|
}
|
|
|
|
-struct async_btree_rewrite {
|
|
- struct bch_fs *c;
|
|
- struct work_struct work;
|
|
- struct list_head list;
|
|
- enum btree_id btree_id;
|
|
- unsigned level;
|
|
- struct bkey_buf key;
|
|
-};
|
|
-
|
|
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
|
|
- struct async_btree_rewrite *a)
|
|
+static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
|
|
+ enum btree_id btree, unsigned level,
|
|
+ struct bkey_i *k, unsigned flags)
|
|
{
|
|
struct btree_iter iter;
|
|
bch2_trans_node_iter_init(trans, &iter,
|
|
- a->btree_id, a->key.k->k.p,
|
|
- BTREE_MAX_DEPTH, a->level, 0);
|
|
- struct btree *b = bch2_btree_iter_peek_node(&iter);
|
|
+ btree, k->k.p,
|
|
+ BTREE_MAX_DEPTH, level, 0);
|
|
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
|
|
int ret = PTR_ERR_OR_ZERO(b);
|
|
if (ret)
|
|
goto out;
|
|
|
|
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
|
|
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
|
|
ret = found
|
|
- ? bch2_btree_node_rewrite(trans, &iter, b, 0)
|
|
+ ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
|
|
: -ENOENT;
|
|
+out:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
|
|
-#if 0
|
|
- /* Tracepoint... */
|
|
- if (!ret || ret == -ENOENT) {
|
|
- struct bch_fs *c = trans->c;
|
|
- struct printbuf buf = PRINTBUF;
|
|
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
|
|
+ enum btree_id btree, unsigned level,
|
|
+ struct bpos pos,
|
|
+ unsigned target,
|
|
+ unsigned flags)
|
|
+{
|
|
+ BUG_ON(!level);
|
|
|
|
- if (!ret) {
|
|
- prt_printf(&buf, "rewrite node:\n ");
|
|
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
|
|
- } else {
|
|
- prt_printf(&buf, "node to rewrite not found:\n want: ");
|
|
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
|
|
- prt_printf(&buf, "\n got: ");
|
|
- if (b)
|
|
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
- else
|
|
- prt_str(&buf, "(null)");
|
|
- }
|
|
- bch_info(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
-#endif
|
|
-out:
|
|
+ /* Traverse one depth lower to get a pointer to the node itself: */
|
|
+ struct btree_iter iter;
|
|
+ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
|
|
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
|
|
+ int ret = PTR_ERR_OR_ZERO(b);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
|
|
+err:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
|
|
+ struct btree *b, unsigned flags)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ int ret = get_iter_to_node(trans, &iter, b);
|
|
+ if (ret)
|
|
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
|
|
+
|
|
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct async_btree_rewrite {
|
|
+ struct bch_fs *c;
|
|
+ struct work_struct work;
|
|
+ struct list_head list;
|
|
+ enum btree_id btree_id;
|
|
+ unsigned level;
|
|
+ struct bkey_buf key;
|
|
+};
|
|
+
|
|
static void async_btree_node_rewrite_work(struct work_struct *work)
|
|
{
|
|
struct async_btree_rewrite *a =
|
|
container_of(work, struct async_btree_rewrite, work);
|
|
struct bch_fs *c = a->c;
|
|
|
|
- int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
|
|
- if (ret != -ENOENT)
|
|
+ int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
|
|
+ a->btree_id, a->level, a->key.k, 0));
|
|
+ if (ret != -ENOENT &&
|
|
+ !bch2_err_matches(ret, EROFS) &&
|
|
+ ret != -BCH_ERR_journal_shutdown)
|
|
bch_err_fn_ratelimited(c, ret);
|
|
|
|
spin_lock(&c->btree_node_rewrites_lock);
|
|
@@ -2261,7 +2342,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
|
|
closure_wake_up(&c->btree_node_rewrites_wait);
|
|
|
|
bch2_bkey_buf_exit(&a->key, c);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
|
|
kfree(a);
|
|
}
|
|
|
|
@@ -2282,8 +2363,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
|
|
bool now = false, pending = false;
|
|
|
|
spin_lock(&c->btree_node_rewrites_lock);
|
|
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
|
|
- bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
|
|
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
|
|
+ enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
|
|
list_add(&a->list, &c->btree_node_rewrites);
|
|
now = true;
|
|
} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
|
|
@@ -2322,7 +2403,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
|
|
if (!a)
|
|
break;
|
|
|
|
- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
|
|
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
|
|
queue_work(c->btree_node_rewrite_worker, &a->work);
|
|
}
|
|
}
|
|
@@ -2352,7 +2433,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
|
|
bool skip_triggers)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter2 = { NULL };
|
|
+ struct btree_iter iter2 = {};
|
|
struct btree *parent;
|
|
int ret;
|
|
|
|
@@ -2376,7 +2457,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
|
|
|
|
parent = btree_node_parent(btree_iter_path(trans, iter), b);
|
|
if (parent) {
|
|
- bch2_trans_copy_iter(&iter2, iter);
|
|
+ bch2_trans_copy_iter(trans, &iter2, iter);
|
|
|
|
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
|
|
iter2.flags & BTREE_ITER_intent,
|
|
@@ -2390,7 +2471,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
|
|
|
|
trans->paths_sorted = false;
|
|
|
|
- ret = bch2_btree_iter_traverse(&iter2) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
|
|
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
|
|
if (ret)
|
|
goto err;
|
|
@@ -2494,30 +2575,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
|
|
unsigned commit_flags, bool skip_triggers)
|
|
{
|
|
struct btree_iter iter;
|
|
- int ret;
|
|
-
|
|
- bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
|
|
- BTREE_MAX_DEPTH, b->c.level,
|
|
- BTREE_ITER_intent);
|
|
- ret = bch2_btree_iter_traverse(&iter);
|
|
+ int ret = get_iter_to_node(trans, &iter, b);
|
|
if (ret)
|
|
- goto out;
|
|
-
|
|
- /* has node been freed? */
|
|
- if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
|
|
- /* node has been freed: */
|
|
- BUG_ON(!btree_node_dying(b));
|
|
- goto out;
|
|
- }
|
|
-
|
|
- BUG_ON(!btree_node_hashed(b));
|
|
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
|
|
|
|
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
|
|
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
|
|
|
|
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
|
|
commit_flags, skip_triggers);
|
|
-out:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
|
|
index 26d646e1275c..7fe793788a79 100644
|
|
--- a/fs/bcachefs/btree_update_interior.h
|
|
+++ b/fs/bcachefs/btree_update_interior.h
|
|
@@ -144,7 +144,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
|
|
|
|
EBUG_ON(!btree_node_locked(path, level));
|
|
|
|
- if (bch2_btree_node_merging_disabled)
|
|
+ if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
|
|
return 0;
|
|
|
|
b = path->l[level].b;
|
|
@@ -168,8 +168,15 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
|
|
}
|
|
|
|
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
|
|
- struct btree *, unsigned);
|
|
+ struct btree *, unsigned, unsigned);
|
|
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
|
|
+ enum btree_id, unsigned,
|
|
+ struct bpos, unsigned, unsigned);
|
|
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
|
|
+ struct btree *, unsigned);
|
|
+
|
|
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
|
|
+
|
|
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
|
|
struct btree *, struct bkey_i *,
|
|
unsigned, bool);
|
|
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
|
|
index 2c09d19dd621..efb0c64d0aac 100644
|
|
--- a/fs/bcachefs/btree_write_buffer.c
|
|
+++ b/fs/bcachefs/btree_write_buffer.c
|
|
@@ -7,6 +7,7 @@
|
|
#include "btree_update_interior.h"
|
|
#include "btree_write_buffer.h"
|
|
#include "disk_accounting.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
#include "journal.h"
|
|
@@ -144,7 +145,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
|
|
EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
|
|
EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
|
|
|
|
- ret = bch2_btree_iter_traverse(iter);
|
|
+ ret = bch2_btree_iter_traverse(trans, iter);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
|
|
return wb_flush_one_slowpath(trans, iter, wb);
|
|
}
|
|
|
|
+ EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
|
|
+
|
|
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
|
|
(*fast)++;
|
|
return 0;
|
|
@@ -208,7 +211,7 @@ btree_write_buffered_insert(struct btree_trans *trans,
|
|
|
|
trans->journal_res.seq = wb->journal_seq;
|
|
|
|
- ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_trans_update(trans, &iter, &wb->k,
|
|
BTREE_UPDATE_internal_snapshot_node);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
@@ -285,7 +288,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
|
|
struct bch_fs *c = trans->c;
|
|
struct journal *j = &c->journal;
|
|
struct btree_write_buffer *wb = &c->btree_write_buffer;
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
|
|
bool write_locked = false;
|
|
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
|
|
@@ -368,7 +371,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
|
|
write_locked = false;
|
|
|
|
ret = lockrestart_do(trans,
|
|
- bch2_btree_iter_traverse(&iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_foreground_maybe_merge(trans, iter.path, 0,
|
|
BCH_WATERMARK_reclaim|
|
|
BCH_TRANS_COMMIT_journal_reclaim|
|
|
@@ -385,7 +388,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
|
|
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
|
|
}
|
|
|
|
- bch2_btree_iter_set_pos(&iter, k->k.k.p);
|
|
+ bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
|
|
btree_iter_path(trans, &iter)->preserve = false;
|
|
|
|
bool accounting_accumulated = false;
|
|
@@ -428,10 +431,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
|
|
*/
|
|
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
|
|
|
|
- sort(wb->flushing.keys.data,
|
|
- wb->flushing.keys.nr,
|
|
- sizeof(wb->flushing.keys.data[0]),
|
|
- wb_key_seq_cmp, NULL);
|
|
+ sort_nonatomic(wb->flushing.keys.data,
|
|
+ wb->flushing.keys.nr,
|
|
+ sizeof(wb->flushing.keys.data[0]),
|
|
+ wb_key_seq_cmp, NULL);
|
|
|
|
darray_for_each(wb->flushing.keys, i) {
|
|
if (!i->journal_seq)
|
|
@@ -629,11 +632,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
|
|
return -BCH_ERR_erofs_no_writes;
|
|
|
|
int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
|
|
return ret;
|
|
}
|
|
|
|
@@ -692,7 +695,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
|
|
} while (!ret && bch2_btree_write_buffer_should_flush(c));
|
|
mutex_unlock(&wb->flushing.lock);
|
|
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
|
|
}
|
|
|
|
static void wb_accounting_sort(struct btree_write_buffer *wb)
|
|
@@ -821,9 +824,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_
|
|
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
|
|
|
|
if (bch2_btree_write_buffer_should_flush(c) &&
|
|
- __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
|
|
+ __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
|
|
!queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
|
|
|
|
if (dst->wb == &wb->flushing)
|
|
mutex_unlock(&wb->flushing.lock);
|
|
@@ -866,13 +869,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
|
|
darray_exit(&wb->inc.keys);
|
|
}
|
|
|
|
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
|
|
+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c)
|
|
{
|
|
struct btree_write_buffer *wb = &c->btree_write_buffer;
|
|
|
|
mutex_init(&wb->inc.lock);
|
|
mutex_init(&wb->flushing.lock);
|
|
INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
|
|
+}
|
|
+
|
|
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
|
|
|
|
/* Will be resized by journal as needed: */
|
|
unsigned initial_size = 1 << 16;
|
|
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
|
|
index d535cea28bde..05f56fd1eed0 100644
|
|
--- a/fs/bcachefs/btree_write_buffer.h
|
|
+++ b/fs/bcachefs/btree_write_buffer.h
|
|
@@ -101,6 +101,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t
|
|
|
|
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
|
|
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
|
|
+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *);
|
|
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
|
|
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
|
|
index e9e76e20f43b..d39d163c6ea9 100644
|
|
--- a/fs/bcachefs/btree_write_buffer_types.h
|
|
+++ b/fs/bcachefs/btree_write_buffer_types.h
|
|
@@ -2,7 +2,7 @@
|
|
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
|
|
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
|
|
|
|
-#include "darray.h"
|
|
+#include <linux/darray_types.h>
|
|
#include "journal_types.h"
|
|
|
|
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
|
|
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
|
|
index 345b117a4a4a..8bb6384190c5 100644
|
|
--- a/fs/bcachefs/buckets.c
|
|
+++ b/fs/bcachefs/buckets.c
|
|
@@ -29,9 +29,16 @@
|
|
#include <linux/preempt.h>
|
|
|
|
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
|
|
+{
|
|
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
|
|
+ usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
|
|
+}
|
|
+
|
|
+void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
|
|
{
|
|
memset(usage, 0, sizeof(*usage));
|
|
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
|
|
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
|
|
+ sizeof(struct bch_dev_usage_full) / sizeof(u64));
|
|
}
|
|
|
|
static u64 reserve_factor(u64 r)
|
|
@@ -75,7 +82,7 @@ bch2_fs_usage_read_short(struct bch_fs *c)
|
|
|
|
void bch2_dev_usage_to_text(struct printbuf *out,
|
|
struct bch_dev *ca,
|
|
- struct bch_dev_usage *usage)
|
|
+ struct bch_dev_usage_full *usage)
|
|
{
|
|
if (out->nr_tabstops < 5) {
|
|
printbuf_tabstops_reset(out);
|
|
@@ -365,7 +372,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
|
|
struct btree_iter iter;
|
|
bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
|
|
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
|
|
- ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_trans_update(trans, &iter, new,
|
|
BTREE_UPDATE_internal_snapshot_node|
|
|
BTREE_TRIGGER_norun);
|
|
@@ -381,6 +388,31 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
+static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf,
|
|
+ struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+
|
|
+ prt_printf(buf, "\nwhile marking ");
|
|
+ bch2_bkey_val_to_text(buf, c, k);
|
|
+ prt_newline(buf);
|
|
+
|
|
+ bool print = __bch2_count_fsck_err(c, id, buf);
|
|
+
|
|
+ int ret = bch2_run_explicit_recovery_pass(c, buf,
|
|
+ BCH_RECOVERY_PASS_check_allocations, 0);
|
|
+
|
|
+ if (insert) {
|
|
+ bch2_trans_updates_to_text(buf, trans);
|
|
+ __bch2_inconsistent_error(c, buf);
|
|
+ ret = -BCH_ERR_bucket_ref_update;
|
|
+ }
|
|
+
|
|
+ if (print || insert)
|
|
+ bch2_print_str(c, KERN_ERR, buf->buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
|
|
struct bkey_s_c k,
|
|
const struct bch_extent_ptr *ptr,
|
|
@@ -396,32 +428,29 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
|
|
|
|
BUG_ON(!sectors);
|
|
|
|
- if (gen_after(ptr->gen, b_gen)) {
|
|
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
|
|
- log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
|
|
- "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
|
|
- "while marking %s",
|
|
+ if (unlikely(gen_after(ptr->gen, b_gen))) {
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf,
|
|
+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen",
|
|
ptr->dev, bucket_nr, b_gen,
|
|
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
|
|
- ptr->gen,
|
|
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
|
|
- if (inserting)
|
|
- goto err;
|
|
+ ptr->gen);
|
|
+
|
|
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
|
|
+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen);
|
|
goto out;
|
|
}
|
|
|
|
- if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
|
|
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
|
|
- log_fsck_err(trans, ptr_too_stale,
|
|
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
|
|
- "while marking %s",
|
|
+ if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) {
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf,
|
|
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale",
|
|
ptr->dev, bucket_nr, b_gen,
|
|
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
|
|
- ptr->gen,
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
|
|
- if (inserting)
|
|
- goto err;
|
|
+ ptr->gen);
|
|
+
|
|
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
|
|
+ BCH_FSCK_ERR_ptr_too_stale);
|
|
goto out;
|
|
}
|
|
|
|
@@ -430,62 +459,50 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
|
|
goto out;
|
|
}
|
|
|
|
- if (b_gen != ptr->gen) {
|
|
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
|
|
- log_fsck_err(trans, stale_dirty_ptr,
|
|
- "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
|
|
- "while marking %s",
|
|
+ if (unlikely(b_gen != ptr->gen)) {
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf,
|
|
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)",
|
|
ptr->dev, bucket_nr, b_gen,
|
|
bucket_gen_get(ca, bucket_nr),
|
|
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
|
|
- ptr->gen,
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
|
|
- if (inserting)
|
|
- goto err;
|
|
+ ptr->gen);
|
|
+
|
|
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
|
|
+ BCH_FSCK_ERR_stale_dirty_ptr);
|
|
goto out;
|
|
}
|
|
|
|
- if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
|
|
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
|
|
- log_fsck_err(trans, ptr_bucket_data_type_mismatch,
|
|
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
|
|
- "while marking %s",
|
|
- ptr->dev, bucket_nr, b_gen,
|
|
- bch2_data_type_str(bucket_data_type),
|
|
- bch2_data_type_str(ptr_data_type),
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
|
|
- if (inserting)
|
|
- goto err;
|
|
+ if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) {
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
|
|
+ ptr->dev, bucket_nr, b_gen,
|
|
+ bch2_data_type_str(bucket_data_type),
|
|
+ bch2_data_type_str(ptr_data_type));
|
|
+
|
|
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
|
|
+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch);
|
|
goto out;
|
|
}
|
|
|
|
- if ((u64) *bucket_sectors + sectors > U32_MAX) {
|
|
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
|
|
- log_fsck_err(trans, bucket_sector_count_overflow,
|
|
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
|
|
- "while marking %s",
|
|
+ if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) {
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf,
|
|
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX",
|
|
ptr->dev, bucket_nr, b_gen,
|
|
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
|
|
- *bucket_sectors, sectors,
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
|
|
- if (inserting)
|
|
- goto err;
|
|
+ *bucket_sectors, sectors);
|
|
+
|
|
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
|
|
+ BCH_FSCK_ERR_bucket_sector_count_overflow);
|
|
sectors = -*bucket_sectors;
|
|
+ goto out;
|
|
}
|
|
|
|
*bucket_sectors += sectors;
|
|
out:
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
-err:
|
|
-fsck_err:
|
|
- bch2_dump_trans_updates(trans);
|
|
- bch2_inconsistent_error(c);
|
|
- ret = -BCH_ERR_bucket_ref_update;
|
|
- goto out;
|
|
}
|
|
|
|
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
|
|
@@ -582,6 +599,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
|
|
}
|
|
|
|
struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
|
|
+ if (!bucket_valid(ca, bucket.offset)) {
|
|
+ if (insert) {
|
|
+ bch2_dev_bucket_missing(ca, bucket.offset);
|
|
+ ret = -BCH_ERR_trigger_pointer;
|
|
+ }
|
|
+ goto err;
|
|
+ }
|
|
|
|
if (flags & BTREE_TRIGGER_transactional) {
|
|
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
|
|
@@ -590,11 +614,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (!p.ptr.cached) {
|
|
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
|
|
+ if (ret)
|
|
+ goto err;
|
|
}
|
|
|
|
if (flags & BTREE_TRIGGER_gc) {
|
|
@@ -653,9 +675,9 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
|
|
stripe_blockcount_get(&s->v, p.ec.block) +
|
|
sectors);
|
|
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- };
|
|
+ struct disk_accounting_pos acc;
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
|
|
acc.replicas.data_type = data_type;
|
|
ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false);
|
|
@@ -674,26 +696,28 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
|
|
return -BCH_ERR_ENOMEM_mark_stripe_ptr;
|
|
}
|
|
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
+ gc_stripe_lock(m);
|
|
|
|
if (!m || !m->alive) {
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
+ gc_stripe_unlock(m);
|
|
struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ",
|
|
+ (u64) p.ec.idx);
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
|
|
- (u64) p.ec.idx, buf.buf);
|
|
+ __bch2_inconsistent_error(c, &buf);
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
printbuf_exit(&buf);
|
|
- bch2_inconsistent_error(c);
|
|
return -BCH_ERR_trigger_stripe_pointer;
|
|
}
|
|
|
|
m->block_sectors[p.ec.block] += sectors;
|
|
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- };
|
|
- memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
+ struct disk_accounting_pos acc;
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
+ unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
|
|
+ gc_stripe_unlock(m);
|
|
|
|
acc.replicas.data_type = data_type;
|
|
int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true);
|
|
@@ -719,16 +743,14 @@ static int __trigger_extent(struct btree_trans *trans,
|
|
: BCH_DATA_user;
|
|
int ret = 0;
|
|
|
|
- struct disk_accounting_pos acc_replicas_key = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- .replicas.data_type = data_type,
|
|
- .replicas.nr_devs = 0,
|
|
- .replicas.nr_required = 1,
|
|
- };
|
|
+ struct disk_accounting_pos acc_replicas_key;
|
|
+ memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
|
|
+ acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
|
|
+ acc_replicas_key.replicas.data_type = data_type;
|
|
+ acc_replicas_key.replicas.nr_devs = 0;
|
|
+ acc_replicas_key.replicas.nr_required = 1;
|
|
|
|
- struct disk_accounting_pos acct_compression_key = {
|
|
- .type = BCH_DISK_ACCOUNTING_compression,
|
|
- };
|
|
+ unsigned cur_compression_type = 0;
|
|
u64 compression_acct[3] = { 1, 0, 0 };
|
|
|
|
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
@@ -762,13 +784,13 @@ static int __trigger_extent(struct btree_trans *trans,
|
|
acc_replicas_key.replicas.nr_required = 0;
|
|
}
|
|
|
|
- if (acct_compression_key.compression.type &&
|
|
- acct_compression_key.compression.type != p.crc.compression_type) {
|
|
+ if (cur_compression_type &&
|
|
+ cur_compression_type != p.crc.compression_type) {
|
|
if (flags & BTREE_TRIGGER_overwrite)
|
|
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
|
|
|
|
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
|
|
- ARRAY_SIZE(compression_acct), gc);
|
|
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
|
|
+ compression, cur_compression_type);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -777,7 +799,7 @@ static int __trigger_extent(struct btree_trans *trans,
|
|
compression_acct[2] = 0;
|
|
}
|
|
|
|
- acct_compression_key.compression.type = p.crc.compression_type;
|
|
+ cur_compression_type = p.crc.compression_type;
|
|
if (p.crc.compression_type) {
|
|
compression_acct[1] += p.crc.uncompressed_size;
|
|
compression_acct[2] += p.crc.compressed_size;
|
|
@@ -791,45 +813,34 @@ static int __trigger_extent(struct btree_trans *trans,
|
|
}
|
|
|
|
if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
|
|
- struct disk_accounting_pos acc_snapshot_key = {
|
|
- .type = BCH_DISK_ACCOUNTING_snapshot,
|
|
- .snapshot.id = k.k->p.snapshot,
|
|
- };
|
|
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
|
|
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
- if (acct_compression_key.compression.type) {
|
|
+ if (cur_compression_type) {
|
|
if (flags & BTREE_TRIGGER_overwrite)
|
|
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
|
|
|
|
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
|
|
- ARRAY_SIZE(compression_acct), gc);
|
|
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
|
|
+ compression, cur_compression_type);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
if (level) {
|
|
- struct disk_accounting_pos acc_btree_key = {
|
|
- .type = BCH_DISK_ACCOUNTING_btree,
|
|
- .btree.id = btree_id,
|
|
- };
|
|
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
|
|
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
|
|
if (ret)
|
|
return ret;
|
|
} else {
|
|
bool insert = !(flags & BTREE_TRIGGER_overwrite);
|
|
- struct disk_accounting_pos acc_inum_key = {
|
|
- .type = BCH_DISK_ACCOUNTING_inum,
|
|
- .inum.inum = k.k->p.inode,
|
|
- };
|
|
+
|
|
s64 v[3] = {
|
|
insert ? 1 : -1,
|
|
insert ? k.k->size : -((s64) k.k->size),
|
|
*replicas_sectors,
|
|
};
|
|
- ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
|
|
+ ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -878,15 +889,15 @@ int bch2_trigger_extent(struct btree_trans *trans,
|
|
}
|
|
|
|
int need_rebalance_delta = 0;
|
|
- s64 need_rebalance_sectors_delta = 0;
|
|
+ s64 need_rebalance_sectors_delta[1] = { 0 };
|
|
|
|
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
|
|
need_rebalance_delta -= s != 0;
|
|
- need_rebalance_sectors_delta -= s;
|
|
+ need_rebalance_sectors_delta[0] -= s;
|
|
|
|
s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
|
|
need_rebalance_delta += s != 0;
|
|
- need_rebalance_sectors_delta += s;
|
|
+ need_rebalance_sectors_delta[0] += s;
|
|
|
|
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
|
|
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
|
|
@@ -895,12 +906,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
- if (need_rebalance_sectors_delta) {
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
|
|
- };
|
|
- int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
|
|
- flags & BTREE_TRIGGER_gc);
|
|
+ if (need_rebalance_sectors_delta[0]) {
|
|
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
|
|
+ need_rebalance_sectors_delta, rebalance_work);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -916,17 +924,13 @@ static int __trigger_reservation(struct btree_trans *trans,
|
|
enum btree_iter_update_trigger_flags flags)
|
|
{
|
|
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
|
|
- s64 sectors = k.k->size;
|
|
+ s64 sectors[1] = { k.k->size };
|
|
|
|
if (flags & BTREE_TRIGGER_overwrite)
|
|
- sectors = -sectors;
|
|
-
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
|
|
- .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
|
|
- };
|
|
+ sectors[0] = -sectors[0];
|
|
|
|
- return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc);
|
|
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
|
|
+ persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
|
|
}
|
|
|
|
return 0;
|
|
@@ -957,14 +961,23 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
|
|
return PTR_ERR(a);
|
|
|
|
if (a->v.data_type && type && a->v.data_type != type) {
|
|
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
|
|
- log_fsck_err(trans, bucket_metadata_type_mismatch,
|
|
- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
|
|
- "while marking %s",
|
|
- iter.pos.inode, iter.pos.offset, a->v.gen,
|
|
- bch2_data_type_str(a->v.data_type),
|
|
- bch2_data_type_str(type),
|
|
- bch2_data_type_str(type));
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
|
|
+ "while marking %s\n",
|
|
+ iter.pos.inode, iter.pos.offset, a->v.gen,
|
|
+ bch2_data_type_str(a->v.data_type),
|
|
+ bch2_data_type_str(type),
|
|
+ bch2_data_type_str(type));
|
|
+
|
|
+ bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
|
|
+
|
|
+ bch2_run_explicit_recovery_pass(c, &buf,
|
|
+ BCH_RECOVERY_PASS_check_allocations, 0);
|
|
+
|
|
+ if (print)
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
ret = -BCH_ERR_metadata_bucket_inconsistency;
|
|
goto err;
|
|
}
|
|
@@ -976,7 +989,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
|
|
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
|
|
}
|
|
err:
|
|
-fsck_err:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
@@ -1134,10 +1146,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
|
|
int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
|
|
enum btree_iter_update_trigger_flags flags)
|
|
{
|
|
- for_each_online_member(c, ca) {
|
|
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) {
|
|
int ret = bch2_trans_mark_dev_sb(c, ca, flags);
|
|
if (ret) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs);
|
|
return ret;
|
|
}
|
|
}
|
|
@@ -1305,15 +1317,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
|
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
|
|
|
|
if (resize) {
|
|
- bucket_gens->nbuckets = min(bucket_gens->nbuckets,
|
|
- old_bucket_gens->nbuckets);
|
|
- bucket_gens->nbuckets_minus_first =
|
|
- bucket_gens->nbuckets - bucket_gens->first_bucket;
|
|
+ u64 copy = min(bucket_gens->nbuckets,
|
|
+ old_bucket_gens->nbuckets);
|
|
memcpy(bucket_gens->b,
|
|
old_bucket_gens->b,
|
|
- bucket_gens->nbuckets);
|
|
+ sizeof(bucket_gens->b[0]) * copy);
|
|
}
|
|
|
|
+ ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch,
|
|
+ ca->mi.nbuckets, nbuckets) ?:
|
|
+ bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty,
|
|
+ ca->mi.nbuckets, nbuckets);
|
|
+
|
|
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
|
|
bucket_gens = old_bucket_gens;
|
|
|
|
@@ -1336,7 +1351,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
|
|
|
|
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
|
|
{
|
|
- ca->usage = alloc_percpu(struct bch_dev_usage);
|
|
+ ca->usage = alloc_percpu(struct bch_dev_usage_full);
|
|
if (!ca->usage)
|
|
return -BCH_ERR_ENOMEM_usage_init;
|
|
|
|
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
|
|
index a9acdd6c0c86..af1532de4a37 100644
|
|
--- a/fs/bcachefs/buckets.h
|
|
+++ b/fs/bcachefs/buckets.h
|
|
@@ -39,38 +39,12 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t
|
|
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
|
|
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
|
|
|
|
-/*
|
|
- * Ugly hack alert:
|
|
- *
|
|
- * We need to cram a spinlock in a single byte, because that's what we have left
|
|
- * in struct bucket, and we care about the size of these - during fsck, we need
|
|
- * in memory state for every single bucket on every device.
|
|
- *
|
|
- * We used to do
|
|
- * while (xchg(&b->lock, 1) cpu_relax();
|
|
- * but, it turns out not all architectures support xchg on a single byte.
|
|
- *
|
|
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
|
|
- * ulong for this - we just need to make sure the lock bit always ends up in the
|
|
- * first byte.
|
|
- */
|
|
-
|
|
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
-#define BUCKET_LOCK_BITNR 0
|
|
-#else
|
|
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
|
|
-#endif
|
|
-
|
|
-union ulong_byte_assert {
|
|
- ulong ulong;
|
|
- u8 byte;
|
|
-};
|
|
-
|
|
static inline void bucket_unlock(struct bucket *b)
|
|
{
|
|
BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
|
|
|
|
clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
|
|
+ smp_mb__after_atomic();
|
|
wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
|
|
}
|
|
|
|
@@ -167,9 +141,7 @@ static inline int gen_cmp(u8 a, u8 b)
|
|
|
|
static inline int gen_after(u8 a, u8 b)
|
|
{
|
|
- int r = gen_cmp(a, b);
|
|
-
|
|
- return r > 0 ? r : 0;
|
|
+ return max(0, gen_cmp(a, b));
|
|
}
|
|
|
|
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
|
|
@@ -201,7 +173,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
|
|
return ret;
|
|
}
|
|
|
|
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
|
|
+void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
|
|
+static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_dev_usage_full ret;
|
|
+
|
|
+ bch2_dev_usage_full_read_fast(ca, &ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
|
|
|
|
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
|
|
{
|
|
@@ -236,7 +217,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca,
|
|
enum bch_watermark watermark)
|
|
{
|
|
return max_t(s64, 0,
|
|
- usage.d[BCH_DATA_free].buckets -
|
|
+ usage.buckets[BCH_DATA_free]-
|
|
ca->nr_open_buckets -
|
|
bch2_dev_buckets_reserved(ca, watermark));
|
|
}
|
|
@@ -246,10 +227,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
|
|
enum bch_watermark watermark)
|
|
{
|
|
return max_t(s64, 0,
|
|
- usage.d[BCH_DATA_free].buckets
|
|
- + usage.d[BCH_DATA_cached].buckets
|
|
- + usage.d[BCH_DATA_need_gc_gens].buckets
|
|
- + usage.d[BCH_DATA_need_discard].buckets
|
|
+ usage.buckets[BCH_DATA_free]
|
|
+ + usage.buckets[BCH_DATA_cached]
|
|
+ + usage.buckets[BCH_DATA_need_gc_gens]
|
|
+ + usage.buckets[BCH_DATA_need_discard]
|
|
- ca->nr_open_buckets
|
|
- bch2_dev_buckets_reserved(ca, watermark));
|
|
}
|
|
@@ -262,11 +243,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
|
|
|
|
/* Filesystem usage: */
|
|
|
|
-static inline unsigned dev_usage_u64s(void)
|
|
-{
|
|
- return sizeof(struct bch_dev_usage) / sizeof(u64);
|
|
-}
|
|
-
|
|
struct bch_fs_usage_short
|
|
bch2_fs_usage_read_short(struct bch_fs *);
|
|
|
|
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
|
|
index 7174047b8e92..0aed2500ade3 100644
|
|
--- a/fs/bcachefs/buckets_types.h
|
|
+++ b/fs/bcachefs/buckets_types.h
|
|
@@ -7,6 +7,33 @@
|
|
|
|
#define BUCKET_JOURNAL_SEQ_BITS 16
|
|
|
|
+/*
|
|
+ * Ugly hack alert:
|
|
+ *
|
|
+ * We need to cram a spinlock in a single byte, because that's what we have left
|
|
+ * in struct bucket, and we care about the size of these - during fsck, we need
|
|
+ * in memory state for every single bucket on every device.
|
|
+ *
|
|
+ * We used to do
|
|
+ * while (xchg(&b->lock, 1)) cpu_relax();
|
|
+ * but, it turns out not all architectures support xchg on a single byte.
|
|
+ *
|
|
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
|
|
+ * ulong for this - we just need to make sure the lock bit always ends up in the
|
|
+ * first byte.
|
|
+ */
|
|
+
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+#define BUCKET_LOCK_BITNR 0
|
|
+#else
|
|
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
|
|
+#endif
|
|
+
|
|
+union ulong_byte_assert {
|
|
+ ulong ulong;
|
|
+ u8 byte;
|
|
+};
|
|
+
|
|
struct bucket {
|
|
u8 lock;
|
|
u8 gen_valid:1;
|
|
@@ -27,7 +54,12 @@ struct bucket_gens {
|
|
u8 b[] __counted_by(nbuckets);
|
|
};
|
|
|
|
+/* Only info on bucket counts: */
|
|
struct bch_dev_usage {
|
|
+ u64 buckets[BCH_DATA_NR];
|
|
+};
|
|
+
|
|
+struct bch_dev_usage_full {
|
|
struct bch_dev_usage_type {
|
|
u64 buckets;
|
|
u64 sectors; /* _compressed_ sectors: */
|
|
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
|
|
index 46e9e32105a9..4066946b26bc 100644
|
|
--- a/fs/bcachefs/chardev.c
|
|
+++ b/fs/bcachefs/chardev.c
|
|
@@ -11,6 +11,7 @@
|
|
#include "move.h"
|
|
#include "recovery_passes.h"
|
|
#include "replicas.h"
|
|
+#include "sb-counters.h"
|
|
#include "super-io.h"
|
|
#include "thread_with_file.h"
|
|
|
|
@@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg)
|
|
struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
|
|
|
|
ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
|
|
- ctx->stats.data_type = U8_MAX;
|
|
+ if (ctx->thr.ret == -BCH_ERR_device_offline)
|
|
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
|
|
+ else {
|
|
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
|
|
+ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
|
|
+ }
|
|
return 0;
|
|
}
|
|
|
|
@@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
|
|
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
|
|
struct bch_fs *c = ctx->c;
|
|
struct bch_ioctl_data_event e = {
|
|
- .type = BCH_DATA_EVENT_PROGRESS,
|
|
- .p.data_type = ctx->stats.data_type,
|
|
- .p.btree_id = ctx->stats.pos.btree,
|
|
- .p.pos = ctx->stats.pos.pos,
|
|
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
|
- .p.sectors_total = bch2_fs_usage_read_short(c).used,
|
|
+ .type = BCH_DATA_EVENT_PROGRESS,
|
|
+ .ret = ctx->stats.ret,
|
|
+ .p.data_type = ctx->stats.data_type,
|
|
+ .p.btree_id = ctx->stats.pos.btree,
|
|
+ .p.pos = ctx->stats.pos.pos,
|
|
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
|
+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
|
|
+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
|
|
};
|
|
|
|
+ if (ctx->arg.op == BCH_DATA_OP_scrub) {
|
|
+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
|
|
+ if (ca) {
|
|
+ struct bch_dev_usage_full u;
|
|
+ bch2_dev_usage_full_read_fast(ca, &u);
|
|
+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
|
|
+ if (ctx->arg.scrub.data_types & BIT(i))
|
|
+ e.p.sectors_total += u.d[i].sectors;
|
|
+ bch2_dev_put(ca);
|
|
+ }
|
|
+ } else {
|
|
+ e.p.sectors_total = bch2_fs_usage_read_short(c).used;
|
|
+ }
|
|
+
|
|
if (len < sizeof(e))
|
|
return -EINVAL;
|
|
|
|
@@ -404,10 +426,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
|
|
arg.replica_entries_bytes = replicas.nr;
|
|
|
|
for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
|
|
- struct disk_accounting_pos k = {
|
|
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
|
|
- .persistent_reserved.nr_replicas = i,
|
|
- };
|
|
+ struct disk_accounting_pos k;
|
|
+ disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i);
|
|
|
|
bch2_accounting_mem_read(c,
|
|
disk_accounting_pos_to_bpos(&k),
|
|
@@ -453,7 +473,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
|
|
struct bch_ioctl_dev_usage __user *user_arg)
|
|
{
|
|
struct bch_ioctl_dev_usage arg;
|
|
- struct bch_dev_usage src;
|
|
+ struct bch_dev_usage_full src;
|
|
struct bch_dev *ca;
|
|
unsigned i;
|
|
|
|
@@ -473,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
|
|
if (IS_ERR(ca))
|
|
return PTR_ERR(ca);
|
|
|
|
- src = bch2_dev_usage_read(ca);
|
|
+ src = bch2_dev_usage_full_read(ca);
|
|
|
|
arg.state = ca->mi.state;
|
|
arg.bucket_size = ca->mi.bucket_size;
|
|
@@ -494,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
|
|
struct bch_ioctl_dev_usage_v2 __user *user_arg)
|
|
{
|
|
struct bch_ioctl_dev_usage_v2 arg;
|
|
- struct bch_dev_usage src;
|
|
+ struct bch_dev_usage_full src;
|
|
struct bch_dev *ca;
|
|
int ret = 0;
|
|
|
|
@@ -514,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
|
|
if (IS_ERR(ca))
|
|
return PTR_ERR(ca);
|
|
|
|
- src = bch2_dev_usage_read(ca);
|
|
+ src = bch2_dev_usage_full_read(ca);
|
|
|
|
arg.state = ca->mi.state;
|
|
arg.bucket_size = ca->mi.bucket_size;
|
|
@@ -593,11 +613,13 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
|
|
if (!dev)
|
|
return -EINVAL;
|
|
|
|
- for_each_online_member(c, ca)
|
|
+ rcu_read_lock();
|
|
+ for_each_online_member_rcu(c, ca)
|
|
if (ca->dev == dev) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ rcu_read_unlock();
|
|
return ca->dev_idx;
|
|
}
|
|
+ rcu_read_unlock();
|
|
|
|
return -BCH_ERR_ENOENT_dev_idx_not_found;
|
|
}
|
|
@@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
|
|
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
|
|
case BCH_IOCTL_QUERY_ACCOUNTING:
|
|
return bch2_ioctl_query_accounting(c, arg);
|
|
+ case BCH_IOCTL_QUERY_COUNTERS:
|
|
+ return bch2_ioctl_query_counters(c, arg);
|
|
default:
|
|
return -ENOTTY;
|
|
}
|
|
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
|
|
index 23a383577d4c..ff5ab8ada777 100644
|
|
--- a/fs/bcachefs/checksum.c
|
|
+++ b/fs/bcachefs/checksum.c
|
|
@@ -7,17 +7,12 @@
|
|
#include "super-io.h"
|
|
|
|
#include <linux/crc32c.h>
|
|
-#include <linux/crypto.h>
|
|
#include <linux/xxhash.h>
|
|
#include <linux/key.h>
|
|
#include <linux/random.h>
|
|
#include <linux/ratelimit.h>
|
|
-#include <linux/scatterlist.h>
|
|
-#include <crypto/algapi.h>
|
|
#include <crypto/chacha.h>
|
|
-#include <crypto/hash.h>
|
|
#include <crypto/poly1305.h>
|
|
-#include <crypto/skcipher.h>
|
|
#include <keys/user-type.h>
|
|
|
|
/*
|
|
@@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
|
|
}
|
|
}
|
|
|
|
-static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
|
|
- struct nonce nonce,
|
|
- struct scatterlist *sg, size_t len)
|
|
+static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS],
|
|
+ const struct bch_key *key, struct nonce nonce)
|
|
{
|
|
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
|
|
+ u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
|
|
|
|
- skcipher_request_set_sync_tfm(req, tfm);
|
|
- skcipher_request_set_callback(req, 0, NULL, NULL);
|
|
- skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
|
|
+ BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
|
|
+ memcpy(key_words, key, sizeof(key_words));
|
|
+ le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
|
|
|
|
- int ret = crypto_skcipher_encrypt(req);
|
|
- if (ret)
|
|
- pr_err("got error %i from crypto_skcipher_encrypt()", ret);
|
|
-
|
|
- return ret;
|
|
-}
|
|
-
|
|
-static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
|
|
- struct nonce nonce,
|
|
- void *buf, size_t len)
|
|
-{
|
|
- if (!is_vmalloc_addr(buf)) {
|
|
- struct scatterlist sg = {};
|
|
-
|
|
- sg_mark_end(&sg);
|
|
- sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
|
|
- return do_encrypt_sg(tfm, nonce, &sg, len);
|
|
- } else {
|
|
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
|
|
- size_t sgl_len = 0;
|
|
- int ret;
|
|
-
|
|
- darray_init(&sgl);
|
|
-
|
|
- while (len) {
|
|
- unsigned offset = offset_in_page(buf);
|
|
- struct scatterlist sg = {
|
|
- .page_link = (unsigned long) vmalloc_to_page(buf),
|
|
- .offset = offset,
|
|
- .length = min(len, PAGE_SIZE - offset),
|
|
- };
|
|
-
|
|
- if (darray_push(&sgl, sg)) {
|
|
- sg_mark_end(&darray_last(sgl));
|
|
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- nonce = nonce_add(nonce, sgl_len);
|
|
- sgl_len = 0;
|
|
- sgl.nr = 0;
|
|
- BUG_ON(darray_push(&sgl, sg));
|
|
- }
|
|
-
|
|
- buf += sg.length;
|
|
- len -= sg.length;
|
|
- sgl_len += sg.length;
|
|
- }
|
|
+ BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
|
|
+ chacha_init(state, key_words, (const u8 *)nonce.d);
|
|
|
|
- sg_mark_end(&darray_last(sgl));
|
|
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
|
|
-err:
|
|
- darray_exit(&sgl);
|
|
- return ret;
|
|
- }
|
|
+ memzero_explicit(key_words, sizeof(key_words));
|
|
}
|
|
|
|
-int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
|
|
- void *buf, size_t len)
|
|
+void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
|
|
+ void *data, size_t len)
|
|
{
|
|
- struct crypto_sync_skcipher *chacha20 =
|
|
- crypto_alloc_sync_skcipher("chacha20", 0, 0);
|
|
- int ret;
|
|
-
|
|
- ret = PTR_ERR_OR_ZERO(chacha20);
|
|
- if (ret) {
|
|
- pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
|
|
- return ret;
|
|
- }
|
|
-
|
|
- ret = crypto_skcipher_setkey(&chacha20->base,
|
|
- (void *) key, sizeof(*key));
|
|
- if (ret) {
|
|
- pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
|
|
- goto err;
|
|
- }
|
|
+ u32 state[CHACHA_STATE_WORDS];
|
|
|
|
- ret = do_encrypt(chacha20, nonce, buf, len);
|
|
-err:
|
|
- crypto_free_sync_skcipher(chacha20);
|
|
- return ret;
|
|
+ bch2_chacha20_init(state, key, nonce);
|
|
+ chacha20_crypt(state, data, data, len);
|
|
+ memzero_explicit(state, sizeof(state));
|
|
}
|
|
|
|
-static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
|
|
- struct nonce nonce)
|
|
+static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
|
|
+ struct bch_fs *c, struct nonce nonce)
|
|
{
|
|
- u8 key[POLY1305_KEY_SIZE];
|
|
- int ret;
|
|
+ u8 key[POLY1305_KEY_SIZE] = { 0 };
|
|
|
|
nonce.d[3] ^= BCH_NONCE_POLY;
|
|
|
|
- memset(key, 0, sizeof(key));
|
|
- ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
|
|
- if (ret)
|
|
- return ret;
|
|
-
|
|
- desc->tfm = c->poly1305;
|
|
- crypto_shash_init(desc);
|
|
- crypto_shash_update(desc, key, sizeof(key));
|
|
- return 0;
|
|
+ bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
|
|
+ poly1305_init(desc, key);
|
|
}
|
|
|
|
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
|
|
@@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
|
|
|
|
case BCH_CSUM_chacha20_poly1305_80:
|
|
case BCH_CSUM_chacha20_poly1305_128: {
|
|
- SHASH_DESC_ON_STACK(desc, c->poly1305);
|
|
+ struct poly1305_desc_ctx dctx;
|
|
u8 digest[POLY1305_DIGEST_SIZE];
|
|
struct bch_csum ret = { 0 };
|
|
|
|
- gen_poly_key(c, desc, nonce);
|
|
-
|
|
- crypto_shash_update(desc, data, len);
|
|
- crypto_shash_final(desc, digest);
|
|
+ bch2_poly1305_init(&dctx, c, nonce);
|
|
+ poly1305_update(&dctx, data, len);
|
|
+ poly1305_final(&dctx, digest);
|
|
|
|
memcpy(&ret, digest, bch_crc_bytes[type]);
|
|
return ret;
|
|
@@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
|
|
if (!bch2_csum_type_is_encryption(type))
|
|
return 0;
|
|
|
|
- if (bch2_fs_inconsistent_on(!c->chacha20,
|
|
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
|
|
c, "attempting to encrypt without encryption key"))
|
|
return -BCH_ERR_no_encryption_key;
|
|
|
|
- return do_encrypt(c->chacha20, nonce, data, len);
|
|
+ bch2_chacha20(&c->chacha20_key, nonce, data, len);
|
|
+ return 0;
|
|
}
|
|
|
|
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
|
@@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
|
|
|
case BCH_CSUM_chacha20_poly1305_80:
|
|
case BCH_CSUM_chacha20_poly1305_128: {
|
|
- SHASH_DESC_ON_STACK(desc, c->poly1305);
|
|
+ struct poly1305_desc_ctx dctx;
|
|
u8 digest[POLY1305_DIGEST_SIZE];
|
|
struct bch_csum ret = { 0 };
|
|
|
|
- gen_poly_key(c, desc, nonce);
|
|
+ bch2_poly1305_init(&dctx, c, nonce);
|
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
|
void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
|
|
|
|
- crypto_shash_update(desc, p, bv.bv_len);
|
|
+ poly1305_update(&dctx, p, bv.bv_len);
|
|
kunmap_local(p);
|
|
}
|
|
#else
|
|
__bio_for_each_bvec(bv, bio, *iter, *iter)
|
|
- crypto_shash_update(desc,
|
|
+ poly1305_update(&dctx,
|
|
page_address(bv.bv_page) + bv.bv_offset,
|
|
bv.bv_len);
|
|
#endif
|
|
- crypto_shash_final(desc, digest);
|
|
+ poly1305_final(&dctx, digest);
|
|
|
|
memcpy(&ret, digest, bch_crc_bytes[type]);
|
|
return ret;
|
|
@@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
|
{
|
|
struct bio_vec bv;
|
|
struct bvec_iter iter;
|
|
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
|
|
- size_t sgl_len = 0;
|
|
+ u32 chacha_state[CHACHA_STATE_WORDS];
|
|
int ret = 0;
|
|
|
|
- if (bch2_fs_inconsistent_on(!c->chacha20,
|
|
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
|
|
c, "attempting to encrypt without encryption key"))
|
|
return -BCH_ERR_no_encryption_key;
|
|
|
|
- darray_init(&sgl);
|
|
+ bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce);
|
|
|
|
bio_for_each_segment(bv, bio, iter) {
|
|
- struct scatterlist sg = {
|
|
- .page_link = (unsigned long) bv.bv_page,
|
|
- .offset = bv.bv_offset,
|
|
- .length = bv.bv_len,
|
|
- };
|
|
-
|
|
- if (darray_push(&sgl, sg)) {
|
|
- sg_mark_end(&darray_last(sgl));
|
|
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- nonce = nonce_add(nonce, sgl_len);
|
|
- sgl_len = 0;
|
|
- sgl.nr = 0;
|
|
-
|
|
- BUG_ON(darray_push(&sgl, sg));
|
|
+ void *p;
|
|
+
|
|
+ /*
|
|
+ * chacha_crypt() assumes that the length is a multiple of
|
|
+ * CHACHA_BLOCK_SIZE on any non-final call.
|
|
+ */
|
|
+ if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
|
|
+ bch_err_ratelimited(c, "bio not aligned for encryption");
|
|
+ ret = -EIO;
|
|
+ break;
|
|
}
|
|
|
|
- sgl_len += sg.length;
|
|
+ p = bvec_kmap_local(&bv);
|
|
+ chacha20_crypt(chacha_state, p, p, bv.bv_len);
|
|
+ kunmap_local(p);
|
|
}
|
|
-
|
|
- sg_mark_end(&darray_last(sgl));
|
|
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
|
|
-err:
|
|
- darray_exit(&sgl);
|
|
+ memzero_explicit(chacha_state, sizeof(chacha_state));
|
|
return ret;
|
|
}
|
|
|
|
@@ -466,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
|
prt_str(&buf, ")");
|
|
WARN_RATELIMIT(1, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
- return -EIO;
|
|
+ return -BCH_ERR_recompute_checksum;
|
|
}
|
|
|
|
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
|
@@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
|
|
}
|
|
|
|
/* decrypt real key: */
|
|
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
|
|
- &sb_key, sizeof(sb_key));
|
|
- if (ret)
|
|
- goto err;
|
|
+ bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
|
|
|
|
if (bch2_key_is_encrypted(&sb_key)) {
|
|
bch_err(c, "incorrect encryption key");
|
|
@@ -668,31 +574,14 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
|
|
return ret;
|
|
}
|
|
|
|
-static int bch2_alloc_ciphers(struct bch_fs *c)
|
|
-{
|
|
- if (c->chacha20)
|
|
- return 0;
|
|
-
|
|
- struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
|
|
- int ret = PTR_ERR_OR_ZERO(chacha20);
|
|
- if (ret) {
|
|
- bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
|
|
- return ret;
|
|
- }
|
|
-
|
|
- struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
|
|
- ret = PTR_ERR_OR_ZERO(poly1305);
|
|
- if (ret) {
|
|
- bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
|
|
- crypto_free_sync_skcipher(chacha20);
|
|
- return ret;
|
|
- }
|
|
-
|
|
- c->chacha20 = chacha20;
|
|
- c->poly1305 = poly1305;
|
|
- return 0;
|
|
-}
|
|
+#if 0
|
|
|
|
+/*
|
|
+ * This seems to be duplicating code in cmd_remove_passphrase() in
|
|
+ * bcachefs-tools, but we might want to switch userspace to use this - and
|
|
+ * perhaps add an ioctl for calling this at runtime, so we can take the
|
|
+ * passphrase off of a mounted filesystem (which has come up).
|
|
+ */
|
|
int bch2_disable_encryption(struct bch_fs *c)
|
|
{
|
|
struct bch_sb_field_crypt *crypt;
|
|
@@ -725,6 +614,10 @@ int bch2_disable_encryption(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
+/*
|
|
+ * For enabling encryption on an existing filesystem: not hooked up yet, but it
|
|
+ * should be
|
|
+ */
|
|
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
|
{
|
|
struct bch_encrypted_key key;
|
|
@@ -781,48 +674,25 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
|
memzero_explicit(&key, sizeof(key));
|
|
return ret;
|
|
}
|
|
+#endif
|
|
|
|
void bch2_fs_encryption_exit(struct bch_fs *c)
|
|
{
|
|
- if (c->poly1305)
|
|
- crypto_free_shash(c->poly1305);
|
|
- if (c->chacha20)
|
|
- crypto_free_sync_skcipher(c->chacha20);
|
|
- if (c->sha256)
|
|
- crypto_free_shash(c->sha256);
|
|
+ memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
|
|
}
|
|
|
|
int bch2_fs_encryption_init(struct bch_fs *c)
|
|
{
|
|
struct bch_sb_field_crypt *crypt;
|
|
- struct bch_key key;
|
|
- int ret = 0;
|
|
-
|
|
- c->sha256 = crypto_alloc_shash("sha256", 0, 0);
|
|
- ret = PTR_ERR_OR_ZERO(c->sha256);
|
|
- if (ret) {
|
|
- c->sha256 = NULL;
|
|
- bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
|
|
- goto out;
|
|
- }
|
|
+ int ret;
|
|
|
|
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
|
|
if (!crypt)
|
|
- goto out;
|
|
-
|
|
- ret = bch2_alloc_ciphers(c);
|
|
- if (ret)
|
|
- goto out;
|
|
-
|
|
- ret = bch2_decrypt_sb_key(c, crypt, &key);
|
|
- if (ret)
|
|
- goto out;
|
|
+ return 0;
|
|
|
|
- ret = crypto_skcipher_setkey(&c->chacha20->base,
|
|
- (void *) &key.key, sizeof(key.key));
|
|
+ ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
|
|
if (ret)
|
|
- goto out;
|
|
-out:
|
|
- memzero_explicit(&key, sizeof(key));
|
|
- return ret;
|
|
+ return ret;
|
|
+ c->chacha20_key_set = true;
|
|
+ return 0;
|
|
}
|
|
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
|
|
index 43b9d71f2f2b..7bd9cf6104ca 100644
|
|
--- a/fs/bcachefs/checksum.h
|
|
+++ b/fs/bcachefs/checksum.h
|
|
@@ -69,7 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
|
|
bch2_csum_to_text(out, type, expected);
|
|
}
|
|
|
|
-int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
|
|
+void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
|
|
+
|
|
int bch2_request_key(struct bch_sb *, struct bch_key *);
|
|
#ifndef __KERNEL__
|
|
int bch2_revoke_key(struct bch_sb *);
|
|
@@ -103,8 +104,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
|
|
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
|
struct bch_key *);
|
|
|
|
+#if 0
|
|
int bch2_disable_encryption(struct bch_fs *);
|
|
int bch2_enable_encryption(struct bch_fs *, bool);
|
|
+#endif
|
|
|
|
void bch2_fs_encryption_exit(struct bch_fs *);
|
|
int bch2_fs_encryption_init(struct bch_fs *);
|
|
@@ -154,7 +157,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
|
|
if (type >= BCH_CSUM_NR)
|
|
return false;
|
|
|
|
- if (bch2_csum_type_is_encryption(type) && !c->chacha20)
|
|
+ if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
|
|
return false;
|
|
|
|
return true;
|
|
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
|
|
index 114bf2f3879f..1bca61d17092 100644
|
|
--- a/fs/bcachefs/compress.c
|
|
+++ b/fs/bcachefs/compress.c
|
|
@@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
size_t src_len = src->bi_iter.bi_size;
|
|
size_t dst_len = crc.uncompressed_size << 9;
|
|
void *workspace;
|
|
- int ret;
|
|
+ int ret = 0, ret2;
|
|
|
|
enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
|
|
mempool_t *workspace_pool = &c->compress_workspace[opt];
|
|
@@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
else
|
|
ret = -BCH_ERR_compression_workspace_not_initialized;
|
|
if (ret)
|
|
- goto out;
|
|
+ goto err;
|
|
}
|
|
|
|
src_data = bio_map_or_bounce(c, src, READ);
|
|
@@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
switch (crc.compression_type) {
|
|
case BCH_COMPRESSION_TYPE_lz4_old:
|
|
case BCH_COMPRESSION_TYPE_lz4:
|
|
- ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
|
|
- src_len, dst_len, dst_len);
|
|
- if (ret != dst_len)
|
|
- goto err;
|
|
+ ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
|
|
+ src_len, dst_len, dst_len);
|
|
+ if (ret2 != dst_len)
|
|
+ ret = -BCH_ERR_decompress_lz4;
|
|
break;
|
|
case BCH_COMPRESSION_TYPE_gzip: {
|
|
z_stream strm = {
|
|
@@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
|
|
zlib_set_workspace(&strm, workspace);
|
|
zlib_inflateInit2(&strm, -MAX_WBITS);
|
|
- ret = zlib_inflate(&strm, Z_FINISH);
|
|
+ ret2 = zlib_inflate(&strm, Z_FINISH);
|
|
|
|
mempool_free(workspace, workspace_pool);
|
|
|
|
- if (ret != Z_STREAM_END)
|
|
- goto err;
|
|
+ if (ret2 != Z_STREAM_END)
|
|
+ ret = -BCH_ERR_decompress_gzip;
|
|
break;
|
|
}
|
|
case BCH_COMPRESSION_TYPE_zstd: {
|
|
ZSTD_DCtx *ctx;
|
|
size_t real_src_len = le32_to_cpup(src_data.b);
|
|
|
|
- if (real_src_len > src_len - 4)
|
|
+ if (real_src_len > src_len - 4) {
|
|
+ ret = -BCH_ERR_decompress_zstd_src_len_bad;
|
|
goto err;
|
|
+ }
|
|
|
|
workspace = mempool_alloc(workspace_pool, GFP_NOFS);
|
|
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
|
|
|
|
- ret = zstd_decompress_dctx(ctx,
|
|
+ ret2 = zstd_decompress_dctx(ctx,
|
|
dst_data, dst_len,
|
|
src_data.b + 4, real_src_len);
|
|
|
|
mempool_free(workspace, workspace_pool);
|
|
|
|
- if (ret != dst_len)
|
|
- goto err;
|
|
+ if (ret2 != dst_len)
|
|
+ ret = -BCH_ERR_decompress_zstd;
|
|
break;
|
|
}
|
|
default:
|
|
BUG();
|
|
}
|
|
- ret = 0;
|
|
+err:
|
|
fsck_err:
|
|
-out:
|
|
bio_unmap_or_unbounce(c, src_data);
|
|
return ret;
|
|
-err:
|
|
- ret = -EIO;
|
|
- goto out;
|
|
}
|
|
|
|
int bch2_bio_uncompress_inplace(struct bch_write_op *op,
|
|
@@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
|
|
BUG_ON(!bio->bi_vcnt);
|
|
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
|
|
|
|
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
|
|
- crc->compressed_size << 9 > c->opts.encoded_extent_max) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error(&buf, op);
|
|
- prt_printf(&buf, "error rewriting existing data: extent too big");
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- return -EIO;
|
|
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
|
|
+ bch2_write_op_error(op, op->pos.offset,
|
|
+ "extent too big to decompress (%u > %u)",
|
|
+ crc->uncompressed_size << 9, c->opts.encoded_extent_max);
|
|
+ return -BCH_ERR_decompress_exceeded_max_encoded_extent;
|
|
}
|
|
|
|
data = __bounce_alloc(c, dst_len, WRITE);
|
|
|
|
- if (__bio_uncompress(c, bio, data.b, *crc)) {
|
|
- if (!c->opts.no_data_io) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error(&buf, op);
|
|
- prt_printf(&buf, "error rewriting existing data: decompression error");
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
- ret = -EIO;
|
|
+ ret = __bio_uncompress(c, bio, data.b, *crc);
|
|
+
|
|
+ if (c->opts.no_data_io)
|
|
+ ret = 0;
|
|
+
|
|
+ if (ret) {
|
|
+ bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
|
|
goto err;
|
|
}
|
|
|
|
@@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
|
|
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
|
|
crc.compressed_size << 9 > c->opts.encoded_extent_max)
|
|
- return -EIO;
|
|
+ return -BCH_ERR_decompress_exceeded_max_encoded_extent;
|
|
|
|
dst_data = dst_len == dst_iter.bi_size
|
|
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
|
|
@@ -378,13 +371,14 @@ static int attempt_compress(struct bch_fs *c,
|
|
};
|
|
|
|
zlib_set_workspace(&strm, workspace);
|
|
- zlib_deflateInit2(&strm,
|
|
+ if (zlib_deflateInit2(&strm,
|
|
compression.level
|
|
? clamp_t(unsigned, compression.level,
|
|
Z_BEST_SPEED, Z_BEST_COMPRESSION)
|
|
: Z_DEFAULT_COMPRESSION,
|
|
Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
|
|
- Z_DEFAULT_STRATEGY);
|
|
+ Z_DEFAULT_STRATEGY) != Z_OK)
|
|
+ return 0;
|
|
|
|
if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
|
|
return 0;
|
|
@@ -720,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
|
|
|
|
ret = match_string(bch2_compression_opts, -1, type_str);
|
|
if (ret < 0 && err)
|
|
- prt_str(err, "invalid compression type");
|
|
+ prt_printf(err, "invalid compression type\n");
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
@@ -735,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
|
|
if (!ret && level > 15)
|
|
ret = -EINVAL;
|
|
if (ret < 0 && err)
|
|
- prt_str(err, "invalid compression level");
|
|
+ prt_printf(err, "invalid compression level\n");
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
|
|
index 642fbc60ecab..de096ca65b4b 100644
|
|
--- a/fs/bcachefs/data_update.c
|
|
+++ b/fs/bcachefs/data_update.c
|
|
@@ -20,6 +20,15 @@
|
|
#include "subvolume.h"
|
|
#include "trace.h"
|
|
|
|
+#include <linux/ioprio.h>
|
|
+
|
|
+static const char * const bch2_data_update_type_strs[] = {
|
|
+#define x(t, n, ...) [n] = #t,
|
|
+ BCH_DATA_UPDATE_TYPES()
|
|
+#undef x
|
|
+ NULL
|
|
+};
|
|
+
|
|
static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
|
|
{
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
@@ -33,7 +42,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
|
|
bkey_for_each_ptr(ptrs, ptr) {
|
|
- if (!bch2_dev_tryget(c, ptr->dev)) {
|
|
+ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
|
|
bkey_for_each_ptr(ptrs, ptr2) {
|
|
if (ptr2 == ptr)
|
|
break;
|
|
@@ -91,7 +100,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
|
|
return true;
|
|
}
|
|
|
|
-static noinline void trace_move_extent_finish2(struct data_update *u,
|
|
+static noinline void trace_io_move_finish2(struct data_update *u,
|
|
struct bkey_i *new,
|
|
struct bkey_i *insert)
|
|
{
|
|
@@ -111,11 +120,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u,
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
|
|
prt_newline(&buf);
|
|
|
|
- trace_move_extent_finish(c, buf.buf);
|
|
+ trace_io_move_finish(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
-static void trace_move_extent_fail2(struct data_update *m,
|
|
+static void trace_io_move_fail2(struct data_update *m,
|
|
struct bkey_s_c new,
|
|
struct bkey_s_c wrote,
|
|
struct bkey_i *insert,
|
|
@@ -126,7 +135,7 @@ static void trace_move_extent_fail2(struct data_update *m,
|
|
struct printbuf buf = PRINTBUF;
|
|
unsigned rewrites_found = 0;
|
|
|
|
- if (!trace_move_extent_fail_enabled())
|
|
+ if (!trace_io_move_fail_enabled())
|
|
return;
|
|
|
|
prt_str(&buf, msg);
|
|
@@ -166,7 +175,7 @@ static void trace_move_extent_fail2(struct data_update *m,
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
|
|
}
|
|
|
|
- trace_move_extent_fail(c, buf.buf);
|
|
+ trace_io_move_fail(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
@@ -179,6 +188,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
container_of(op, struct data_update, op);
|
|
struct keylist *keys = &op->insert_keys;
|
|
struct bkey_buf _new, _insert;
|
|
+ struct printbuf journal_msg = PRINTBUF;
|
|
int ret = 0;
|
|
|
|
bch2_bkey_buf_init(&_new);
|
|
@@ -206,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -214,7 +224,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
new = bkey_i_to_extent(bch2_keylist_front(keys));
|
|
|
|
if (!bch2_extents_match(k, old)) {
|
|
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
|
|
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
|
|
NULL, "no match:");
|
|
goto nowork;
|
|
}
|
|
@@ -254,7 +264,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
if (m->data_opts.rewrite_ptrs &&
|
|
!rewrites_found &&
|
|
bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
|
|
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
|
|
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
|
|
goto nowork;
|
|
}
|
|
|
|
@@ -271,7 +281,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
}
|
|
|
|
if (!bkey_val_u64s(&new->k)) {
|
|
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
|
|
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
|
|
goto nowork;
|
|
}
|
|
|
|
@@ -336,8 +346,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
.btree = m->btree_id,
|
|
.flags = BCH_VALIDATE_commit,
|
|
});
|
|
- if (invalid) {
|
|
+ if (unlikely(invalid)) {
|
|
struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
|
|
prt_str(&buf, "about to insert invalid key in data update path");
|
|
prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
|
|
@@ -348,14 +359,30 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
prt_str(&buf, "\nnew: ");
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
|
|
|
|
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
printbuf_exit(&buf);
|
|
|
|
- bch2_fatal_error(c);
|
|
- ret = -EIO;
|
|
+ ret = -BCH_ERR_invalid_bkey;
|
|
goto out;
|
|
}
|
|
|
|
+ printbuf_reset(&journal_msg);
|
|
+ prt_str(&journal_msg, bch2_data_update_type_strs[m->type]);
|
|
+
|
|
+ ret = bch2_trans_log_msg(trans, &journal_msg) ?:
|
|
+ bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
|
|
+ bch2_insert_snapshot_whiteouts(trans, m->btree_id,
|
|
+ k.k->p, bkey_start_pos(&insert->k)) ?:
|
|
+ bch2_insert_snapshot_whiteouts(trans, m->btree_id,
|
|
+ k.k->p, insert->k.p) ?:
|
|
+ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
|
|
+ bch2_trans_update(trans, &iter, insert,
|
|
+ BTREE_UPDATE_internal_snapshot_node);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
if (trace_data_update_enabled()) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
@@ -370,25 +397,38 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
- ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
|
|
- k.k->p, bkey_start_pos(&insert->k)) ?:
|
|
- bch2_insert_snapshot_whiteouts(trans, m->btree_id,
|
|
- k.k->p, insert->k.p) ?:
|
|
- bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
|
|
- bch2_trans_update(trans, &iter, insert,
|
|
- BTREE_UPDATE_internal_snapshot_node) ?:
|
|
- bch2_trans_commit(trans, &op->res,
|
|
+ if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
|
|
+ bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
|
|
+
|
|
+ prt_str(&buf, "\nold: ");
|
|
+ bch2_bkey_val_to_text(&buf, c, old);
|
|
+ prt_str(&buf, "\nk: ");
|
|
+ bch2_bkey_val_to_text(&buf, c, k);
|
|
+ prt_str(&buf, "\nnew: ");
|
|
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
|
|
+
|
|
+ trace_io_move_created_rebalance(c, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+
|
|
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_commit(trans, &op->res,
|
|
NULL,
|
|
BCH_TRANS_COMMIT_no_check_rw|
|
|
BCH_TRANS_COMMIT_no_enospc|
|
|
m->data_opts.btree_insert_flags);
|
|
- if (!ret) {
|
|
- bch2_btree_iter_set_pos(&iter, next_pos);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
|
|
- if (trace_move_extent_finish_enabled())
|
|
- trace_move_extent_finish2(m, &new->k_i, insert);
|
|
- }
|
|
+ bch2_btree_iter_set_pos(trans, &iter, next_pos);
|
|
+
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
|
|
+ if (trace_io_move_finish_enabled())
|
|
+ trace_io_move_finish2(m, &new->k_i, insert);
|
|
err:
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
ret = 0;
|
|
@@ -409,12 +449,13 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
|
|
&m->stats->sectors_raced);
|
|
}
|
|
|
|
- count_event(c, move_extent_fail);
|
|
+ count_event(c, io_move_fail);
|
|
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
goto next;
|
|
}
|
|
out:
|
|
+ printbuf_exit(&journal_msg);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
bch2_bkey_buf_exit(&_insert, c);
|
|
bch2_bkey_buf_exit(&_new, c);
|
|
@@ -427,14 +468,17 @@ int bch2_data_update_index_update(struct bch_write_op *op)
|
|
return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
|
|
}
|
|
|
|
-void bch2_data_update_read_done(struct data_update *m,
|
|
- struct bch_extent_crc_unpacked crc)
|
|
+void bch2_data_update_read_done(struct data_update *m)
|
|
{
|
|
+ m->read_done = true;
|
|
+
|
|
/* write bio must own pages: */
|
|
BUG_ON(!m->op.wbio.bio.bi_vcnt);
|
|
|
|
- m->op.crc = crc;
|
|
- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
|
|
+ m->op.crc = m->rbio.pick.crc;
|
|
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
|
|
+
|
|
+ this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
|
|
|
|
closure_call(&m->op.cl, bch2_write, NULL, NULL);
|
|
}
|
|
@@ -444,38 +488,41 @@ void bch2_data_update_exit(struct data_update *update)
|
|
struct bch_fs *c = update->op.c;
|
|
struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
|
|
|
|
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
|
|
+ kfree(update->bvecs);
|
|
+ update->bvecs = NULL;
|
|
+
|
|
if (c->opts.nocow_enabled)
|
|
bkey_nocow_unlock(c, k);
|
|
bkey_put_dev_refs(c, k);
|
|
- bch2_bkey_buf_exit(&update->k, c);
|
|
bch2_disk_reservation_put(c, &update->op.res);
|
|
- bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
|
|
+ bch2_bkey_buf_exit(&update->k, c);
|
|
}
|
|
|
|
-static void bch2_update_unwritten_extent(struct btree_trans *trans,
|
|
- struct data_update *update)
|
|
+static int bch2_update_unwritten_extent(struct btree_trans *trans,
|
|
+ struct data_update *update)
|
|
{
|
|
struct bch_fs *c = update->op.c;
|
|
- struct bio *bio = &update->op.wbio.bio;
|
|
struct bkey_i_extent *e;
|
|
struct write_point *wp;
|
|
struct closure cl;
|
|
struct btree_iter iter;
|
|
struct bkey_s_c k;
|
|
- int ret;
|
|
+ int ret = 0;
|
|
|
|
closure_init_stack(&cl);
|
|
bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
|
|
|
|
- while (bio_sectors(bio)) {
|
|
- unsigned sectors = bio_sectors(bio);
|
|
+ while (bpos_lt(update->op.pos, update->k.k->k.p)) {
|
|
+ unsigned sectors = update->k.k->k.p.offset -
|
|
+ update->op.pos.offset;
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
|
|
BTREE_ITER_slots);
|
|
ret = lockrestart_do(trans, ({
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
bkey_err(k);
|
|
}));
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
@@ -504,7 +551,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
|
|
bch_err_fn_ratelimited(c, ret);
|
|
|
|
if (ret)
|
|
- return;
|
|
+ break;
|
|
|
|
sectors = min(sectors, wp->sectors_free);
|
|
|
|
@@ -514,7 +561,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
|
|
bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
|
|
bch2_alloc_sectors_done(c, wp);
|
|
|
|
- bio_advance(bio, sectors << 9);
|
|
update->op.pos.offset += sectors;
|
|
|
|
extent_for_each_ptr(extent_i_to_s(e), ptr)
|
|
@@ -533,13 +579,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
|
|
bch2_trans_unlock(trans);
|
|
closure_sync(&cl);
|
|
}
|
|
+
|
|
+ return ret;
|
|
}
|
|
|
|
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
- printbuf_tabstop_push(out, 20);
|
|
+ if (!out->nr_tabstops)
|
|
+ printbuf_tabstop_push(out, 20);
|
|
|
|
prt_str_indented(out, "rewrite ptrs:\t");
|
|
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
|
|
@@ -563,10 +612,17 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
|
|
|
|
prt_str_indented(out, "extra replicas:\t");
|
|
prt_u64(out, data_opts->extra_replicas);
|
|
+ prt_newline(out);
|
|
+
|
|
+ prt_str_indented(out, "scrub:\t");
|
|
+ prt_u64(out, data_opts->scrub);
|
|
}
|
|
|
|
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
|
|
{
|
|
+ prt_str(out, bch2_data_update_type_strs[m->type]);
|
|
+ prt_newline(out);
|
|
+
|
|
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
|
|
prt_newline(out);
|
|
|
|
@@ -574,6 +630,25 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
|
|
bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
|
|
}
|
|
|
|
+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
|
|
+{
|
|
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
|
|
+ prt_newline(out);
|
|
+ printbuf_indent_add(out, 2);
|
|
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
|
|
+
|
|
+ if (!m->read_done) {
|
|
+ prt_printf(out, "read:\n");
|
|
+ printbuf_indent_add(out, 2);
|
|
+ bch2_read_bio_to_text(out, &m->rbio);
|
|
+ } else {
|
|
+ prt_printf(out, "write:\n");
|
|
+ printbuf_indent_add(out, 2);
|
|
+ bch2_write_op_to_text(out, &m->op);
|
|
+ }
|
|
+ printbuf_indent_sub(out, 4);
|
|
+}
|
|
+
|
|
int bch2_extent_drop_ptrs(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct bkey_s_c k,
|
|
@@ -617,12 +692,87 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
|
|
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
}
|
|
|
|
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
|
|
+ struct bch_io_opts *io_opts)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+
|
|
+ /* write path might have to decompress data: */
|
|
+ unsigned buf_bytes = 0;
|
|
+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
|
|
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
|
|
+
|
|
+ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
|
|
+
|
|
+ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
|
|
+ if (!m->bvecs)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
|
|
+ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
|
|
+
|
|
+ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
|
|
+ kfree(m->bvecs);
|
|
+ m->bvecs = NULL;
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ rbio_init(&m->rbio.bio, c, *io_opts, NULL);
|
|
+ m->rbio.data_update = true;
|
|
+ m->rbio.bio.bi_iter.bi_size = buf_bytes;
|
|
+ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
|
|
+ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
|
|
+{
|
|
+ if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
|
|
+ unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
|
|
+ return -BCH_ERR_data_update_done_would_block;
|
|
+
|
|
+ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
|
|
+ ? m->op.target
|
|
+ : 0;
|
|
+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
|
|
+
|
|
+ darray_for_each(m->op.devs_have, i)
|
|
+ __clear_bit(*i, devs.d);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ unsigned nr_replicas = 0, i;
|
|
+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
|
|
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
|
|
+ if (!ca)
|
|
+ continue;
|
|
+
|
|
+ struct bch_dev_usage usage;
|
|
+ bch2_dev_usage_read_fast(ca, &usage);
|
|
+
|
|
+ if (!dev_buckets_free(ca, usage, m->op.watermark))
|
|
+ continue;
|
|
+
|
|
+ nr_replicas += ca->mi.durability;
|
|
+ if (nr_replicas >= m->op.nr_replicas)
|
|
+ break;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (!nr_replicas)
|
|
+ return -BCH_ERR_data_update_done_no_rw_devs;
|
|
+ if (nr_replicas < m->op.nr_replicas)
|
|
+ return -BCH_ERR_insufficient_devices;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
int bch2_data_update_init(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct moving_context *ctxt,
|
|
struct data_update *m,
|
|
struct write_point_specifier wp,
|
|
- struct bch_io_opts io_opts,
|
|
+ struct bch_io_opts *io_opts,
|
|
struct data_update_opts data_opts,
|
|
enum btree_id btree_id,
|
|
struct bkey_s_c k)
|
|
@@ -640,36 +790,30 @@ int bch2_data_update_init(struct btree_trans *trans,
|
|
* snapshots table - just skip it, we can move it later.
|
|
*/
|
|
if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
|
|
- return -BCH_ERR_data_update_done;
|
|
-
|
|
- if (!bkey_get_dev_refs(c, k))
|
|
- return -BCH_ERR_data_update_done;
|
|
-
|
|
- if (c->opts.nocow_enabled &&
|
|
- !bkey_nocow_lock(c, ctxt, k)) {
|
|
- bkey_put_dev_refs(c, k);
|
|
- return -BCH_ERR_nocow_lock_blocked;
|
|
- }
|
|
+ return -BCH_ERR_data_update_done_no_snapshot;
|
|
|
|
bch2_bkey_buf_init(&m->k);
|
|
bch2_bkey_buf_reassemble(&m->k, c, k);
|
|
+ m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
|
|
+ ? BCH_DATA_UPDATE_copygc
|
|
+ : BCH_DATA_UPDATE_rebalance;
|
|
m->btree_id = btree_id;
|
|
m->data_opts = data_opts;
|
|
m->ctxt = ctxt;
|
|
m->stats = ctxt ? ctxt->stats : NULL;
|
|
|
|
- bch2_write_op_init(&m->op, c, io_opts);
|
|
+ bch2_write_op_init(&m->op, c, *io_opts);
|
|
m->op.pos = bkey_start_pos(k.k);
|
|
m->op.version = k.k->bversion;
|
|
m->op.target = data_opts.target;
|
|
m->op.write_point = wp;
|
|
m->op.nr_replicas = 0;
|
|
- m->op.flags |= BCH_WRITE_PAGES_STABLE|
|
|
- BCH_WRITE_PAGES_OWNED|
|
|
- BCH_WRITE_DATA_ENCODED|
|
|
- BCH_WRITE_MOVE|
|
|
+ m->op.flags |= BCH_WRITE_pages_stable|
|
|
+ BCH_WRITE_pages_owned|
|
|
+ BCH_WRITE_data_encoded|
|
|
+ BCH_WRITE_move|
|
|
m->data_opts.write_flags;
|
|
- m->op.compression_opt = io_opts.background_compression;
|
|
+ m->op.compression_opt = io_opts->background_compression;
|
|
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
|
|
|
|
unsigned durability_have = 0, durability_removing = 0;
|
|
@@ -707,7 +851,7 @@ int bch2_data_update_init(struct btree_trans *trans,
|
|
ptr_bit <<= 1;
|
|
}
|
|
|
|
- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
|
|
+ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
|
|
|
|
/*
|
|
* If current extent durability is less than io_opts.data_replicas,
|
|
@@ -740,28 +884,70 @@ int bch2_data_update_init(struct btree_trans *trans,
|
|
m->data_opts.rewrite_ptrs = 0;
|
|
/* if iter == NULL, it's just a promote */
|
|
if (iter)
|
|
- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
|
|
- goto out;
|
|
+ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
|
|
+ if (!ret)
|
|
+ ret = -BCH_ERR_data_update_done_no_writes_needed;
|
|
+ goto out_bkey_buf_exit;
|
|
}
|
|
|
|
+ /*
|
|
+ * Check if the allocation will succeed, to avoid getting an error later
|
|
+ * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
|
|
+ * read:
|
|
+ *
|
|
+ * This guards against
|
|
+ * - BCH_WRITE_alloc_nowait allocations failing (promotes)
|
|
+ * - Destination target full
|
|
+ * - Device(s) in destination target offline
|
|
+ * - Insufficient durability available in destination target
|
|
+ * (i.e. trying to move a durability=2 replica to a target with a
|
|
+ * single durability=2 device)
|
|
+ */
|
|
+ ret = can_write_extent(c, m);
|
|
+ if (ret)
|
|
+ goto out_bkey_buf_exit;
|
|
+
|
|
if (reserve_sectors) {
|
|
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
|
|
m->data_opts.extra_replicas
|
|
? 0
|
|
: BCH_DISK_RESERVATION_NOFAIL);
|
|
if (ret)
|
|
- goto out;
|
|
+ goto out_bkey_buf_exit;
|
|
+ }
|
|
+
|
|
+ if (!bkey_get_dev_refs(c, k)) {
|
|
+ ret = -BCH_ERR_data_update_done_no_dev_refs;
|
|
+ goto out_put_disk_res;
|
|
+ }
|
|
+
|
|
+ if (c->opts.nocow_enabled &&
|
|
+ !bkey_nocow_lock(c, ctxt, k)) {
|
|
+ ret = -BCH_ERR_nocow_lock_blocked;
|
|
+ goto out_put_dev_refs;
|
|
}
|
|
|
|
if (bkey_extent_is_unwritten(k)) {
|
|
- bch2_update_unwritten_extent(trans, m);
|
|
- goto out;
|
|
+ ret = bch2_update_unwritten_extent(trans, m) ?:
|
|
+ -BCH_ERR_data_update_done_unwritten;
|
|
+ goto out_nocow_unlock;
|
|
}
|
|
|
|
+ ret = bch2_data_update_bios_init(m, c, io_opts);
|
|
+ if (ret)
|
|
+ goto out_nocow_unlock;
|
|
+
|
|
return 0;
|
|
-out:
|
|
- bch2_data_update_exit(m);
|
|
- return ret ?: -BCH_ERR_data_update_done;
|
|
+out_nocow_unlock:
|
|
+ if (c->opts.nocow_enabled)
|
|
+ bkey_nocow_unlock(c, k);
|
|
+out_put_dev_refs:
|
|
+ bkey_put_dev_refs(c, k);
|
|
+out_put_disk_res:
|
|
+ bch2_disk_reservation_put(c, &m->op.res);
|
|
+out_bkey_buf_exit:
|
|
+ bch2_bkey_buf_exit(&m->k, c);
|
|
+ return ret;
|
|
}
|
|
|
|
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
|
|
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
|
|
index e4b50723428e..5e14d13568de 100644
|
|
--- a/fs/bcachefs/data_update.h
|
|
+++ b/fs/bcachefs/data_update.h
|
|
@@ -4,6 +4,7 @@
|
|
#define _BCACHEFS_DATA_UPDATE_H
|
|
|
|
#include "bkey_buf.h"
|
|
+#include "io_read.h"
|
|
#include "io_write_types.h"
|
|
|
|
struct moving_context;
|
|
@@ -15,27 +16,61 @@ struct data_update_opts {
|
|
u8 extra_replicas;
|
|
unsigned btree_insert_flags;
|
|
unsigned write_flags;
|
|
+
|
|
+ int read_dev;
|
|
+ bool scrub;
|
|
};
|
|
|
|
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
|
|
struct bch_io_opts *, struct data_update_opts *);
|
|
|
|
+#define BCH_DATA_UPDATE_TYPES() \
|
|
+ x(copygc, 0) \
|
|
+ x(rebalance, 1) \
|
|
+ x(promote, 2)
|
|
+
|
|
+enum bch_data_update_types {
|
|
+#define x(n, id) BCH_DATA_UPDATE_##n = id,
|
|
+ BCH_DATA_UPDATE_TYPES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
struct data_update {
|
|
+ enum bch_data_update_types type;
|
|
/* extent being updated: */
|
|
+ bool read_done;
|
|
enum btree_id btree_id;
|
|
struct bkey_buf k;
|
|
struct data_update_opts data_opts;
|
|
struct moving_context *ctxt;
|
|
struct bch_move_stats *stats;
|
|
+
|
|
+ struct bch_read_bio rbio;
|
|
struct bch_write_op op;
|
|
+ struct bio_vec *bvecs;
|
|
+};
|
|
+
|
|
+struct promote_op {
|
|
+ struct rcu_head rcu;
|
|
+ u64 start_time;
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ unsigned list_idx;
|
|
+#endif
|
|
+
|
|
+ struct rhash_head hash;
|
|
+ struct bpos pos;
|
|
+
|
|
+ struct work_struct work;
|
|
+ struct data_update write;
|
|
+ struct bio_vec bi_inline_vecs[]; /* must be last */
|
|
};
|
|
|
|
void bch2_data_update_to_text(struct printbuf *, struct data_update *);
|
|
+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
|
|
|
|
int bch2_data_update_index_update(struct bch_write_op *);
|
|
|
|
-void bch2_data_update_read_done(struct data_update *,
|
|
- struct bch_extent_crc_unpacked);
|
|
+void bch2_data_update_read_done(struct data_update *);
|
|
|
|
int bch2_extent_drop_ptrs(struct btree_trans *,
|
|
struct btree_iter *,
|
|
@@ -43,12 +78,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
|
|
struct bch_io_opts *,
|
|
struct data_update_opts *);
|
|
|
|
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
|
|
+ struct bch_io_opts *);
|
|
+
|
|
void bch2_data_update_exit(struct data_update *);
|
|
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
|
|
struct moving_context *,
|
|
struct data_update *,
|
|
struct write_point_specifier,
|
|
- struct bch_io_opts, struct data_update_opts,
|
|
+ struct bch_io_opts *, struct data_update_opts,
|
|
enum btree_id, struct bkey_s_c);
|
|
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
|
|
|
|
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
|
|
index 55333e82d1fe..4fa70634c90e 100644
|
|
--- a/fs/bcachefs/debug.c
|
|
+++ b/fs/bcachefs/debug.c
|
|
@@ -7,6 +7,8 @@
|
|
*/
|
|
|
|
#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "async_objs.h"
|
|
#include "bkey_methods.h"
|
|
#include "btree_cache.h"
|
|
#include "btree_io.h"
|
|
@@ -15,6 +17,7 @@
|
|
#include "btree_update.h"
|
|
#include "btree_update_interior.h"
|
|
#include "buckets.h"
|
|
+#include "data_update.h"
|
|
#include "debug.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
@@ -39,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
|
|
struct btree_node *n_sorted = c->verify_data->data;
|
|
struct bset *sorted, *inmemory = &b->data->keys;
|
|
struct bio *bio;
|
|
- bool failed = false, saw_error = false;
|
|
+ bool failed = false;
|
|
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
|
|
+ BCH_DEV_READ_REF_btree_verify_replicas);
|
|
if (!ca)
|
|
return false;
|
|
|
|
@@ -56,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
|
|
submit_bio_wait(bio);
|
|
|
|
bio_put(bio);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_btree_verify_replicas);
|
|
|
|
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
|
|
|
|
v->written = 0;
|
|
- if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
|
|
+ if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
|
|
return false;
|
|
|
|
n_sorted = c->verify_data->data;
|
|
@@ -190,12 +195,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
|
|
unsigned offset = 0;
|
|
int ret;
|
|
|
|
- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
|
|
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
|
|
prt_printf(out, "error getting device to read from: invalid device\n");
|
|
return;
|
|
}
|
|
|
|
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
|
|
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
|
|
+ BCH_DEV_READ_REF_btree_node_ondisk_to_text);
|
|
if (!ca) {
|
|
prt_printf(out, "error getting device to read from: not online\n");
|
|
return;
|
|
@@ -296,28 +302,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
|
|
if (bio)
|
|
bio_put(bio);
|
|
kvfree(n_ondisk);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_btree_node_ondisk_to_text);
|
|
}
|
|
|
|
#ifdef CONFIG_DEBUG_FS
|
|
|
|
-/* XXX: bch_fs refcounting */
|
|
-
|
|
-struct dump_iter {
|
|
- struct bch_fs *c;
|
|
- enum btree_id id;
|
|
- struct bpos from;
|
|
- struct bpos prev_node;
|
|
- u64 iter;
|
|
-
|
|
- struct printbuf buf;
|
|
-
|
|
- char __user *ubuf; /* destination user buffer */
|
|
- size_t size; /* size of requested read */
|
|
- ssize_t ret; /* bytes read so far */
|
|
-};
|
|
-
|
|
-static ssize_t flush_buf(struct dump_iter *i)
|
|
+ssize_t bch2_debugfs_flush_buf(struct dump_iter *i)
|
|
{
|
|
if (i->buf.pos) {
|
|
size_t bytes = min_t(size_t, i->buf.pos, i->size);
|
|
@@ -329,6 +320,11 @@ static ssize_t flush_buf(struct dump_iter *i)
|
|
i->buf.pos -= copied;
|
|
memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
|
|
|
|
+ if (i->buf.last_newline >= copied)
|
|
+ i->buf.last_newline -= copied;
|
|
+ if (i->buf.last_field >= copied)
|
|
+ i->buf.last_field -= copied;
|
|
+
|
|
if (copied != bytes)
|
|
return -EFAULT;
|
|
}
|
|
@@ -355,7 +351,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
|
|
return 0;
|
|
}
|
|
|
|
-static int bch2_dump_release(struct inode *inode, struct file *file)
|
|
+int bch2_dump_release(struct inode *inode, struct file *file)
|
|
{
|
|
struct dump_iter *i = file->private_data;
|
|
|
|
@@ -373,7 +369,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
|
|
i->size = size;
|
|
i->ret = 0;
|
|
|
|
- return flush_buf(i) ?:
|
|
+ return bch2_debugfs_flush_buf(i) ?:
|
|
bch2_trans_run(i->c,
|
|
for_each_btree_key(trans, iter, i->id, i->from,
|
|
BTREE_ITER_prefetch|
|
|
@@ -382,7 +378,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
|
|
prt_newline(&i->buf);
|
|
bch2_trans_unlock(trans);
|
|
i->from = bpos_successor(iter.pos);
|
|
- flush_buf(i);
|
|
+ bch2_debugfs_flush_buf(i);
|
|
}))) ?:
|
|
i->ret;
|
|
}
|
|
@@ -403,7 +399,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
|
|
i->size = size;
|
|
i->ret = 0;
|
|
|
|
- ssize_t ret = flush_buf(i);
|
|
+ ssize_t ret = bch2_debugfs_flush_buf(i);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -417,7 +413,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
|
|
? bpos_successor(b->key.k.p)
|
|
: b->key.k.p;
|
|
|
|
- drop_locks_do(trans, flush_buf(i));
|
|
+ drop_locks_do(trans, bch2_debugfs_flush_buf(i));
|
|
}))) ?: i->ret;
|
|
}
|
|
|
|
@@ -437,7 +433,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
|
|
i->size = size;
|
|
i->ret = 0;
|
|
|
|
- return flush_buf(i) ?:
|
|
+ return bch2_debugfs_flush_buf(i) ?:
|
|
bch2_trans_run(i->c,
|
|
for_each_btree_key(trans, iter, i->id, i->from,
|
|
BTREE_ITER_prefetch|
|
|
@@ -455,7 +451,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
|
|
bch2_bfloat_to_text(&i->buf, l->b, _k);
|
|
bch2_trans_unlock(trans);
|
|
i->from = bpos_successor(iter.pos);
|
|
- flush_buf(i);
|
|
+ bch2_debugfs_flush_buf(i);
|
|
}))) ?:
|
|
i->ret;
|
|
}
|
|
@@ -516,7 +512,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
|
|
struct rhash_head *pos;
|
|
struct btree *b;
|
|
|
|
- ret = flush_buf(i);
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -539,7 +535,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
|
|
ret = -ENOMEM;
|
|
|
|
if (!ret)
|
|
- ret = flush_buf(i);
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
|
|
return ret ?: i->ret;
|
|
}
|
|
@@ -613,7 +609,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
|
|
|
|
closure_put(&trans->ref);
|
|
|
|
- ret = flush_buf(i);
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
if (ret)
|
|
goto unlocked;
|
|
|
|
@@ -626,7 +622,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
|
|
ret = -ENOMEM;
|
|
|
|
if (!ret)
|
|
- ret = flush_buf(i);
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
|
|
return ret ?: i->ret;
|
|
}
|
|
@@ -651,7 +647,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
|
|
i->ret = 0;
|
|
|
|
while (1) {
|
|
- err = flush_buf(i);
|
|
+ err = bch2_debugfs_flush_buf(i);
|
|
if (err)
|
|
return err;
|
|
|
|
@@ -694,7 +690,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
|
|
i->iter++;
|
|
}
|
|
|
|
- err = flush_buf(i);
|
|
+ err = bch2_debugfs_flush_buf(i);
|
|
if (err)
|
|
return err;
|
|
|
|
@@ -752,7 +748,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
|
|
while (1) {
|
|
struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
|
|
|
|
- err = flush_buf(i);
|
|
+ err = bch2_debugfs_flush_buf(i);
|
|
if (err)
|
|
return err;
|
|
|
|
@@ -769,6 +765,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
|
|
mutex_lock(&s->lock);
|
|
|
|
prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
|
|
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
|
|
+ printbuf_indent_add(&i->buf, 2);
|
|
+ bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
|
|
+ printbuf_indent_sub(&i->buf, 2);
|
|
+#endif
|
|
+
|
|
prt_printf(&i->buf, "Transaction duration:\n");
|
|
|
|
printbuf_indent_add(&i->buf, 2);
|
|
@@ -844,8 +846,11 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
|
|
seqmutex_unlock(&c->btree_trans_lock);
|
|
}
|
|
|
|
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
|
|
- size_t size, loff_t *ppos)
|
|
+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+static ssize_t bch2_simple_print(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos,
|
|
+ fs_to_text_fn fn)
|
|
{
|
|
struct dump_iter *i = file->private_data;
|
|
struct bch_fs *c = i->c;
|
|
@@ -856,7 +861,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
|
|
i->ret = 0;
|
|
|
|
if (!i->iter) {
|
|
- btree_deadlock_to_text(&i->buf, c);
|
|
+ fn(&i->buf, c);
|
|
i->iter++;
|
|
}
|
|
|
|
@@ -864,11 +869,17 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
|
|
ret = -ENOMEM;
|
|
|
|
if (!ret)
|
|
- ret = flush_buf(i);
|
|
+ ret = bch2_debugfs_flush_buf(i);
|
|
|
|
return ret ?: i->ret;
|
|
}
|
|
|
|
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos)
|
|
+{
|
|
+ return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
|
|
+}
|
|
+
|
|
static const struct file_operations btree_deadlock_ops = {
|
|
.owner = THIS_MODULE,
|
|
.open = bch2_dump_open,
|
|
@@ -876,6 +887,19 @@ static const struct file_operations btree_deadlock_ops = {
|
|
.read = bch2_btree_deadlock_read,
|
|
};
|
|
|
|
+static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos)
|
|
+{
|
|
+ return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
|
|
+}
|
|
+
|
|
+static const struct file_operations write_points_ops = {
|
|
+ .owner = THIS_MODULE,
|
|
+ .open = bch2_dump_open,
|
|
+ .release = bch2_dump_release,
|
|
+ .read = bch2_write_points_read,
|
|
+};
|
|
+
|
|
void bch2_fs_debug_exit(struct bch_fs *c)
|
|
{
|
|
if (!IS_ERR_OR_NULL(c->fs_debug_dir))
|
|
@@ -904,7 +928,11 @@ void bch2_fs_debug_init(struct bch_fs *c)
|
|
if (IS_ERR_OR_NULL(bch_debug))
|
|
return;
|
|
|
|
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
|
|
+ if (c->sb.multi_device)
|
|
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
|
|
+ else
|
|
+ strscpy(name, c->name, sizeof(name));
|
|
+
|
|
c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
|
|
if (IS_ERR_OR_NULL(c->fs_debug_dir))
|
|
return;
|
|
@@ -927,6 +955,11 @@ void bch2_fs_debug_init(struct bch_fs *c)
|
|
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
|
|
c->btree_debug, &btree_deadlock_ops);
|
|
|
|
+ debugfs_create_file("write_points", 0400, c->fs_debug_dir,
|
|
+ c->btree_debug, &write_points_ops);
|
|
+
|
|
+ bch2_fs_async_obj_debugfs_init(c);
|
|
+
|
|
c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
|
|
if (IS_ERR_OR_NULL(c->btree_debug_dir))
|
|
return;
|
|
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
|
|
index 2c37143b5fd1..d88b1194b8ac 100644
|
|
--- a/fs/bcachefs/debug.h
|
|
+++ b/fs/bcachefs/debug.h
|
|
@@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
|
|
|
|
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
|
|
{
|
|
- if (bch2_verify_btree_ondisk)
|
|
+ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
|
|
__bch2_btree_verify(c, b);
|
|
}
|
|
|
|
#ifdef CONFIG_DEBUG_FS
|
|
+struct dump_iter {
|
|
+ struct bch_fs *c;
|
|
+ struct async_obj_list *list;
|
|
+ enum btree_id id;
|
|
+ struct bpos from;
|
|
+ struct bpos prev_node;
|
|
+ u64 iter;
|
|
+
|
|
+ struct printbuf buf;
|
|
+
|
|
+ char __user *ubuf; /* destination user buffer */
|
|
+ size_t size; /* size of requested read */
|
|
+ ssize_t ret; /* bytes read so far */
|
|
+};
|
|
+
|
|
+ssize_t bch2_debugfs_flush_buf(struct dump_iter *);
|
|
+int bch2_dump_release(struct inode *, struct file *);
|
|
+
|
|
void bch2_fs_debug_exit(struct bch_fs *);
|
|
void bch2_fs_debug_init(struct bch_fs *);
|
|
#else
|
|
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
|
|
index 600eee936f13..d198001838f3 100644
|
|
--- a/fs/bcachefs/dirent.c
|
|
+++ b/fs/bcachefs/dirent.c
|
|
@@ -13,6 +13,28 @@
|
|
|
|
#include <linux/dcache.h>
|
|
|
|
+int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
|
|
+ const struct qstr *str, struct qstr *out_cf)
|
|
+{
|
|
+ *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
|
|
+
|
|
+#ifdef CONFIG_UNICODE
|
|
+ unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
|
|
+ int ret = PTR_ERR_OR_ZERO(buf);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
|
|
+ if (ret <= 0)
|
|
+ return ret;
|
|
+
|
|
+ *out_cf = (struct qstr) QSTR_INIT(buf, ret);
|
|
+ return 0;
|
|
+#else
|
|
+ return -EOPNOTSUPP;
|
|
+#endif
|
|
+}
|
|
+
|
|
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
|
|
{
|
|
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
|
|
@@ -28,13 +50,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
|
|
#endif
|
|
|
|
return bkey_bytes -
|
|
- offsetof(struct bch_dirent, d_name) -
|
|
+ (d.v->d_casefold
|
|
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
|
|
+ : offsetof(struct bch_dirent, d_name)) -
|
|
trailing_nuls;
|
|
}
|
|
|
|
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
|
|
{
|
|
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
|
|
+ if (d.v->d_casefold) {
|
|
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
|
|
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
|
|
+ } else {
|
|
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
|
|
+{
|
|
+ if (d.v->d_casefold) {
|
|
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
|
|
+ unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
|
|
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
|
|
+ } else {
|
|
+ return (struct qstr) QSTR_INIT(NULL, 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
|
|
+{
|
|
+ return d.v->d_casefold
|
|
+ ? bch2_dirent_get_casefold_name(d)
|
|
+ : bch2_dirent_get_name(d);
|
|
}
|
|
|
|
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
|
|
@@ -57,7 +104,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
|
|
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
|
|
{
|
|
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
- struct qstr name = bch2_dirent_get_name(d);
|
|
+ struct qstr name = bch2_dirent_get_lookup_name(d);
|
|
|
|
return bch2_dirent_hash(info, &name);
|
|
}
|
|
@@ -65,7 +112,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
|
|
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
|
|
{
|
|
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
|
|
- const struct qstr l_name = bch2_dirent_get_name(l);
|
|
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
|
|
const struct qstr *r_name = _r;
|
|
|
|
return !qstr_eq(l_name, *r_name);
|
|
@@ -75,8 +122,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
|
|
{
|
|
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
|
|
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
|
|
- const struct qstr l_name = bch2_dirent_get_name(l);
|
|
- const struct qstr r_name = bch2_dirent_get_name(r);
|
|
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
|
|
+ const struct qstr r_name = bch2_dirent_get_lookup_name(r);
|
|
|
|
return !qstr_eq(l_name, r_name);
|
|
}
|
|
@@ -104,17 +151,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
struct bkey_validate_context from)
|
|
{
|
|
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
+ unsigned name_block_len = bch2_dirent_name_bytes(d);
|
|
struct qstr d_name = bch2_dirent_get_name(d);
|
|
+ struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
|
|
int ret = 0;
|
|
|
|
bkey_fsck_err_on(!d_name.len,
|
|
c, dirent_empty_name,
|
|
"empty name");
|
|
|
|
- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
|
|
+ bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
|
|
c, dirent_val_too_big,
|
|
- "value too big (%zu > %u)",
|
|
- bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
|
|
+ "dirent names exceed bkey size (%d + %d > %d)",
|
|
+ d_name.len, d_cf_name.len, name_block_len);
|
|
|
|
/*
|
|
* Check new keys don't exceed the max length
|
|
@@ -142,6 +191,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
le64_to_cpu(d.v->d_inum) == d.k->p.inode,
|
|
c, dirent_to_itself,
|
|
"dirent points to own directory");
|
|
+
|
|
+ if (d.v->d_casefold) {
|
|
+ bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
|
|
+ d_cf_name.len > BCH_NAME_MAX,
|
|
+ c, dirent_cf_name_too_big,
|
|
+ "dirent w/ cf name too big (%u > %u)",
|
|
+ d_cf_name.len, BCH_NAME_MAX);
|
|
+
|
|
+ bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
|
|
+ c, dirent_stray_data_after_cf_name,
|
|
+ "dirent has stray data after cf name's NUL");
|
|
+ }
|
|
fsck_err:
|
|
return ret;
|
|
}
|
|
@@ -151,27 +212,33 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
|
|
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
struct qstr d_name = bch2_dirent_get_name(d);
|
|
|
|
- prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
|
|
+ prt_printf(out, "%.*s", d_name.len, d_name.name);
|
|
+
|
|
+ if (d.v->d_casefold) {
|
|
+ struct qstr d_name = bch2_dirent_get_lookup_name(d);
|
|
+ prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
|
|
+ }
|
|
+
|
|
+ prt_str(out, " ->");
|
|
|
|
if (d.v->d_type != DT_SUBVOL)
|
|
- prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
|
|
+ prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
|
|
else
|
|
- prt_printf(out, "%u -> %u",
|
|
+ prt_printf(out, " %u -> %u",
|
|
le32_to_cpu(d.v->d_parent_subvol),
|
|
le32_to_cpu(d.v->d_child_subvol));
|
|
|
|
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
|
|
}
|
|
|
|
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
|
|
- subvol_inum dir, u8 type,
|
|
- const struct qstr *name, u64 dst)
|
|
+static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
|
|
+ subvol_inum dir,
|
|
+ u8 type,
|
|
+ int name_len, int cf_name_len,
|
|
+ u64 dst)
|
|
{
|
|
struct bkey_i_dirent *dirent;
|
|
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
|
|
-
|
|
- if (name->len > BCH_NAME_MAX)
|
|
- return ERR_PTR(-ENAMETOOLONG);
|
|
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
|
|
|
|
BUG_ON(u64s > U8_MAX);
|
|
|
|
@@ -190,14 +257,75 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
|
|
}
|
|
|
|
dirent->v.d_type = type;
|
|
+ dirent->v.d_unused = 0;
|
|
+ dirent->v.d_casefold = cf_name_len ? 1 : 0;
|
|
+
|
|
+ return dirent;
|
|
+}
|
|
+
|
|
+static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
|
|
+ const struct qstr *name)
|
|
+{
|
|
+ EBUG_ON(dirent->v.d_casefold);
|
|
+
|
|
+ memcpy(&dirent->v.d_name[0], name->name, name->len);
|
|
+ memset(&dirent->v.d_name[name->len], 0,
|
|
+ bkey_val_bytes(&dirent->k) -
|
|
+ offsetof(struct bch_dirent, d_name) -
|
|
+ name->len);
|
|
+}
|
|
+
|
|
+static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
|
|
+ const struct qstr *name,
|
|
+ const struct qstr *cf_name)
|
|
+{
|
|
+ EBUG_ON(!dirent->v.d_casefold);
|
|
+ EBUG_ON(!cf_name->len);
|
|
+
|
|
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
|
|
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
|
|
+ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
|
|
+ memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
|
|
+ memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
|
|
+ bkey_val_bytes(&dirent->k) -
|
|
+ offsetof(struct bch_dirent, d_cf_name_block.d_names) -
|
|
+ name->len + cf_name->len);
|
|
+
|
|
+ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
|
|
+}
|
|
+
|
|
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
|
|
+ const struct bch_hash_info *hash_info,
|
|
+ subvol_inum dir,
|
|
+ u8 type,
|
|
+ const struct qstr *name,
|
|
+ const struct qstr *cf_name,
|
|
+ u64 dst)
|
|
+{
|
|
+ struct bkey_i_dirent *dirent;
|
|
+ struct qstr _cf_name;
|
|
|
|
- memcpy(dirent->v.d_name, name->name, name->len);
|
|
- memset(dirent->v.d_name + name->len, 0,
|
|
- bkey_val_bytes(&dirent->k) -
|
|
- offsetof(struct bch_dirent, d_name) -
|
|
- name->len);
|
|
+ if (name->len > BCH_NAME_MAX)
|
|
+ return ERR_PTR(-ENAMETOOLONG);
|
|
+
|
|
+ if (hash_info->cf_encoding && !cf_name) {
|
|
+ int ret = bch2_casefold(trans, hash_info, name, &_cf_name);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+
|
|
+ cf_name = &_cf_name;
|
|
+ }
|
|
|
|
- EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
|
|
+ dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
|
|
+ if (IS_ERR(dirent))
|
|
+ return dirent;
|
|
+
|
|
+ if (cf_name)
|
|
+ dirent_init_casefolded_name(dirent, name, cf_name);
|
|
+ else
|
|
+ dirent_init_regular_name(dirent, name);
|
|
+
|
|
+ EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
|
|
|
|
return dirent;
|
|
}
|
|
@@ -213,7 +341,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
|
|
struct bkey_i_dirent *dirent;
|
|
int ret;
|
|
|
|
- dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
|
|
+ dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
|
|
ret = PTR_ERR_OR_ZERO(dirent);
|
|
if (ret)
|
|
return ret;
|
|
@@ -222,8 +350,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
|
|
dirent->k.p.snapshot = snapshot;
|
|
|
|
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
|
|
- dir_inum, snapshot, &dirent->k_i,
|
|
- flags|BTREE_UPDATE_internal_snapshot_node);
|
|
+ dir_inum, snapshot, &dirent->k_i, flags);
|
|
*dir_offset = dirent->k.p.offset;
|
|
|
|
return ret;
|
|
@@ -238,7 +365,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
|
|
struct bkey_i_dirent *dirent;
|
|
int ret;
|
|
|
|
- dirent = dirent_create_key(trans, dir, type, name, dst_inum);
|
|
+ dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
|
|
ret = PTR_ERR_OR_ZERO(dirent);
|
|
if (ret)
|
|
return ret;
|
|
@@ -275,14 +402,15 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
|
|
}
|
|
|
|
int bch2_dirent_rename(struct btree_trans *trans,
|
|
- subvol_inum src_dir, struct bch_hash_info *src_hash,
|
|
- subvol_inum dst_dir, struct bch_hash_info *dst_hash,
|
|
+ subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
|
|
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
|
|
const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
|
|
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
|
|
enum bch_rename_mode mode)
|
|
{
|
|
- struct btree_iter src_iter = { NULL };
|
|
- struct btree_iter dst_iter = { NULL };
|
|
+ struct qstr src_name_lookup, dst_name_lookup;
|
|
+ struct btree_iter src_iter = {};
|
|
+ struct btree_iter dst_iter = {};
|
|
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
|
|
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
|
|
struct bpos dst_pos =
|
|
@@ -295,8 +423,11 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
memset(dst_inum, 0, sizeof(*dst_inum));
|
|
|
|
/* Lookup src: */
|
|
+ ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
|
|
+ if (ret)
|
|
+ goto out;
|
|
old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
|
|
- src_hash, src_dir, src_name,
|
|
+ src_hash, src_dir, &src_name_lookup,
|
|
BTREE_ITER_intent);
|
|
ret = bkey_err(old_src);
|
|
if (ret)
|
|
@@ -308,6 +439,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
goto out;
|
|
|
|
/* Lookup dst: */
|
|
+ ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
|
|
+ if (ret)
|
|
+ goto out;
|
|
if (mode == BCH_RENAME) {
|
|
/*
|
|
* Note that we're _not_ checking if the target already exists -
|
|
@@ -315,12 +449,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
* correctness:
|
|
*/
|
|
ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
|
|
- dst_hash, dst_dir, dst_name);
|
|
+ dst_hash, dst_dir, &dst_name_lookup);
|
|
if (ret)
|
|
goto out;
|
|
} else {
|
|
old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
|
|
- dst_hash, dst_dir, dst_name,
|
|
+ dst_hash, dst_dir, &dst_name_lookup,
|
|
BTREE_ITER_intent);
|
|
ret = bkey_err(old_dst);
|
|
if (ret)
|
|
@@ -336,7 +470,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
*src_offset = dst_iter.pos.offset;
|
|
|
|
/* Create new dst key: */
|
|
- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
|
|
+ new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
|
|
+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
|
|
ret = PTR_ERR_OR_ZERO(new_dst);
|
|
if (ret)
|
|
goto out;
|
|
@@ -346,7 +481,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
|
|
/* Create new src key: */
|
|
if (mode == BCH_RENAME_EXCHANGE) {
|
|
- new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
|
|
+ new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name,
|
|
+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
|
|
ret = PTR_ERR_OR_ZERO(new_src);
|
|
if (ret)
|
|
goto out;
|
|
@@ -406,6 +542,14 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
new_src->v.d_type == DT_SUBVOL)
|
|
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
|
|
|
|
+ if (old_dst.k)
|
|
+ *dst_dir_i_size -= bkey_bytes(old_dst.k);
|
|
+ *src_dir_i_size -= bkey_bytes(old_src.k);
|
|
+
|
|
+ if (mode == BCH_RENAME_EXCHANGE)
|
|
+ *src_dir_i_size += bkey_bytes(&new_src->k);
|
|
+ *dst_dir_i_size += bkey_bytes(&new_dst->k);
|
|
+
|
|
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
|
|
if (ret)
|
|
goto out;
|
|
@@ -434,16 +578,16 @@ int bch2_dirent_rename(struct btree_trans *trans,
|
|
}
|
|
|
|
if (delete_src) {
|
|
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
|
|
- ret = bch2_btree_iter_traverse(&src_iter) ?:
|
|
+ bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
|
|
+ ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
|
|
bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
|
|
if (delete_dst) {
|
|
- bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
|
|
- ret = bch2_btree_iter_traverse(&dst_iter) ?:
|
|
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
|
|
+ ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
|
|
bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
|
|
if (ret)
|
|
goto out;
|
|
@@ -465,9 +609,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
|
|
const struct qstr *name, subvol_inum *inum,
|
|
unsigned flags)
|
|
{
|
|
+ struct qstr lookup_name;
|
|
+ int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
|
|
- hash_info, dir, name, flags);
|
|
- int ret = bkey_err(k);
|
|
+ hash_info, dir, &lookup_name, flags);
|
|
+ ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -485,7 +634,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
|
|
const struct qstr *name, subvol_inum *inum)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
|
|
int ret = lockrestart_do(trans,
|
|
bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
|
|
@@ -540,7 +689,7 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
|
|
vfs_d_type(d.v->d_type));
|
|
if (ret)
|
|
ctx->pos = d.k->p.offset + 1;
|
|
- return ret;
|
|
+ return !ret;
|
|
}
|
|
|
|
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
|
|
@@ -565,10 +714,61 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
|
|
if (ret2 > 0)
|
|
continue;
|
|
|
|
- ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
|
|
+ ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));
|
|
})));
|
|
|
|
bch2_bkey_buf_exit(&sk, c);
|
|
|
|
return ret < 0 ? ret : 0;
|
|
}
|
|
+
|
|
+/* fsck */
|
|
+
|
|
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
|
|
+ BTREE_ITER_all_snapshots, k, ret) {
|
|
+ if (k.k->p.offset != inode_nr)
|
|
+ break;
|
|
+ if (!bkey_is_inode(k.k))
|
|
+ continue;
|
|
+ ret = bch2_inode_unpack(k, inode);
|
|
+ goto found;
|
|
+ }
|
|
+ ret = -BCH_ERR_ENOENT_inode;
|
|
+found:
|
|
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_iter iter;
|
|
+ struct bch_inode_unpacked dir_inode;
|
|
+ struct bch_hash_info dir_hash_info;
|
|
+ int ret;
|
|
+
|
|
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ dir_hash_info = bch2_hash_info_init(c, &dir_inode);
|
|
+
|
|
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
|
|
+
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
|
|
+ &dir_hash_info, &iter,
|
|
+ BTREE_UPDATE_internal_snapshot_node);
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+err:
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
|
|
index 362b3b2f2f2e..d3e7ae669575 100644
|
|
--- a/fs/bcachefs/dirent.h
|
|
+++ b/fs/bcachefs/dirent.h
|
|
@@ -23,12 +23,30 @@ struct bch_fs;
|
|
struct bch_hash_info;
|
|
struct bch_inode_info;
|
|
|
|
+int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
|
|
+ const struct qstr *, struct qstr *);
|
|
+
|
|
+static inline int bch2_maybe_casefold(struct btree_trans *trans,
|
|
+ const struct bch_hash_info *info,
|
|
+ const struct qstr *str, struct qstr *out_cf)
|
|
+{
|
|
+ if (likely(!info->cf_encoding)) {
|
|
+ *out_cf = *str;
|
|
+ return 0;
|
|
+ } else {
|
|
+ return bch2_casefold(trans, info, str, out_cf);
|
|
+ }
|
|
+}
|
|
+
|
|
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
|
|
|
|
-static inline unsigned dirent_val_u64s(unsigned len)
|
|
+static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
|
|
{
|
|
- return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
|
|
- sizeof(u64));
|
|
+ unsigned bytes = cf_len
|
|
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
|
|
+ : offsetof(struct bch_dirent, d_name) + len;
|
|
+
|
|
+ return DIV_ROUND_UP(bytes, sizeof(u64));
|
|
}
|
|
|
|
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
|
|
@@ -62,8 +80,8 @@ enum bch_rename_mode {
|
|
};
|
|
|
|
int bch2_dirent_rename(struct btree_trans *,
|
|
- subvol_inum, struct bch_hash_info *,
|
|
- subvol_inum, struct bch_hash_info *,
|
|
+ subvol_inum, struct bch_hash_info *, u64 *,
|
|
+ subvol_inum, struct bch_hash_info *, u64 *,
|
|
const struct qstr *, subvol_inum *, u64 *,
|
|
const struct qstr *, subvol_inum *, u64 *,
|
|
enum bch_rename_mode);
|
|
@@ -79,4 +97,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
|
|
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
|
|
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
|
|
|
|
+int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
|
|
+
|
|
#endif /* _BCACHEFS_DIRENT_H */
|
|
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
|
|
index 5e116b88e814..a46dbddd21aa 100644
|
|
--- a/fs/bcachefs/dirent_format.h
|
|
+++ b/fs/bcachefs/dirent_format.h
|
|
@@ -29,9 +29,25 @@ struct bch_dirent {
|
|
* Copy of mode bits 12-15 from the target inode - so userspace can get
|
|
* the filetype without having to do a stat()
|
|
*/
|
|
- __u8 d_type;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 d_type:5,
|
|
+ d_unused:2,
|
|
+ d_casefold:1;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 d_casefold:1,
|
|
+ d_unused:2,
|
|
+ d_type:5;
|
|
+#endif
|
|
|
|
- __u8 d_name[];
|
|
+ union {
|
|
+ struct {
|
|
+ __u8 d_pad;
|
|
+ __le16 d_name_len;
|
|
+ __le16 d_cf_name_len;
|
|
+ __u8 d_names[];
|
|
+ } d_cf_name_block __packed;
|
|
+ __DECLARE_FLEX_ARRAY(__u8, d_name);
|
|
+ } __packed;
|
|
} __packed __aligned(8);
|
|
|
|
#define DT_SUBVOL 16
|
|
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
|
|
index b32e91ba8be8..b3840ff7c407 100644
|
|
--- a/fs/bcachefs/disk_accounting.c
|
|
+++ b/fs/bcachefs/disk_accounting.c
|
|
@@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = {
|
|
NULL
|
|
};
|
|
|
|
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
|
|
- s64 *d, unsigned nr)
|
|
+static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
|
|
+ s64 *d, unsigned nr)
|
|
{
|
|
struct bkey_i_accounting *acc = bkey_accounting_init(k);
|
|
|
|
- acc->k.p = disk_accounting_pos_to_bpos(pos);
|
|
+ acc->k.p = pos;
|
|
set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
|
|
|
|
memcpy_u64s_small(acc->v.d, d, nr);
|
|
}
|
|
|
|
+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
|
|
+ s64 *d, unsigned nr)
|
|
+{
|
|
+ return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
|
|
+}
|
|
+
|
|
static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
|
|
|
|
int bch2_disk_accounting_mod(struct btree_trans *trans,
|
|
struct disk_accounting_pos *k,
|
|
s64 *d, unsigned nr, bool gc)
|
|
{
|
|
+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
|
|
+
|
|
/* Normalize: */
|
|
switch (k->type) {
|
|
case BCH_DISK_ACCOUNTING_replicas:
|
|
@@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans,
|
|
break;
|
|
}
|
|
|
|
- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
|
|
+ struct bpos pos = disk_accounting_pos_to_bpos(k);
|
|
+
|
|
+ if (likely(!gc)) {
|
|
+ struct bkey_i_accounting *a;
|
|
+#if 0
|
|
+ for (a = btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+ a != btree_trans_subbuf_top(trans, &trans->accounting);
|
|
+ a = (void *) bkey_next(&a->k_i))
|
|
+ if (bpos_eq(a->k.p, pos)) {
|
|
+ BUG_ON(nr != bch2_accounting_counters(&a->k));
|
|
+ acc_u64s(a->v.d, d, nr);
|
|
+
|
|
+ if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
|
|
+ unsigned offset = (u64 *) a -
|
|
+ (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+
|
|
+ trans->accounting.u64s -= a->k.u64s;
|
|
+ memmove_u64s_down(a,
|
|
+ bkey_next(&a->k_i),
|
|
+ trans->accounting.u64s - offset);
|
|
+ }
|
|
+ return 0;
|
|
+ }
|
|
+#endif
|
|
+ unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
|
|
+ a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
|
|
+ int ret = PTR_ERR_OR_ZERO(a);
|
|
+ if (ret)
|
|
+ return ret;
|
|
|
|
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
|
|
+ __accounting_key_init(&a->k_i, pos, d, nr);
|
|
+ return 0;
|
|
+ } else {
|
|
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
|
|
|
|
- accounting_key_init(&k_i.k, k, d, nr);
|
|
+ __accounting_key_init(&k_i.k, pos, d, nr);
|
|
|
|
- if (unlikely(gc)) {
|
|
int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
|
|
if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
|
|
ret = drop_locks_do(trans,
|
|
bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
|
|
bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
|
|
return ret;
|
|
- } else {
|
|
- return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
|
|
}
|
|
}
|
|
|
|
@@ -114,10 +150,9 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
|
|
unsigned dev, s64 sectors,
|
|
bool gc)
|
|
{
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- };
|
|
-
|
|
+ struct disk_accounting_pos acc;
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
bch2_replicas_entry_cached(&acc.replicas, dev);
|
|
|
|
return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc);
|
|
@@ -135,6 +170,12 @@ static inline bool is_zero(char *start, char *end)
|
|
|
|
#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
|
|
|
|
+static const unsigned bch2_accounting_type_nr_counters[] = {
|
|
+#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr,
|
|
+ BCH_DISK_ACCOUNTING_TYPES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
struct bkey_validate_context from)
|
|
{
|
|
@@ -193,6 +234,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
|
|
c, accounting_key_junk_at_end,
|
|
"junk at end of accounting key");
|
|
+
|
|
+ bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
|
|
+ c, accounting_key_nr_counters_wrong,
|
|
+ "accounting key with %u counters, should be %u",
|
|
+ bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
|
|
fsck_err:
|
|
return ret;
|
|
}
|
|
@@ -277,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
|
|
|
|
static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
|
|
{
|
|
- struct bch_replicas_padded r;
|
|
+ union bch_replicas_padded r;
|
|
return accounting_to_replicas(&r.e, p)
|
|
? bch2_mark_replicas(c, &r.e)
|
|
: 0;
|
|
@@ -289,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
|
|
*/
|
|
int bch2_accounting_update_sb(struct btree_trans *trans)
|
|
{
|
|
- for (struct jset_entry *i = trans->journal_entries;
|
|
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
|
- i = vstruct_next(i))
|
|
- if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
|
|
- int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
|
|
- if (ret)
|
|
- return ret;
|
|
- }
|
|
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+ i != btree_trans_subbuf_top(trans, &trans->accounting);
|
|
+ i = bkey_next(i)) {
|
|
+ int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
|
|
return 0;
|
|
}
|
|
@@ -351,7 +396,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
|
|
int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
|
|
enum bch_accounting_mode mode)
|
|
{
|
|
- struct bch_replicas_padded r;
|
|
+ union bch_replicas_padded r;
|
|
|
|
if (mode != BCH_ACCOUNTING_read &&
|
|
accounting_to_replicas(&r.e, a.k->p) &&
|
|
@@ -366,6 +411,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
|
|
+ enum bch_accounting_mode mode)
|
|
+{
|
|
+ union bch_replicas_padded r;
|
|
+
|
|
+ if (mode != BCH_ACCOUNTING_read &&
|
|
+ accounting_to_replicas(&r.e, a.k->p) &&
|
|
+ !bch2_replicas_marked_locked(c, &r.e))
|
|
+ return -BCH_ERR_btree_insert_need_mark_replicas;
|
|
+
|
|
+ return __bch2_accounting_mem_insert(c, a);
|
|
+}
|
|
+
|
|
static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
|
|
{
|
|
for (unsigned i = 0; i < e->nr_counters; i++)
|
|
@@ -415,10 +473,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
|
|
|
|
percpu_down_read(&c->mark_lock);
|
|
darray_for_each(acc->k, i) {
|
|
- struct {
|
|
+ union {
|
|
+ u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
|
|
+ BCH_BKEY_PTRS_MAX)];
|
|
struct bch_replicas_usage r;
|
|
- u8 pad[BCH_BKEY_PTRS_MAX];
|
|
} u;
|
|
+ u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
|
|
|
|
if (!accounting_to_replicas(&u.r.r, i->pos))
|
|
continue;
|
|
@@ -547,11 +607,11 @@ int bch2_gc_accounting_done(struct bch_fs *c)
|
|
prt_str(&buf, "accounting mismatch for ");
|
|
bch2_accounting_key_to_text(&buf, &acc_k);
|
|
|
|
- prt_str(&buf, ": got");
|
|
+ prt_str(&buf, ":\n got");
|
|
for (unsigned j = 0; j < nr; j++)
|
|
prt_printf(&buf, " %llu", dst_v[j]);
|
|
|
|
- prt_str(&buf, " should be");
|
|
+ prt_str(&buf, "\nshould be");
|
|
for (unsigned j = 0; j < nr; j++)
|
|
prt_printf(&buf, " %llu", src_v[j]);
|
|
|
|
@@ -573,7 +633,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
|
|
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
|
|
bch2_accounting_mem_mod_locked(trans,
|
|
bkey_i_to_s_c_accounting(&k_i.k),
|
|
- BCH_ACCOUNTING_normal);
|
|
+ BCH_ACCOUNTING_normal, true);
|
|
|
|
preempt_disable();
|
|
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
|
|
@@ -602,23 +662,23 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
|
|
|
|
percpu_down_read(&c->mark_lock);
|
|
int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
|
|
- BCH_ACCOUNTING_read);
|
|
+ BCH_ACCOUNTING_read, false);
|
|
percpu_up_read(&c->mark_lock);
|
|
return ret;
|
|
}
|
|
|
|
static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
|
|
- struct disk_accounting_pos acc,
|
|
+ struct disk_accounting_pos *acc,
|
|
u64 *v, unsigned nr)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct printbuf buf = PRINTBUF;
|
|
int ret = 0, invalid_dev = -1;
|
|
|
|
- switch (acc.type) {
|
|
+ switch (acc->type) {
|
|
case BCH_DISK_ACCOUNTING_replicas: {
|
|
- struct bch_replicas_padded r;
|
|
- __accounting_to_replicas(&r.e, &acc);
|
|
+ union bch_replicas_padded r;
|
|
+ __accounting_to_replicas(&r.e, acc);
|
|
|
|
for (unsigned i = 0; i < r.e.nr_devs; i++)
|
|
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
|
|
@@ -635,9 +695,9 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
|
|
|
|
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
|
|
trans, accounting_replicas_not_marked,
|
|
- "accounting not marked in superblock replicas\n %s",
|
|
+ "accounting not marked in superblock replicas\n%s",
|
|
(printbuf_reset(&buf),
|
|
- bch2_accounting_key_to_text(&buf, &acc),
|
|
+ bch2_accounting_key_to_text(&buf, acc),
|
|
buf.buf))) {
|
|
/*
|
|
* We're not RW yet and still single threaded, dropping
|
|
@@ -653,8 +713,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
|
|
}
|
|
|
|
case BCH_DISK_ACCOUNTING_dev_data_type:
|
|
- if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
|
|
- invalid_dev = acc.dev_data_type.dev;
|
|
+ if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
|
|
+ invalid_dev = acc->dev_data_type.dev;
|
|
goto invalid_device;
|
|
}
|
|
break;
|
|
@@ -665,16 +725,16 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
|
|
return ret;
|
|
invalid_device:
|
|
if (fsck_err(trans, accounting_to_invalid_device,
|
|
- "accounting entry points to invalid device %i\n %s",
|
|
+ "accounting entry points to invalid device %i\n%s",
|
|
invalid_dev,
|
|
(printbuf_reset(&buf),
|
|
- bch2_accounting_key_to_text(&buf, &acc),
|
|
+ bch2_accounting_key_to_text(&buf, acc),
|
|
buf.buf))) {
|
|
for (unsigned i = 0; i < nr; i++)
|
|
v[i] = -v[i];
|
|
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
- bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
|
|
+ bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
|
|
-BCH_ERR_remove_disk_accounting_entry;
|
|
} else {
|
|
ret = -BCH_ERR_remove_disk_accounting_entry;
|
|
@@ -725,9 +785,11 @@ int bch2_accounting_read(struct bch_fs *c)
|
|
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
|
|
break;
|
|
|
|
- if (!bch2_accounting_is_mem(acc_k)) {
|
|
- struct disk_accounting_pos next = { .type = acc_k.type + 1 };
|
|
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
|
|
+ if (!bch2_accounting_is_mem(&acc_k)) {
|
|
+ struct disk_accounting_pos next;
|
|
+ memset(&next, 0, sizeof(next));
|
|
+ next.type = acc_k.type + 1;
|
|
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
|
|
continue;
|
|
}
|
|
|
|
@@ -745,7 +807,7 @@ int bch2_accounting_read(struct bch_fs *c)
|
|
struct disk_accounting_pos acc_k;
|
|
bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
|
|
|
|
- if (!bch2_accounting_is_mem(acc_k))
|
|
+ if (!bch2_accounting_is_mem(&acc_k))
|
|
continue;
|
|
|
|
struct bkey_s_c k = bkey_i_to_s_c(i->k);
|
|
@@ -801,7 +863,7 @@ int bch2_accounting_read(struct bch_fs *c)
|
|
*/
|
|
ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
|
|
? -BCH_ERR_remove_disk_accounting_entry
|
|
- : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
|
|
+ : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
|
|
|
|
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
|
|
free_percpu(i->v[0]);
|
|
@@ -882,15 +944,13 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
|
|
int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
|
|
{
|
|
struct bch_fs *c = ca->fs;
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
|
|
- .dev_data_type.dev = ca->dev_idx,
|
|
- .dev_data_type.data_type = BCH_DATA_free,
|
|
- };
|
|
u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
|
|
|
|
int ret = bch2_trans_do(c, ({
|
|
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
|
|
+ bch2_disk_accounting_mod2(trans, gc,
|
|
+ v, dev_data_type,
|
|
+ .dev = ca->dev_idx,
|
|
+ .data_type = BCH_DATA_free) ?:
|
|
(!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
|
|
}));
|
|
bch_err_fn(c, ret);
|
|
@@ -916,9 +976,11 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
|
|
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
|
|
break;
|
|
|
|
- if (!bch2_accounting_is_mem(acc_k)) {
|
|
- struct disk_accounting_pos next = { .type = acc_k.type + 1 };
|
|
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
|
|
+ if (!bch2_accounting_is_mem(&acc_k)) {
|
|
+ struct disk_accounting_pos next;
|
|
+ memset(&next, 0, sizeof(next));
|
|
+ next.type = acc_k.type + 1;
|
|
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
|
|
continue;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
|
|
index f4372cafea2e..f6098e33ab30 100644
|
|
--- a/fs/bcachefs/disk_accounting.h
|
|
+++ b/fs/bcachefs/disk_accounting.h
|
|
@@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
|
|
static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
|
|
struct bkey_s_c_accounting src)
|
|
{
|
|
- EBUG_ON(dst->k.u64s != src.k->u64s);
|
|
-
|
|
- for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
|
|
+ for (unsigned i = 0;
|
|
+ i < min(bch2_accounting_counters(&dst->k),
|
|
+ bch2_accounting_counters(src.k));
|
|
+ i++)
|
|
dst->v.d[i] += src.v->d[i];
|
|
+
|
|
if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
|
|
dst->k.bversion = src.k->bversion;
|
|
}
|
|
@@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos
|
|
|
|
int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
|
|
s64 *, unsigned, bool);
|
|
+
|
|
+#define disk_accounting_key_init(_k, _type, ...) \
|
|
+do { \
|
|
+ memset(&(_k), 0, sizeof(_k)); \
|
|
+ (_k).type = BCH_DISK_ACCOUNTING_##_type; \
|
|
+ (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
|
|
+} while (0)
|
|
+
|
|
+#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \
|
|
+({ \
|
|
+ struct disk_accounting_pos pos; \
|
|
+ disk_accounting_key_init(pos, __VA_ARGS__); \
|
|
+ bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \
|
|
+})
|
|
+
|
|
+#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
|
|
+ bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
|
|
+
|
|
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
|
|
|
|
int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
|
|
@@ -116,12 +136,13 @@ enum bch_accounting_mode {
|
|
};
|
|
|
|
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
|
|
+int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
|
|
void bch2_accounting_mem_gc(struct bch_fs *);
|
|
|
|
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
|
|
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
|
|
{
|
|
- return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
|
|
- acc.type != BCH_DISK_ACCOUNTING_inum;
|
|
+ return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR &&
|
|
+ acc->type != BCH_DISK_ACCOUNTING_inum;
|
|
}
|
|
|
|
/*
|
|
@@ -130,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
|
|
*/
|
|
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
|
struct bkey_s_c_accounting a,
|
|
- enum bch_accounting_mode mode)
|
|
+ enum bch_accounting_mode mode,
|
|
+ bool write_locked)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct bch_accounting_mem *acc = &c->accounting;
|
|
@@ -141,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
|
if (gc && !acc->gc_running)
|
|
return 0;
|
|
|
|
- if (!bch2_accounting_is_mem(acc_k))
|
|
+ if (!bch2_accounting_is_mem(&acc_k))
|
|
return 0;
|
|
|
|
if (mode == BCH_ACCOUNTING_normal) {
|
|
@@ -169,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
|
|
|
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
|
|
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
|
|
- int ret = bch2_accounting_mem_insert(c, a, mode);
|
|
+ int ret = 0;
|
|
+ if (unlikely(write_locked))
|
|
+ ret = bch2_accounting_mem_insert_locked(c, a, mode);
|
|
+ else
|
|
+ ret = bch2_accounting_mem_insert(c, a, mode);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -186,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
|
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
|
|
{
|
|
percpu_down_read(&trans->c->mark_lock);
|
|
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
|
|
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
|
|
percpu_up_read(&trans->c->mark_lock);
|
|
return ret;
|
|
}
|
|
@@ -233,13 +259,13 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
|
|
struct bkey_i_accounting *a,
|
|
unsigned commit_flags)
|
|
{
|
|
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
|
|
- (u64 *) a - (u64 *) trans->journal_entries);
|
|
+ u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
|
|
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
|
|
|
|
EBUG_ON(bversion_zero(a->k.bversion));
|
|
|
|
return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
|
|
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
|
|
+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
|
|
: 0;
|
|
}
|
|
|
|
@@ -251,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
|
|
struct bkey_s_accounting a = accounting_i_to_s(a_i);
|
|
|
|
bch2_accounting_neg(a);
|
|
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
|
|
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
|
|
bch2_accounting_neg(a);
|
|
}
|
|
}
|
|
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
|
|
index 7b6e6c97e6aa..8269af1dbe2a 100644
|
|
--- a/fs/bcachefs/disk_accounting_format.h
|
|
+++ b/fs/bcachefs/disk_accounting_format.h
|
|
@@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
|
|
}
|
|
}
|
|
|
|
+/*
|
|
+ * field 1: name
|
|
+ * field 2: id
|
|
+ * field 3: number of counters (max 3)
|
|
+ */
|
|
+
|
|
#define BCH_DISK_ACCOUNTING_TYPES() \
|
|
- x(nr_inodes, 0) \
|
|
- x(persistent_reserved, 1) \
|
|
- x(replicas, 2) \
|
|
- x(dev_data_type, 3) \
|
|
- x(compression, 4) \
|
|
- x(snapshot, 5) \
|
|
- x(btree, 6) \
|
|
- x(rebalance_work, 7) \
|
|
- x(inum, 8)
|
|
+ x(nr_inodes, 0, 1) \
|
|
+ x(persistent_reserved, 1, 1) \
|
|
+ x(replicas, 2, 1) \
|
|
+ x(dev_data_type, 3, 3) \
|
|
+ x(compression, 4, 3) \
|
|
+ x(snapshot, 5, 1) \
|
|
+ x(btree, 6, 1) \
|
|
+ x(rebalance_work, 7, 1) \
|
|
+ x(inum, 8, 3)
|
|
|
|
enum disk_accounting_type {
|
|
-#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
|
|
+#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
|
|
BCH_DISK_ACCOUNTING_TYPES()
|
|
#undef x
|
|
BCH_DISK_ACCOUNTING_TYPE_NR,
|
|
};
|
|
|
|
-struct bch_nr_inodes {
|
|
+/*
|
|
+ * No subtypes - number of inodes in the entire filesystem
|
|
+ *
|
|
+ * XXX: perhaps we could add a per-subvolume counter?
|
|
+ */
|
|
+struct bch_acct_nr_inodes {
|
|
};
|
|
|
|
-struct bch_persistent_reserved {
|
|
+/*
|
|
+ * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
|
|
+ * reservation:
|
|
+ */
|
|
+struct bch_acct_persistent_reserved {
|
|
__u8 nr_replicas;
|
|
};
|
|
|
|
-struct bch_dev_data_type {
|
|
+/*
|
|
+ * device, data type counter fields:
|
|
+ * [
|
|
+ * nr_buckets
|
|
+ * live sectors (in buckets of that data type)
|
|
+ * sectors of internal fragmentation
|
|
+ * ]
|
|
+ *
|
|
+ * XXX: live sectors should've been done differently, you can have multiple data
|
|
+ * types in the same bucket (user, stripe, cached) and this collapses them to
|
|
+ * the bucket data type, and makes the internal fragmentation counter redundant
|
|
+ */
|
|
+struct bch_acct_dev_data_type {
|
|
__u8 dev;
|
|
__u8 data_type;
|
|
};
|
|
|
|
+/*
|
|
+ * Compression type fields:
|
|
+ * [
|
|
+ * number of extents
|
|
+ * uncompressed size
|
|
+ * compressed size
|
|
+ * ]
|
|
+ *
|
|
+ * Compression ratio, average extent size (fragmentation).
|
|
+ */
|
|
struct bch_acct_compression {
|
|
__u8 type;
|
|
};
|
|
|
|
+/*
|
|
+ * On disk usage by snapshot id; counts same values as replicas counter, but
|
|
+ * aggregated differently
|
|
+ */
|
|
struct bch_acct_snapshot {
|
|
__u32 id;
|
|
} __packed;
|
|
@@ -137,10 +178,27 @@ struct bch_acct_btree {
|
|
__u32 id;
|
|
} __packed;
|
|
|
|
+/*
|
|
+ * inum counter fields:
|
|
+ * [
|
|
+ * number of extents
|
|
+ * sum of extent sizes - bkey size
|
|
+ * this field is similar to inode.bi_sectors, except here extents in
|
|
+ * different snapshots but the same inode number are all collapsed to the
|
|
+ * same counter
|
|
+ * sum of on disk size - same values tracked by replicas counters
|
|
+ * ]
|
|
+ *
|
|
+ * This tracks on disk fragmentation.
|
|
+ */
|
|
struct bch_acct_inum {
|
|
__u64 inum;
|
|
} __packed;
|
|
|
|
+/*
|
|
+ * Simple counter of the amount of data (on disk sectors) rebalance needs to
|
|
+ * move, extents counted here are also in the rebalance_work btree.
|
|
+ */
|
|
struct bch_acct_rebalance_work {
|
|
};
|
|
|
|
@@ -149,10 +207,10 @@ struct disk_accounting_pos {
|
|
struct {
|
|
__u8 type;
|
|
union {
|
|
- struct bch_nr_inodes nr_inodes;
|
|
- struct bch_persistent_reserved persistent_reserved;
|
|
+ struct bch_acct_nr_inodes nr_inodes;
|
|
+ struct bch_acct_persistent_reserved persistent_reserved;
|
|
struct bch_replicas_entry_v1 replicas;
|
|
- struct bch_dev_data_type dev_data_type;
|
|
+ struct bch_acct_dev_data_type dev_data_type;
|
|
struct bch_acct_compression compression;
|
|
struct bch_acct_snapshot snapshot;
|
|
struct bch_acct_btree btree;
|
|
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
|
|
index b1982131b206..242b3270cd5c 100644
|
|
--- a/fs/bcachefs/disk_accounting_types.h
|
|
+++ b/fs/bcachefs/disk_accounting_types.h
|
|
@@ -2,7 +2,7 @@
|
|
#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
|
|
#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
|
|
|
|
-#include "darray.h"
|
|
+#include <linux/darray.h>
|
|
|
|
struct accounting_mem_entry {
|
|
struct bpos pos;
|
|
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
|
|
index 5df8de0b8c02..c20ecf5e5381 100644
|
|
--- a/fs/bcachefs/disk_groups.c
|
|
+++ b/fs/bcachefs/disk_groups.c
|
|
@@ -86,35 +86,6 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *
|
|
return ret;
|
|
}
|
|
|
|
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
|
|
-{
|
|
- out->atomic++;
|
|
- rcu_read_lock();
|
|
-
|
|
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
|
|
- if (!g)
|
|
- goto out;
|
|
-
|
|
- for (unsigned i = 0; i < g->nr; i++) {
|
|
- if (i)
|
|
- prt_printf(out, " ");
|
|
-
|
|
- if (g->entries[i].deleted) {
|
|
- prt_printf(out, "[deleted]");
|
|
- continue;
|
|
- }
|
|
-
|
|
- prt_printf(out, "[parent %d devs", g->entries[i].parent);
|
|
- for_each_member_device_rcu(c, ca, &g->entries[i].devs)
|
|
- prt_printf(out, " %s", ca->name);
|
|
- prt_printf(out, "]");
|
|
- }
|
|
-
|
|
-out:
|
|
- rcu_read_unlock();
|
|
- out->atomic--;
|
|
-}
|
|
-
|
|
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
|
|
struct bch_sb *sb,
|
|
struct bch_sb_field *f)
|
|
@@ -241,20 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
|
|
case TARGET_DEV:
|
|
return dev == t.dev;
|
|
case TARGET_GROUP: {
|
|
- struct bch_disk_groups_cpu *g;
|
|
- const struct bch_devs_mask *m;
|
|
- bool ret;
|
|
-
|
|
- rcu_read_lock();
|
|
- g = rcu_dereference(c->disk_groups);
|
|
- m = g && t.group < g->nr && !g->entries[t.group].deleted
|
|
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
|
|
+ const struct bch_devs_mask *m =
|
|
+ g && t.group < g->nr && !g->entries[t.group].deleted
|
|
? &g->entries[t.group].devs
|
|
: NULL;
|
|
|
|
- ret = m ? test_bit(dev, m->d) : false;
|
|
- rcu_read_unlock();
|
|
-
|
|
- return ret;
|
|
+ return m ? test_bit(dev, m->d) : false;
|
|
}
|
|
default:
|
|
BUG();
|
|
@@ -377,54 +341,81 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
|
|
return v;
|
|
}
|
|
|
|
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
|
|
+static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
|
|
+ unsigned v)
|
|
{
|
|
- struct bch_disk_groups_cpu *groups;
|
|
- struct bch_disk_group_cpu *g;
|
|
- unsigned nr = 0;
|
|
u16 path[32];
|
|
-
|
|
- out->atomic++;
|
|
- rcu_read_lock();
|
|
- groups = rcu_dereference(c->disk_groups);
|
|
- if (!groups)
|
|
- goto invalid;
|
|
+ unsigned nr = 0;
|
|
|
|
while (1) {
|
|
if (nr == ARRAY_SIZE(path))
|
|
goto invalid;
|
|
|
|
- if (v >= groups->nr)
|
|
+ if (v >= (g ? g->nr : 0))
|
|
goto invalid;
|
|
|
|
- g = groups->entries + v;
|
|
+ struct bch_disk_group_cpu *e = g->entries + v;
|
|
|
|
- if (g->deleted)
|
|
+ if (e->deleted)
|
|
goto invalid;
|
|
|
|
path[nr++] = v;
|
|
|
|
- if (!g->parent)
|
|
+ if (!e->parent)
|
|
break;
|
|
|
|
- v = g->parent - 1;
|
|
+ v = e->parent - 1;
|
|
}
|
|
|
|
while (nr) {
|
|
- v = path[--nr];
|
|
- g = groups->entries + v;
|
|
+ struct bch_disk_group_cpu *e = g->entries + path[--nr];
|
|
|
|
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
|
|
+ prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
|
|
if (nr)
|
|
prt_printf(out, ".");
|
|
}
|
|
-out:
|
|
- rcu_read_unlock();
|
|
- out->atomic--;
|
|
return;
|
|
invalid:
|
|
prt_printf(out, "invalid label %u", v);
|
|
- goto out;
|
|
+}
|
|
+
|
|
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ bch2_printbuf_make_room(out, 4096);
|
|
+
|
|
+ out->atomic++;
|
|
+ rcu_read_lock();
|
|
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
|
|
+
|
|
+ for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
|
|
+ prt_printf(out, "%2u: ", i);
|
|
+
|
|
+ if (g->entries[i].deleted) {
|
|
+ prt_printf(out, "[deleted]");
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ __bch2_disk_path_to_text(out, g, i);
|
|
+
|
|
+ prt_printf(out, " devs");
|
|
+
|
|
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
|
|
+ prt_printf(out, " %s", ca->name);
|
|
+next:
|
|
+ prt_newline(out);
|
|
+ }
|
|
+
|
|
+ rcu_read_unlock();
|
|
+ out->atomic--;
|
|
+}
|
|
+
|
|
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
|
|
+{
|
|
+ out->atomic++;
|
|
+ rcu_read_lock();
|
|
+ __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
|
|
+ rcu_read_unlock();
|
|
+ --out->atomic;
|
|
}
|
|
|
|
void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
|
|
@@ -470,23 +461,22 @@ void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned
|
|
|
|
int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
|
|
{
|
|
- struct bch_member *mi;
|
|
- int ret, v = -1;
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
|
|
- if (!strlen(name) || !strcmp(name, "none"))
|
|
- return 0;
|
|
|
|
- v = bch2_disk_path_find_or_create(&c->disk_sb, name);
|
|
- if (v < 0)
|
|
- return v;
|
|
+ if (!strlen(name) || !strcmp(name, "none")) {
|
|
+ struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
|
|
+ SET_BCH_MEMBER_GROUP(mi, 0);
|
|
+ } else {
|
|
+ int v = bch2_disk_path_find_or_create(&c->disk_sb, name);
|
|
+ if (v < 0)
|
|
+ return v;
|
|
|
|
- ret = bch2_sb_disk_groups_to_cpu(c);
|
|
- if (ret)
|
|
- return ret;
|
|
+ struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
|
|
+ SET_BCH_MEMBER_GROUP(mi, v + 1);
|
|
+ }
|
|
|
|
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
|
|
- SET_BCH_MEMBER_GROUP(mi, v + 1);
|
|
- return 0;
|
|
+ return bch2_sb_disk_groups_to_cpu(c);
|
|
}
|
|
|
|
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
|
|
@@ -555,14 +545,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
|
|
? rcu_dereference(c->devs[t.dev])
|
|
: NULL;
|
|
|
|
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
|
|
+ if (ca && ca->disk_sb.bdev)
|
|
prt_printf(out, "/dev/%s", ca->name);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- } else if (ca) {
|
|
+ else if (ca)
|
|
prt_printf(out, "offline device %u", t.dev);
|
|
- } else {
|
|
+ else
|
|
prt_printf(out, "invalid device %u", t.dev);
|
|
- }
|
|
|
|
rcu_read_unlock();
|
|
out->atomic--;
|
|
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
|
|
index d2a5e76e6479..c581426e3894 100644
|
|
--- a/fs/bcachefs/ec.c
|
|
+++ b/fs/bcachefs/ec.c
|
|
@@ -16,10 +16,12 @@
|
|
#include "disk_accounting.h"
|
|
#include "disk_groups.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "io_read.h"
|
|
#include "io_write.h"
|
|
#include "keylist.h"
|
|
+#include "lru.h"
|
|
#include "recovery.h"
|
|
#include "replicas.h"
|
|
#include "super-io.h"
|
|
@@ -104,6 +106,8 @@ struct ec_bio {
|
|
struct bch_dev *ca;
|
|
struct ec_stripe_buf *buf;
|
|
size_t idx;
|
|
+ int rw;
|
|
+ u64 submit_time;
|
|
struct bio bio;
|
|
};
|
|
|
|
@@ -298,15 +302,27 @@ static int mark_stripe_bucket(struct btree_trans *trans,
|
|
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
|
|
|
|
if (flags & BTREE_TRIGGER_transactional) {
|
|
+ struct extent_ptr_decoded p = {
|
|
+ .ptr = *ptr,
|
|
+ .crc = bch2_extent_crc_unpack(s.k, NULL),
|
|
+ };
|
|
+ struct bkey_i_backpointer bp;
|
|
+ bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
|
|
+ (const union bch_extent_entry *) ptr, &bp);
|
|
+
|
|
struct bkey_i_alloc_v4 *a =
|
|
bch2_trans_start_alloc_update(trans, bucket, 0);
|
|
- ret = PTR_ERR_OR_ZERO(a) ?:
|
|
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
|
|
+ ret = PTR_ERR_OR_ZERO(a) ?:
|
|
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
|
|
+ bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
|
|
+ !(flags & BTREE_TRIGGER_overwrite));
|
|
+ if (ret)
|
|
+ goto err;
|
|
}
|
|
|
|
if (flags & BTREE_TRIGGER_gc) {
|
|
struct bucket *g = gc_bucket(ca, bucket.offset);
|
|
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
|
|
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
|
|
ptr->dev,
|
|
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
|
ret = -BCH_ERR_mark_stripe;
|
|
@@ -366,19 +382,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
|
|
-{
|
|
- m->sectors = le16_to_cpu(s->sectors);
|
|
- m->algorithm = s->algorithm;
|
|
- m->nr_blocks = s->nr_blocks;
|
|
- m->nr_redundant = s->nr_redundant;
|
|
- m->disk_label = s->disk_label;
|
|
- m->blocks_nonempty = 0;
|
|
-
|
|
- for (unsigned i = 0; i < s->nr_blocks; i++)
|
|
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
|
|
-}
|
|
-
|
|
int bch2_trigger_stripe(struct btree_trans *trans,
|
|
enum btree_id btree, unsigned level,
|
|
struct bkey_s_c old, struct bkey_s _new,
|
|
@@ -399,6 +402,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
|
|
(new_s->nr_blocks != old_s->nr_blocks ||
|
|
new_s->nr_redundant != old_s->nr_redundant));
|
|
|
|
+ if (flags & BTREE_TRIGGER_transactional) {
|
|
+ int ret = bch2_lru_change(trans,
|
|
+ BCH_LRU_STRIPE_FRAGMENTATION,
|
|
+ idx,
|
|
+ stripe_lru_pos(old_s),
|
|
+ stripe_lru_pos(new_s));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
|
|
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
|
|
/*
|
|
@@ -443,24 +455,25 @@ int bch2_trigger_stripe(struct btree_trans *trans,
|
|
if (new_s) {
|
|
s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
|
|
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- };
|
|
+ struct disk_accounting_pos acc;
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
bch2_bkey_to_replicas(&acc.replicas, new);
|
|
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (gc)
|
|
- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
|
|
+ unsafe_memcpy(&gc->r.e, &acc.replicas,
|
|
+ replicas_entry_bytes(&acc.replicas), "VLA");
|
|
}
|
|
|
|
if (old_s) {
|
|
s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
|
|
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- };
|
|
+ struct disk_accounting_pos acc;
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
bch2_bkey_to_replicas(&acc.replicas, old);
|
|
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
|
|
if (ret)
|
|
@@ -472,38 +485,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
- if (flags & BTREE_TRIGGER_atomic) {
|
|
- struct stripe *m = genradix_ptr(&c->stripes, idx);
|
|
-
|
|
- if (!m) {
|
|
- struct printbuf buf1 = PRINTBUF;
|
|
- struct printbuf buf2 = PRINTBUF;
|
|
-
|
|
- bch2_bkey_val_to_text(&buf1, c, old);
|
|
- bch2_bkey_val_to_text(&buf2, c, new);
|
|
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
|
|
- "old %s\n"
|
|
- "new %s", idx, buf1.buf, buf2.buf);
|
|
- printbuf_exit(&buf2);
|
|
- printbuf_exit(&buf1);
|
|
- bch2_inconsistent_error(c);
|
|
- return -1;
|
|
- }
|
|
-
|
|
- if (!new_s) {
|
|
- bch2_stripes_heap_del(c, m, idx);
|
|
-
|
|
- memset(m, 0, sizeof(*m));
|
|
- } else {
|
|
- stripe_to_mem(m, new_s);
|
|
-
|
|
- if (!old_s)
|
|
- bch2_stripes_heap_insert(c, m, idx);
|
|
- else
|
|
- bch2_stripes_heap_update(c, m, idx);
|
|
- }
|
|
- }
|
|
-
|
|
return 0;
|
|
}
|
|
|
|
@@ -527,20 +508,14 @@ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
|
|
|
|
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
|
|
{
|
|
- switch (k.k->type) {
|
|
- case KEY_TYPE_extent: {
|
|
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
|
- const union bch_extent_entry *entry;
|
|
-
|
|
- extent_for_each_entry(e, entry)
|
|
- if (extent_entry_type(entry) ==
|
|
- BCH_EXTENT_ENTRY_stripe_ptr &&
|
|
- entry->stripe_ptr.idx == idx)
|
|
- return true;
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
|
|
- break;
|
|
- }
|
|
- }
|
|
+ bkey_extent_entry_for_each(ptrs, entry)
|
|
+ if (extent_entry_type(entry) ==
|
|
+ BCH_EXTENT_ENTRY_stripe_ptr &&
|
|
+ entry->stripe_ptr.idx == idx)
|
|
+ return true;
|
|
|
|
return false;
|
|
}
|
|
@@ -725,15 +700,20 @@ static void ec_block_endio(struct bio *bio)
|
|
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
|
|
struct bch_dev *ca = ec_bio->ca;
|
|
struct closure *cl = bio->bi_private;
|
|
+ int rw = ec_bio->rw;
|
|
+ unsigned ref = rw == READ
|
|
+ ? BCH_DEV_READ_REF_ec_block
|
|
+ : BCH_DEV_WRITE_REF_ec_block;
|
|
|
|
- if (bch2_dev_io_err_on(bio->bi_status, ca,
|
|
- bio_data_dir(bio)
|
|
- ? BCH_MEMBER_ERROR_write
|
|
- : BCH_MEMBER_ERROR_read,
|
|
- "erasure coding %s error: %s",
|
|
+ bch2_account_io_completion(ca, bio_data_dir(bio),
|
|
+ ec_bio->submit_time, !bio->bi_status);
|
|
+
|
|
+ if (bio->bi_status) {
|
|
+ bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
|
|
str_write_read(bio_data_dir(bio)),
|
|
- bch2_blk_status_to_str(bio->bi_status)))
|
|
+ bch2_blk_status_to_str(bio->bi_status));
|
|
clear_bit(ec_bio->idx, ec_bio->buf->valid);
|
|
+ }
|
|
|
|
int stale = dev_ptr_stale(ca, ptr);
|
|
if (stale) {
|
|
@@ -745,7 +725,7 @@ static void ec_block_endio(struct bio *bio)
|
|
}
|
|
|
|
bio_put(&ec_bio->bio);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[rw], ref);
|
|
closure_put(cl);
|
|
}
|
|
|
|
@@ -759,8 +739,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
|
|
? BCH_DATA_user
|
|
: BCH_DATA_parity;
|
|
int rw = op_is_write(opf);
|
|
+ unsigned ref = rw == READ
|
|
+ ? BCH_DEV_READ_REF_ec_block
|
|
+ : BCH_DEV_WRITE_REF_ec_block;
|
|
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
|
|
if (!ca) {
|
|
clear_bit(idx, buf->valid);
|
|
return;
|
|
@@ -796,6 +779,8 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
|
|
ec_bio->ca = ca;
|
|
ec_bio->buf = buf;
|
|
ec_bio->idx = idx;
|
|
+ ec_bio->rw = rw;
|
|
+ ec_bio->submit_time = local_clock();
|
|
|
|
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
|
|
ec_bio->bio.bi_end_io = ec_block_endio;
|
|
@@ -804,14 +789,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
|
|
bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
|
|
|
|
closure_get(cl);
|
|
- percpu_ref_get(&ca->io_ref);
|
|
+ enumerated_ref_get(&ca->io_ref[rw], ref);
|
|
|
|
submit_bio(&ec_bio->bio);
|
|
|
|
offset += b;
|
|
}
|
|
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[rw], ref);
|
|
}
|
|
|
|
static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
|
|
@@ -917,26 +902,6 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
|
|
|
|
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
|
|
{
|
|
- ec_stripes_heap n, *h = &c->ec_stripes_heap;
|
|
-
|
|
- if (idx >= h->size) {
|
|
- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
|
|
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
|
|
-
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- if (n.size > h->size) {
|
|
- memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
|
|
- n.nr = h->nr;
|
|
- swap(*h, n);
|
|
- }
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
-
|
|
- free_heap(&n);
|
|
- }
|
|
-
|
|
- if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
|
|
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
|
|
-
|
|
if (c->gc_pos.phase != GC_PHASE_not_running &&
|
|
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
|
|
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
|
|
@@ -1009,188 +974,58 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
|
|
s->idx = 0;
|
|
}
|
|
|
|
-/* Heap of all existing stripes, ordered by blocks_nonempty */
|
|
-
|
|
-static u64 stripe_idx_to_delete(struct bch_fs *c)
|
|
-{
|
|
- ec_stripes_heap *h = &c->ec_stripes_heap;
|
|
-
|
|
- lockdep_assert_held(&c->ec_stripes_heap_lock);
|
|
-
|
|
- if (h->nr &&
|
|
- h->data[0].blocks_nonempty == 0 &&
|
|
- !bch2_stripe_is_open(c, h->data[0].idx))
|
|
- return h->data[0].idx;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
|
|
- size_t i)
|
|
-{
|
|
- struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
|
|
-
|
|
- genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
|
|
-}
|
|
-
|
|
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
|
|
-{
|
|
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
|
|
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
|
|
-
|
|
- return ((_l->blocks_nonempty > _r->blocks_nonempty) <
|
|
- (_l->blocks_nonempty < _r->blocks_nonempty));
|
|
-}
|
|
-
|
|
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
|
|
-{
|
|
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
|
|
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
|
|
- ec_stripes_heap *_h = (ec_stripes_heap *)h;
|
|
- size_t i = _l - _h->data;
|
|
- size_t j = _r - _h->data;
|
|
-
|
|
- swap(*_l, *_r);
|
|
-
|
|
- ec_stripes_heap_set_backpointer(_h, i);
|
|
- ec_stripes_heap_set_backpointer(_h, j);
|
|
-}
|
|
-
|
|
-static const struct min_heap_callbacks callbacks = {
|
|
- .less = ec_stripes_heap_cmp,
|
|
- .swp = ec_stripes_heap_swap,
|
|
-};
|
|
-
|
|
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
|
|
-{
|
|
- ec_stripes_heap *h = &c->ec_stripes_heap;
|
|
- struct stripe *m = genradix_ptr(&c->stripes, idx);
|
|
-
|
|
- BUG_ON(m->heap_idx >= h->nr);
|
|
- BUG_ON(h->data[m->heap_idx].idx != idx);
|
|
-}
|
|
-
|
|
-void bch2_stripes_heap_del(struct bch_fs *c,
|
|
- struct stripe *m, size_t idx)
|
|
-{
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- heap_verify_backpointer(c, idx);
|
|
-
|
|
- min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
-}
|
|
-
|
|
-void bch2_stripes_heap_insert(struct bch_fs *c,
|
|
- struct stripe *m, size_t idx)
|
|
-{
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- BUG_ON(min_heap_full(&c->ec_stripes_heap));
|
|
-
|
|
- genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
|
|
- min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
|
|
- .idx = idx,
|
|
- .blocks_nonempty = m->blocks_nonempty,
|
|
- }),
|
|
- &callbacks,
|
|
- &c->ec_stripes_heap);
|
|
-
|
|
- heap_verify_backpointer(c, idx);
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
-}
|
|
-
|
|
-void bch2_stripes_heap_update(struct bch_fs *c,
|
|
- struct stripe *m, size_t idx)
|
|
-{
|
|
- ec_stripes_heap *h = &c->ec_stripes_heap;
|
|
- bool do_deletes;
|
|
- size_t i;
|
|
-
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- heap_verify_backpointer(c, idx);
|
|
-
|
|
- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
|
|
-
|
|
- i = m->heap_idx;
|
|
- min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
|
|
- min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
|
|
-
|
|
- heap_verify_backpointer(c, idx);
|
|
-
|
|
- do_deletes = stripe_idx_to_delete(c) != 0;
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
-
|
|
- if (do_deletes)
|
|
- bch2_do_stripe_deletes(c);
|
|
-}
|
|
-
|
|
/* stripe deletion */
|
|
|
|
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
|
|
{
|
|
- struct bch_fs *c = trans->c;
|
|
struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- struct bkey_s_c_stripe s;
|
|
- int ret;
|
|
-
|
|
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
|
|
- BTREE_ITER_intent);
|
|
- ret = bkey_err(k);
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
|
|
+ BTREE_ID_stripes, POS(0, idx),
|
|
+ BTREE_ITER_intent);
|
|
+ int ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (k.k->type != KEY_TYPE_stripe) {
|
|
- bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
|
|
- ret = -EINVAL;
|
|
- goto err;
|
|
- }
|
|
-
|
|
- s = bkey_s_c_to_stripe(k);
|
|
- for (unsigned i = 0; i < s.v->nr_blocks; i++)
|
|
- if (stripe_blockcount_get(s.v, i)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
-
|
|
- bch2_bkey_val_to_text(&buf, c, k);
|
|
- bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- ret = -EINVAL;
|
|
- goto err;
|
|
- }
|
|
-
|
|
- ret = bch2_btree_delete_at(trans, &iter, 0);
|
|
+ /*
|
|
+ * We expect write buffer races here
|
|
+ * Important: check stripe_is_open with stripe key locked:
|
|
+ */
|
|
+ if (k.k->type == KEY_TYPE_stripe &&
|
|
+ !bch2_stripe_is_open(trans->c, idx) &&
|
|
+ stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
|
|
+ ret = bch2_btree_delete_at(trans, &iter, 0);
|
|
err:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
+/*
|
|
+ * XXX
|
|
+ * can we kill this and delete stripes from the trigger?
|
|
+ */
|
|
static void ec_stripe_delete_work(struct work_struct *work)
|
|
{
|
|
struct bch_fs *c =
|
|
container_of(work, struct bch_fs, ec_stripe_delete_work);
|
|
|
|
- while (1) {
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- u64 idx = stripe_idx_to_delete(c);
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
-
|
|
- if (!idx)
|
|
- break;
|
|
-
|
|
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
- ec_stripe_delete(trans, idx));
|
|
- bch_err_fn(c, ret);
|
|
- if (ret)
|
|
- break;
|
|
- }
|
|
-
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
|
|
+ bch2_trans_run(c,
|
|
+ bch2_btree_write_buffer_tryflush(trans) ?:
|
|
+ for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
|
|
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
|
|
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
|
|
+ 0, lru_k,
|
|
+ NULL, NULL,
|
|
+ BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ ec_stripe_delete(trans, lru_k.k->p.offset);
|
|
+ })));
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
|
|
}
|
|
|
|
void bch2_do_stripe_deletes(struct bch_fs *c)
|
|
{
|
|
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
|
|
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
|
|
!queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
|
|
}
|
|
|
|
/* stripe creation: */
|
|
@@ -1294,7 +1129,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
|
|
|
|
bch2_fs_inconsistent(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
- return -EIO;
|
|
+ return -BCH_ERR_erasure_coding_found_btree_node;
|
|
}
|
|
|
|
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
|
|
@@ -1360,7 +1195,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
|
|
|
|
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
|
|
if (!ca)
|
|
- return -EIO;
|
|
+ return -BCH_ERR_ENOENT_dev_not_found;
|
|
|
|
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
|
|
|
|
@@ -1380,8 +1215,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
|
|
if (bp_k.k->type != KEY_TYPE_backpointer)
|
|
continue;
|
|
|
|
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
|
|
+ if (bp.v->btree_id == BTREE_ID_stripes)
|
|
+ continue;
|
|
+
|
|
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
|
|
- bkey_s_c_to_backpointer(bp_k), &last_flushed);
|
|
+ bp, &last_flushed);
|
|
}));
|
|
|
|
bch2_bkey_buf_exit(&last_flushed, c);
|
|
@@ -1393,21 +1232,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
|
|
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
|
|
- int ret = 0;
|
|
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
|
|
|
|
- ret = bch2_btree_write_buffer_flush_sync(trans);
|
|
+ int ret = bch2_btree_write_buffer_flush_sync(trans);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- for (i = 0; i < nr_data; i++) {
|
|
+ for (unsigned i = 0; i < nr_data; i++) {
|
|
ret = ec_stripe_update_bucket(trans, s, i);
|
|
if (ret)
|
|
break;
|
|
}
|
|
err:
|
|
bch2_trans_put(trans);
|
|
-
|
|
return ret;
|
|
}
|
|
|
|
@@ -1416,7 +1253,8 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
|
|
unsigned block,
|
|
struct open_bucket *ob)
|
|
{
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
|
|
+ BCH_DEV_WRITE_REF_ec_bucket_zero);
|
|
if (!ca) {
|
|
s->err = -BCH_ERR_erofs_no_writes;
|
|
return;
|
|
@@ -1432,7 +1270,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
|
|
ob->sectors_free,
|
|
GFP_KERNEL, 0);
|
|
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
|
|
|
|
if (ret)
|
|
s->err = ret;
|
|
@@ -1473,6 +1311,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
|
if (s->err) {
|
|
if (!bch2_err_matches(s->err, EROFS))
|
|
bch_err(c, "error creating stripe: error writing data buckets");
|
|
+ ret = s->err;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1481,6 +1320,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
|
|
|
if (ec_do_recov(c, &s->existing_stripe)) {
|
|
bch_err(c, "error creating stripe: error reading existing stripe");
|
|
+ ret = -BCH_ERR_ec_block_read;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1506,6 +1346,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
|
|
|
if (ec_nr_failed(&s->new_stripe)) {
|
|
bch_err(c, "error creating stripe: error writing redundancy buckets");
|
|
+ ret = -BCH_ERR_ec_block_write;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1527,6 +1368,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
|
if (ret)
|
|
goto err;
|
|
err:
|
|
+ trace_stripe_create(c, s->idx, ret);
|
|
+
|
|
bch2_disk_reservation_put(c, &s->res);
|
|
|
|
for (i = 0; i < v->nr_blocks; i++)
|
|
@@ -1577,15 +1420,15 @@ static void ec_stripe_create_work(struct work_struct *work)
|
|
while ((s = get_pending_stripe(c)))
|
|
ec_stripe_create(s);
|
|
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
|
|
}
|
|
|
|
void bch2_ec_do_stripe_creates(struct bch_fs *c)
|
|
{
|
|
- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
|
|
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
|
|
|
|
if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
|
|
}
|
|
|
|
static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
|
|
@@ -1612,11 +1455,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int
|
|
ec_stripe_new_set_pending(c, h);
|
|
}
|
|
|
|
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
|
|
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
|
|
{
|
|
struct ec_stripe_new *s = ob->ec;
|
|
|
|
- s->err = -EIO;
|
|
+ s->err = err;
|
|
}
|
|
|
|
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
|
|
@@ -1875,23 +1718,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
}
|
|
|
|
static int new_stripe_alloc_buckets(struct btree_trans *trans,
|
|
+ struct alloc_request *req,
|
|
struct ec_stripe_head *h, struct ec_stripe_new *s,
|
|
- enum bch_watermark watermark, struct closure *cl)
|
|
+ struct closure *cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct bch_devs_mask devs = h->devs;
|
|
struct open_bucket *ob;
|
|
- struct open_buckets buckets;
|
|
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
|
|
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
|
|
- bool have_cache = true;
|
|
int ret = 0;
|
|
|
|
+ req->scratch_data_type = req->data_type;
|
|
+ req->scratch_ptrs = req->ptrs;
|
|
+ req->scratch_nr_replicas = req->nr_replicas;
|
|
+ req->scratch_nr_effective = req->nr_effective;
|
|
+ req->scratch_have_cache = req->have_cache;
|
|
+ req->scratch_devs_may_alloc = req->devs_may_alloc;
|
|
+
|
|
+ req->devs_may_alloc = h->devs;
|
|
+ req->have_cache = true;
|
|
+
|
|
BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
|
|
BUG_ON(v->nr_redundant != s->nr_parity);
|
|
|
|
/* * We bypass the sector allocator which normally does this: */
|
|
- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
|
|
+ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
|
|
+ c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
|
|
|
|
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
|
|
/*
|
|
@@ -1901,7 +1753,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
|
|
* block when updating the stripe
|
|
*/
|
|
if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
|
|
- __clear_bit(v->ptrs[i].dev, devs.d);
|
|
+ __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
|
|
|
|
if (i < s->nr_data)
|
|
nr_have_data++;
|
|
@@ -1912,95 +1764,94 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
|
|
BUG_ON(nr_have_data > s->nr_data);
|
|
BUG_ON(nr_have_parity > s->nr_parity);
|
|
|
|
- buckets.nr = 0;
|
|
+ req->ptrs.nr = 0;
|
|
if (nr_have_parity < s->nr_parity) {
|
|
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
|
|
- &h->parity_stripe,
|
|
- &devs,
|
|
- s->nr_parity,
|
|
- &nr_have_parity,
|
|
- &have_cache, 0,
|
|
- BCH_DATA_parity,
|
|
- watermark,
|
|
- cl);
|
|
-
|
|
- open_bucket_for_each(c, &buckets, ob, i) {
|
|
+ req->nr_replicas = s->nr_parity;
|
|
+ req->nr_effective = nr_have_parity;
|
|
+ req->data_type = BCH_DATA_parity;
|
|
+
|
|
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
|
|
+
|
|
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
|
|
j = find_next_zero_bit(s->blocks_gotten,
|
|
s->nr_data + s->nr_parity,
|
|
s->nr_data);
|
|
BUG_ON(j >= s->nr_data + s->nr_parity);
|
|
|
|
- s->blocks[j] = buckets.v[i];
|
|
+ s->blocks[j] = req->ptrs.v[i];
|
|
v->ptrs[j] = bch2_ob_ptr(c, ob);
|
|
__set_bit(j, s->blocks_gotten);
|
|
}
|
|
|
|
if (ret)
|
|
- return ret;
|
|
+ goto err;
|
|
}
|
|
|
|
- buckets.nr = 0;
|
|
+ req->ptrs.nr = 0;
|
|
if (nr_have_data < s->nr_data) {
|
|
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
|
|
- &h->block_stripe,
|
|
- &devs,
|
|
- s->nr_data,
|
|
- &nr_have_data,
|
|
- &have_cache, 0,
|
|
- BCH_DATA_user,
|
|
- watermark,
|
|
- cl);
|
|
-
|
|
- open_bucket_for_each(c, &buckets, ob, i) {
|
|
+ req->nr_replicas = s->nr_data;
|
|
+ req->nr_effective = nr_have_data;
|
|
+ req->data_type = BCH_DATA_user;
|
|
+
|
|
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
|
|
+
|
|
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
|
|
j = find_next_zero_bit(s->blocks_gotten,
|
|
s->nr_data, 0);
|
|
BUG_ON(j >= s->nr_data);
|
|
|
|
- s->blocks[j] = buckets.v[i];
|
|
+ s->blocks[j] = req->ptrs.v[i];
|
|
v->ptrs[j] = bch2_ob_ptr(c, ob);
|
|
__set_bit(j, s->blocks_gotten);
|
|
}
|
|
|
|
if (ret)
|
|
- return ret;
|
|
+ goto err;
|
|
}
|
|
-
|
|
- return 0;
|
|
+err:
|
|
+ req->data_type = req->scratch_data_type;
|
|
+ req->ptrs = req->scratch_ptrs;
|
|
+ req->nr_replicas = req->scratch_nr_replicas;
|
|
+ req->nr_effective = req->scratch_nr_effective;
|
|
+ req->have_cache = req->scratch_have_cache;
|
|
+ req->devs_may_alloc = req->scratch_devs_may_alloc;
|
|
+ return ret;
|
|
}
|
|
|
|
-static s64 get_existing_stripe(struct bch_fs *c,
|
|
- struct ec_stripe_head *head)
|
|
+static int __get_existing_stripe(struct btree_trans *trans,
|
|
+ struct ec_stripe_head *head,
|
|
+ struct ec_stripe_buf *stripe,
|
|
+ u64 idx)
|
|
{
|
|
- ec_stripes_heap *h = &c->ec_stripes_heap;
|
|
- struct stripe *m;
|
|
- size_t heap_idx;
|
|
- u64 stripe_idx;
|
|
- s64 ret = -1;
|
|
-
|
|
- if (may_create_new_stripe(c))
|
|
- return -1;
|
|
+ struct bch_fs *c = trans->c;
|
|
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
|
|
- /* No blocks worth reusing, stripe will just be deleted: */
|
|
- if (!h->data[heap_idx].blocks_nonempty)
|
|
- continue;
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
|
|
+ BTREE_ID_stripes, POS(0, idx), 0);
|
|
+ int ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- stripe_idx = h->data[heap_idx].idx;
|
|
+ /* We expect write buffer races here */
|
|
+ if (k.k->type != KEY_TYPE_stripe)
|
|
+ goto out;
|
|
|
|
- m = genradix_ptr(&c->stripes, stripe_idx);
|
|
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
|
|
+ if (stripe_lru_pos(s.v) <= 1)
|
|
+ goto out;
|
|
|
|
- if (m->disk_label == head->disk_label &&
|
|
- m->algorithm == head->algo &&
|
|
- m->nr_redundant == head->redundancy &&
|
|
- m->sectors == head->blocksize &&
|
|
- m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
|
|
- bch2_try_open_stripe(c, head->s, stripe_idx)) {
|
|
- ret = stripe_idx;
|
|
- break;
|
|
- }
|
|
+ if (s.v->disk_label == head->disk_label &&
|
|
+ s.v->algorithm == head->algo &&
|
|
+ s.v->nr_redundant == head->redundancy &&
|
|
+ le16_to_cpu(s.v->sectors) == head->blocksize &&
|
|
+ bch2_try_open_stripe(c, head->s, idx)) {
|
|
+ bkey_reassemble(&stripe->key, k);
|
|
+ ret = 1;
|
|
}
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
+out:
|
|
+ bch2_set_btree_iter_dontneed(trans, &iter);
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
@@ -2052,24 +1903,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
|
|
struct ec_stripe_new *s)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- s64 idx;
|
|
- int ret;
|
|
|
|
/*
|
|
* If we can't allocate a new stripe, and there's no stripes with empty
|
|
* blocks for us to reuse, that means we have to wait on copygc:
|
|
*/
|
|
- idx = get_existing_stripe(c, h);
|
|
- if (idx < 0)
|
|
- return -BCH_ERR_stripe_alloc_blocked;
|
|
+ if (may_create_new_stripe(c))
|
|
+ return -1;
|
|
|
|
- ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
|
|
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
|
|
- "reading stripe key: %s", bch2_err_str(ret));
|
|
- if (ret) {
|
|
- bch2_stripe_close(c, s);
|
|
- return ret;
|
|
+ struct btree_iter lru_iter;
|
|
+ struct bkey_s_c lru_k;
|
|
+ int ret = 0;
|
|
+
|
|
+ for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
|
|
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
|
|
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
|
|
+ 0, lru_k, ret) {
|
|
+ ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
|
|
+ if (ret)
|
|
+ break;
|
|
}
|
|
+ bch2_trans_iter_exit(trans, &lru_iter);
|
|
+ if (!ret)
|
|
+ ret = -BCH_ERR_stripe_alloc_blocked;
|
|
+ if (ret == 1)
|
|
+ ret = 0;
|
|
+ if (ret)
|
|
+ return ret;
|
|
|
|
return init_new_stripe_from_existing(c, s);
|
|
}
|
|
@@ -2102,7 +1962,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
|
|
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
|
|
if (start_pos.offset) {
|
|
start_pos = min_pos;
|
|
- bch2_btree_iter_set_pos(&iter, start_pos);
|
|
+ bch2_btree_iter_set_pos(trans, &iter, start_pos);
|
|
continue;
|
|
}
|
|
|
|
@@ -2136,17 +1996,15 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
|
|
}
|
|
|
|
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
- unsigned target,
|
|
+ struct alloc_request *req,
|
|
unsigned algo,
|
|
- unsigned redundancy,
|
|
- enum bch_watermark watermark,
|
|
struct closure *cl)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct ec_stripe_head *h;
|
|
- bool waiting = false;
|
|
+ unsigned redundancy = req->nr_replicas - 1;
|
|
unsigned disk_label = 0;
|
|
- struct target t = target_decode(target);
|
|
+ struct target t = target_decode(req->target);
|
|
+ bool waiting = false;
|
|
int ret;
|
|
|
|
if (t.type == TARGET_GROUP) {
|
|
@@ -2157,7 +2015,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
disk_label = t.group + 1; /* 0 == no label */
|
|
}
|
|
|
|
- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
|
|
+ struct ec_stripe_head *h =
|
|
+ __bch2_ec_stripe_head_get(trans, disk_label, algo,
|
|
+ redundancy, req->watermark);
|
|
if (IS_ERR_OR_NULL(h))
|
|
return h;
|
|
|
|
@@ -2181,8 +2041,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
goto alloc_existing;
|
|
|
|
/* First, try to allocate a full stripe: */
|
|
- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
|
|
+ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
|
|
+ swap(req->watermark, saved_watermark);
|
|
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
|
|
__bch2_ec_stripe_head_reserve(trans, h, s);
|
|
+ swap(req->watermark, saved_watermark);
|
|
+
|
|
if (!ret)
|
|
goto allocate_buf;
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
|
|
@@ -2200,8 +2064,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
|
|
goto err;
|
|
|
|
- if (watermark == BCH_WATERMARK_copygc) {
|
|
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
|
|
+ if (req->watermark == BCH_WATERMARK_copygc) {
|
|
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
|
|
__bch2_ec_stripe_head_reserve(trans, h, s);
|
|
if (ret)
|
|
goto err;
|
|
@@ -2220,7 +2084,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
* Retry allocating buckets, with the watermark for this
|
|
* particular write:
|
|
*/
|
|
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
|
|
+ ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -2242,67 +2106,106 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
|
|
|
/* device removal */
|
|
|
|
-static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
|
|
+int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_s_c k,
|
|
+ unsigned dev_idx,
|
|
+ unsigned flags)
|
|
{
|
|
- struct bch_alloc_v4 a_convert;
|
|
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
|
|
-
|
|
- if (!a->stripe)
|
|
+ if (k.k->type != KEY_TYPE_stripe)
|
|
return 0;
|
|
|
|
- if (a->stripe_sectors) {
|
|
- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
|
|
- return -BCH_ERR_invalidate_stripe_to_dev;
|
|
- }
|
|
-
|
|
- struct btree_iter iter;
|
|
struct bkey_i_stripe *s =
|
|
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
|
|
- BTREE_ITER_slots, stripe);
|
|
+ bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
|
|
int ret = PTR_ERR_OR_ZERO(s);
|
|
if (ret)
|
|
return ret;
|
|
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_replicas,
|
|
- };
|
|
+ struct disk_accounting_pos acc;
|
|
|
|
s64 sectors = 0;
|
|
for (unsigned i = 0; i < s->v.nr_blocks; i++)
|
|
sectors -= stripe_blockcount_get(&s->v, i);
|
|
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
|
|
acc.replicas.data_type = BCH_DATA_user;
|
|
ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false);
|
|
if (ret)
|
|
- goto err;
|
|
+ return ret;
|
|
|
|
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
|
|
- bkey_for_each_ptr(ptrs, ptr)
|
|
- if (ptr->dev == k_a.k->p.inode)
|
|
+
|
|
+ /* XXX: how much redundancy do we still have? check degraded flags */
|
|
+
|
|
+ unsigned nr_good = 0;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ bkey_for_each_ptr(ptrs, ptr) {
|
|
+ if (ptr->dev == dev_idx)
|
|
ptr->dev = BCH_SB_MEMBER_INVALID;
|
|
|
|
+ struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev);
|
|
+ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
|
|
+ return -BCH_ERR_remove_would_lose_data;
|
|
+
|
|
+ unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
|
|
+
|
|
+ if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
|
|
+ return -BCH_ERR_remove_would_lose_data;
|
|
+
|
|
sectors = -sectors;
|
|
|
|
+ memset(&acc, 0, sizeof(acc));
|
|
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
|
|
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
|
|
acc.replicas.data_type = BCH_DATA_user;
|
|
- ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false);
|
|
+ return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false);
|
|
+}
|
|
+
|
|
+static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct bch_alloc_v4 a_convert;
|
|
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
|
|
+
|
|
+ if (!a->stripe)
|
|
+ return 0;
|
|
+
|
|
+ if (a->stripe_sectors) {
|
|
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
|
|
+ return -BCH_ERR_invalidate_stripe_to_dev;
|
|
+ }
|
|
+
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c_stripe s =
|
|
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
|
|
+ BTREE_ITER_slots, stripe);
|
|
+ int ret = bkey_err(s);
|
|
if (ret)
|
|
- goto err;
|
|
-err:
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
|
|
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
|
|
{
|
|
- return bch2_trans_run(c,
|
|
+ int ret = bch2_trans_run(c,
|
|
for_each_btree_key_max_commit(trans, iter,
|
|
BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
|
|
BTREE_ITER_intent, k,
|
|
NULL, NULL, 0, ({
|
|
- bch2_invalidate_stripe_to_dev(trans, k);
|
|
+ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
|
|
})));
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
}
|
|
|
|
/* startup/shutdown */
|
|
@@ -2351,10 +2254,10 @@ void bch2_fs_ec_stop(struct bch_fs *c)
|
|
|
|
static bool bch2_fs_ec_flush_done(struct bch_fs *c)
|
|
{
|
|
- bool ret;
|
|
+ sched_annotate_sleep();
|
|
|
|
mutex_lock(&c->ec_stripe_new_lock);
|
|
- ret = list_empty(&c->ec_stripe_new_list);
|
|
+ bool ret = list_empty(&c->ec_stripe_new_list);
|
|
mutex_unlock(&c->ec_stripe_new_lock);
|
|
|
|
return ret;
|
|
@@ -2367,46 +2270,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
|
|
|
|
int bch2_stripes_read(struct bch_fs *c)
|
|
{
|
|
- int ret = bch2_trans_run(c,
|
|
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
|
|
- BTREE_ITER_prefetch, k, ({
|
|
- if (k.k->type != KEY_TYPE_stripe)
|
|
- continue;
|
|
-
|
|
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
|
|
- if (ret)
|
|
- break;
|
|
-
|
|
- struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
|
|
-
|
|
- stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
|
|
-
|
|
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
|
|
- 0;
|
|
- })));
|
|
- bch_err_fn(c, ret);
|
|
- return ret;
|
|
-}
|
|
-
|
|
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
|
|
-{
|
|
- ec_stripes_heap *h = &c->ec_stripes_heap;
|
|
- struct stripe *m;
|
|
- size_t i;
|
|
-
|
|
- mutex_lock(&c->ec_stripes_heap_lock);
|
|
- for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
|
|
- m = genradix_ptr(&c->stripes, h->data[i].idx);
|
|
-
|
|
- prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
|
|
- h->data[i].blocks_nonempty,
|
|
- m->nr_blocks - m->nr_redundant,
|
|
- m->nr_redundant);
|
|
- if (bch2_stripe_is_open(c, h->data[i].idx))
|
|
- prt_str(out, " open");
|
|
- prt_newline(out);
|
|
- }
|
|
- mutex_unlock(&c->ec_stripes_heap_lock);
|
|
+ return 0;
|
|
}
|
|
|
|
static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
|
|
@@ -2477,15 +2341,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
|
|
|
|
BUG_ON(!list_empty(&c->ec_stripe_new_list));
|
|
|
|
- free_heap(&c->ec_stripes_heap);
|
|
- genradix_free(&c->stripes);
|
|
bioset_exit(&c->ec_bioset);
|
|
}
|
|
|
|
void bch2_fs_ec_init_early(struct bch_fs *c)
|
|
{
|
|
spin_lock_init(&c->ec_stripes_new_lock);
|
|
- mutex_init(&c->ec_stripes_heap_lock);
|
|
|
|
INIT_LIST_HEAD(&c->ec_stripe_head_list);
|
|
mutex_init(&c->ec_stripe_head_lock);
|
|
@@ -2503,3 +2364,40 @@ int bch2_fs_ec_init(struct bch_fs *c)
|
|
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
|
|
BIOSET_NEED_BVECS);
|
|
}
|
|
+
|
|
+static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
|
|
+ struct bkey_s_c k,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ if (k.k->type != KEY_TYPE_stripe)
|
|
+ return 0;
|
|
+
|
|
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
|
|
+
|
|
+ u64 lru_idx = stripe_lru_pos(s.v);
|
|
+ if (lru_idx) {
|
|
+ int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
|
|
+ k.k->p.offset, lru_idx, k, last_flushed);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
|
|
+{
|
|
+ struct bkey_buf last_flushed;
|
|
+
|
|
+ bch2_bkey_buf_init(&last_flushed);
|
|
+ bkey_init(&last_flushed.k->k);
|
|
+
|
|
+ int ret = bch2_trans_run(c,
|
|
+ for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
|
|
+ POS_MIN, BTREE_ITER_prefetch, k,
|
|
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
+ bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
|
|
+
|
|
+ bch2_bkey_buf_exit(&last_flushed, c);
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
|
|
index 583ca6a226da..548048adf0d5 100644
|
|
--- a/fs/bcachefs/ec.h
|
|
+++ b/fs/bcachefs/ec.h
|
|
@@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s,
|
|
memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
|
|
}
|
|
|
|
+#define STRIPE_LRU_POS_EMPTY 1
|
|
+
|
|
+static inline u64 stripe_lru_pos(const struct bch_stripe *s)
|
|
+{
|
|
+ if (!s)
|
|
+ return 0;
|
|
+
|
|
+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
|
|
+
|
|
+ for (unsigned i = 0; i < nr_data; i++)
|
|
+ blocks_empty += !stripe_blockcount_get(s, i);
|
|
+
|
|
+ /* Will be picked up by the stripe_delete worker */
|
|
+ if (blocks_empty == nr_data)
|
|
+ return STRIPE_LRU_POS_EMPTY;
|
|
+
|
|
+ if (!blocks_empty)
|
|
+ return 0;
|
|
+
|
|
+ /* invert: more blocks empty = reuse first */
|
|
+ return LRU_TIME_MAX - blocks_empty;
|
|
+}
|
|
+
|
|
static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
|
|
const struct bch_extent_ptr *data_ptr,
|
|
unsigned sectors)
|
|
@@ -132,6 +155,21 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
|
|
m->sectors);
|
|
}
|
|
|
|
+static inline void gc_stripe_unlock(struct gc_stripe *s)
|
|
+{
|
|
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
|
|
+
|
|
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
|
|
+ smp_mb__after_atomic();
|
|
+ wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
|
|
+}
|
|
+
|
|
+static inline void gc_stripe_lock(struct gc_stripe *s)
|
|
+{
|
|
+ wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+}
|
|
+
|
|
struct bch_read_bio;
|
|
|
|
struct ec_stripe_buf {
|
|
@@ -212,18 +250,15 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey
|
|
|
|
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
|
|
|
|
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
|
|
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
|
|
|
|
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
|
|
|
|
void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
|
|
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
|
|
- unsigned, unsigned, unsigned,
|
|
- enum bch_watermark, struct closure *);
|
|
|
|
-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
|
|
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
|
|
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
|
|
+struct alloc_request;
|
|
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
|
|
+ struct alloc_request *, unsigned, struct closure *);
|
|
|
|
void bch2_do_stripe_deletes(struct bch_fs *);
|
|
void bch2_ec_do_stripe_creates(struct bch_fs *);
|
|
@@ -253,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
|
|
}
|
|
}
|
|
|
|
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
|
|
+int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_s_c, unsigned, unsigned);
|
|
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
|
|
|
|
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
|
|
void bch2_fs_ec_stop(struct bch_fs *);
|
|
@@ -261,11 +298,12 @@ void bch2_fs_ec_flush(struct bch_fs *);
|
|
|
|
int bch2_stripes_read(struct bch_fs *);
|
|
|
|
-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
|
|
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
|
|
|
|
void bch2_fs_ec_exit(struct bch_fs *);
|
|
void bch2_fs_ec_init_early(struct bch_fs *);
|
|
int bch2_fs_ec_init(struct bch_fs *);
|
|
|
|
+int bch2_check_stripe_to_lru_refs(struct bch_fs *);
|
|
+
|
|
#endif /* _BCACHEFS_EC_H */
|
|
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
|
|
index 8d1e70e830ac..809446c78951 100644
|
|
--- a/fs/bcachefs/ec_types.h
|
|
+++ b/fs/bcachefs/ec_types.h
|
|
@@ -4,9 +4,10 @@
|
|
|
|
#include "bcachefs_format.h"
|
|
|
|
-struct bch_replicas_padded {
|
|
+union bch_replicas_padded {
|
|
+ u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
|
|
+ devs, BCH_BKEY_PTRS_MAX)];
|
|
struct bch_replicas_entry_v1 e;
|
|
- u8 pad[BCH_BKEY_PTRS_MAX];
|
|
};
|
|
|
|
struct stripe {
|
|
@@ -20,23 +21,15 @@ struct stripe {
|
|
};
|
|
|
|
struct gc_stripe {
|
|
+ u8 lock;
|
|
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
|
|
u16 sectors;
|
|
-
|
|
u8 nr_blocks;
|
|
u8 nr_redundant;
|
|
-
|
|
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
|
|
u16 block_sectors[BCH_BKEY_PTRS_MAX];
|
|
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
|
|
|
|
- struct bch_replicas_padded r;
|
|
+ union bch_replicas_padded r;
|
|
};
|
|
|
|
-struct ec_stripe_heap_entry {
|
|
- size_t idx;
|
|
- unsigned blocks_nonempty;
|
|
-};
|
|
-
|
|
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
|
|
-
|
|
#endif /* _BCACHEFS_EC_TYPES_H */
|
|
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
|
|
new file mode 100644
|
|
index 000000000000..56ab430f209f
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/enumerated_ref.c
|
|
@@ -0,0 +1,144 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "enumerated_ref.h"
|
|
+#include "util.h"
|
|
+
|
|
+#include <linux/completion.h>
|
|
+
|
|
+#ifdef ENUMERATED_REF_DEBUG
|
|
+void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ BUG_ON(idx >= ref->nr);
|
|
+ atomic_long_inc(&ref->refs[idx]);
|
|
+}
|
|
+
|
|
+bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ BUG_ON(idx >= ref->nr);
|
|
+ return atomic_long_inc_not_zero(&ref->refs[idx]);
|
|
+}
|
|
+
|
|
+bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ BUG_ON(idx >= ref->nr);
|
|
+ return !ref->dying &&
|
|
+ atomic_long_inc_not_zero(&ref->refs[idx]);
|
|
+}
|
|
+
|
|
+void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ BUG_ON(idx >= ref->nr);
|
|
+ long v = atomic_long_dec_return(&ref->refs[idx]);
|
|
+
|
|
+ BUG_ON(v < 0);
|
|
+ if (v)
|
|
+ return;
|
|
+
|
|
+ for (unsigned i = 0; i < ref->nr; i++)
|
|
+ if (atomic_long_read(&ref->refs[i]))
|
|
+ return;
|
|
+
|
|
+ if (ref->stop_fn)
|
|
+ ref->stop_fn(ref);
|
|
+ complete(&ref->stop_complete);
|
|
+}
|
|
+#endif
|
|
+
|
|
+#ifndef ENUMERATED_REF_DEBUG
|
|
+static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
|
|
+{
|
|
+ struct enumerated_ref *ref =
|
|
+ container_of(percpu_ref, struct enumerated_ref, ref);
|
|
+
|
|
+ if (ref->stop_fn)
|
|
+ ref->stop_fn(ref);
|
|
+ complete(&ref->stop_complete);
|
|
+}
|
|
+#endif
|
|
+
|
|
+void enumerated_ref_stop_async(struct enumerated_ref *ref)
|
|
+{
|
|
+ reinit_completion(&ref->stop_complete);
|
|
+
|
|
+#ifndef ENUMERATED_REF_DEBUG
|
|
+ percpu_ref_kill(&ref->ref);
|
|
+#else
|
|
+ ref->dying = true;
|
|
+ for (unsigned i = 0; i < ref->nr; i++)
|
|
+ enumerated_ref_put(ref, i);
|
|
+#endif
|
|
+}
|
|
+
|
|
+void enumerated_ref_stop(struct enumerated_ref *ref,
|
|
+ const char * const names[])
|
|
+{
|
|
+ enumerated_ref_stop_async(ref);
|
|
+ while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
|
|
+ prt_str(&buf, "Outstanding refs:\n");
|
|
+ enumerated_ref_to_text(&buf, ref, names);
|
|
+ printk(KERN_ERR "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+}
|
|
+
|
|
+void enumerated_ref_start(struct enumerated_ref *ref)
|
|
+{
|
|
+#ifndef ENUMERATED_REF_DEBUG
|
|
+ percpu_ref_reinit(&ref->ref);
|
|
+#else
|
|
+ ref->dying = false;
|
|
+ for (unsigned i = 0; i < ref->nr; i++) {
|
|
+ BUG_ON(atomic_long_read(&ref->refs[i]));
|
|
+ atomic_long_inc(&ref->refs[i]);
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+void enumerated_ref_exit(struct enumerated_ref *ref)
|
|
+{
|
|
+#ifndef ENUMERATED_REF_DEBUG
|
|
+ percpu_ref_exit(&ref->ref);
|
|
+#else
|
|
+ kfree(ref->refs);
|
|
+ ref->refs = NULL;
|
|
+ ref->nr = 0;
|
|
+#endif
|
|
+}
|
|
+
|
|
+int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
|
|
+ void (*stop_fn)(struct enumerated_ref *))
|
|
+{
|
|
+ init_completion(&ref->stop_complete);
|
|
+ ref->stop_fn = stop_fn;
|
|
+
|
|
+#ifndef ENUMERATED_REF_DEBUG
|
|
+ return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
|
|
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL);
|
|
+#else
|
|
+ ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
|
|
+ if (!ref->refs)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ref->nr = nr;
|
|
+ return 0;
|
|
+#endif
|
|
+}
|
|
+
|
|
+void enumerated_ref_to_text(struct printbuf *out,
|
|
+ struct enumerated_ref *ref,
|
|
+ const char * const names[])
|
|
+{
|
|
+#ifdef ENUMERATED_REF_DEBUG
|
|
+ bch2_printbuf_tabstop_push(out, 32);
|
|
+
|
|
+ for (unsigned i = 0; i < ref->nr; i++)
|
|
+ prt_printf(out, "%s\t%li\n", names[i],
|
|
+ atomic_long_read(&ref->refs[i]));
|
|
+#else
|
|
+ prt_str(out, "(not in debug mode)\n");
|
|
+#endif
|
|
+}
|
|
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
|
|
new file mode 100644
|
|
index 000000000000..ec01cf59ef80
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/enumerated_ref.h
|
|
@@ -0,0 +1,66 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ENUMERATED_REF_H
|
|
+#define _BCACHEFS_ENUMERATED_REF_H
|
|
+
|
|
+#include "enumerated_ref_types.h"
|
|
+
|
|
+/*
|
|
+ * A refcount where the users are enumerated: in debug mode, we create sepate
|
|
+ * refcounts for each user, to make leaks and refcount errors easy to track
|
|
+ * down:
|
|
+ */
|
|
+
|
|
+#ifdef ENUMERATED_REF_DEBUG
|
|
+void enumerated_ref_get(struct enumerated_ref *, unsigned);
|
|
+bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
|
|
+bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
|
|
+void enumerated_ref_put(struct enumerated_ref *, unsigned);
|
|
+#else
|
|
+
|
|
+static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ percpu_ref_get(&ref->ref);
|
|
+}
|
|
+
|
|
+static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ return percpu_ref_tryget(&ref->ref);
|
|
+}
|
|
+
|
|
+static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ return percpu_ref_tryget_live(&ref->ref);
|
|
+}
|
|
+
|
|
+static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
|
|
+{
|
|
+ percpu_ref_put(&ref->ref);
|
|
+}
|
|
+#endif
|
|
+
|
|
+static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
|
|
+{
|
|
+#ifndef ENUMERATED_REF_DEBUG
|
|
+ return percpu_ref_is_zero(&ref->ref);
|
|
+#else
|
|
+ for (unsigned i = 0; i < ref->nr; i++)
|
|
+ if (atomic_long_read(&ref->refs[i]))
|
|
+ return false;
|
|
+ return true;
|
|
+#endif
|
|
+}
|
|
+
|
|
+void enumerated_ref_stop_async(struct enumerated_ref *);
|
|
+void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
|
|
+void enumerated_ref_start(struct enumerated_ref *);
|
|
+
|
|
+void enumerated_ref_exit(struct enumerated_ref *);
|
|
+int enumerated_ref_init(struct enumerated_ref *, unsigned,
|
|
+ void (*stop_fn)(struct enumerated_ref *));
|
|
+
|
|
+struct printbuf;
|
|
+void enumerated_ref_to_text(struct printbuf *,
|
|
+ struct enumerated_ref *,
|
|
+ const char * const[]);
|
|
+
|
|
+#endif /* _BCACHEFS_ENUMERATED_REF_H */
|
|
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
|
|
new file mode 100644
|
|
index 000000000000..0e6076f466d3
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/enumerated_ref_types.h
|
|
@@ -0,0 +1,19 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
|
|
+#define _BCACHEFS_ENUMERATED_REF_TYPES_H
|
|
+
|
|
+#include <linux/percpu-refcount.h>
|
|
+
|
|
+struct enumerated_ref {
|
|
+#ifdef ENUMERATED_REF_DEBUG
|
|
+ unsigned nr;
|
|
+ bool dying;
|
|
+ atomic_long_t *refs;
|
|
+#else
|
|
+ struct percpu_ref ref;
|
|
+#endif
|
|
+ void (*stop_fn)(struct enumerated_ref *);
|
|
+ struct completion stop_complete;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
|
|
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
|
|
index 4590cd0c7c90..62843e772b2c 100644
|
|
--- a/fs/bcachefs/errcode.h
|
|
+++ b/fs/bcachefs/errcode.h
|
|
@@ -5,6 +5,8 @@
|
|
#define BCH_ERRCODES() \
|
|
x(ERANGE, ERANGE_option_too_small) \
|
|
x(ERANGE, ERANGE_option_too_big) \
|
|
+ x(EINVAL, injected) \
|
|
+ x(BCH_ERR_injected, injected_fs_start) \
|
|
x(EINVAL, mount_option) \
|
|
x(BCH_ERR_mount_option, option_name) \
|
|
x(BCH_ERR_mount_option, option_value) \
|
|
@@ -51,6 +53,7 @@
|
|
x(ENOMEM, ENOMEM_dio_write_bioset_init) \
|
|
x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
|
|
x(ENOMEM, ENOMEM_promote_table_init) \
|
|
+ x(ENOMEM, ENOMEM_async_obj_init) \
|
|
x(ENOMEM, ENOMEM_compression_bounce_read_init) \
|
|
x(ENOMEM, ENOMEM_compression_bounce_write_init) \
|
|
x(ENOMEM, ENOMEM_compression_workspace_init) \
|
|
@@ -116,9 +119,11 @@
|
|
x(ENOENT, ENOENT_snapshot_tree) \
|
|
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
|
|
x(ENOENT, ENOENT_dev_not_found) \
|
|
+ x(ENOENT, ENOENT_dev_bucket_not_found) \
|
|
x(ENOENT, ENOENT_dev_idx_not_found) \
|
|
x(ENOENT, ENOENT_inode_no_backpointer) \
|
|
x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
|
|
+ x(ENOENT, btree_node_dying) \
|
|
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
|
|
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
|
|
x(EEXIST, EEXIST_str_hash_set) \
|
|
@@ -170,6 +175,7 @@
|
|
x(0, backpointer_to_overwritten_btree_node) \
|
|
x(0, journal_reclaim_would_deadlock) \
|
|
x(EINVAL, fsck) \
|
|
+ x(BCH_ERR_fsck, fsck_ask) \
|
|
x(BCH_ERR_fsck, fsck_fix) \
|
|
x(BCH_ERR_fsck, fsck_delete_bkey) \
|
|
x(BCH_ERR_fsck, fsck_ignore) \
|
|
@@ -177,9 +183,14 @@
|
|
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
|
|
x(BCH_ERR_fsck, fsck_repair_impossible) \
|
|
x(EINVAL, restart_recovery) \
|
|
- x(EINVAL, not_in_recovery) \
|
|
x(EINVAL, cannot_rewind_recovery) \
|
|
x(0, data_update_done) \
|
|
+ x(BCH_ERR_data_update_done, data_update_done_would_block) \
|
|
+ x(BCH_ERR_data_update_done, data_update_done_unwritten) \
|
|
+ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
|
|
+ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
|
|
+ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
|
|
+ x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
|
|
x(EINVAL, device_state_not_allowed) \
|
|
x(EINVAL, member_info_missing) \
|
|
x(EINVAL, mismatched_block_size) \
|
|
@@ -191,6 +202,7 @@
|
|
x(EINVAL, device_has_been_removed) \
|
|
x(EINVAL, device_splitbrain) \
|
|
x(EINVAL, device_already_online) \
|
|
+ x(EINVAL, filesystem_uuid_already_open) \
|
|
x(EINVAL, insufficient_devices_to_start) \
|
|
x(EINVAL, invalid) \
|
|
x(EINVAL, internal_fsck_err) \
|
|
@@ -200,6 +212,9 @@
|
|
x(EINVAL, no_resize_with_buckets_nouse) \
|
|
x(EINVAL, inode_unpack_error) \
|
|
x(EINVAL, varint_decode_error) \
|
|
+ x(EINVAL, erasure_coding_found_btree_node) \
|
|
+ x(EINVAL, option_negative) \
|
|
+ x(EOPNOTSUPP, may_not_use_incompat_feature) \
|
|
x(EROFS, erofs_trans_commit) \
|
|
x(EROFS, erofs_no_writes) \
|
|
x(EROFS, erofs_journal_err) \
|
|
@@ -207,13 +222,23 @@
|
|
x(EROFS, erofs_unfixed_errors) \
|
|
x(EROFS, erofs_norecovery) \
|
|
x(EROFS, erofs_nochanges) \
|
|
+ x(EROFS, erofs_no_alloc_info) \
|
|
+ x(EROFS, erofs_filesystem_full) \
|
|
x(EROFS, insufficient_devices) \
|
|
x(0, operation_blocked) \
|
|
x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
|
|
- x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
|
|
- x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
|
|
- x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
|
|
- x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
|
|
+ x(BCH_ERR_operation_blocked, journal_res_blocked) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_blocked) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_max_open) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_full) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_pin_full) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_stuck) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_retry_open) \
|
|
+ x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \
|
|
+ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
|
|
+ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
|
|
x(BCH_ERR_invalid, invalid_sb) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_magic) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_version) \
|
|
@@ -223,6 +248,7 @@
|
|
x(BCH_ERR_invalid_sb, invalid_sb_csum) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
|
|
+ x(BCH_ERR_invalid_sb, invalid_sb_offset) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
|
|
x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
|
|
@@ -248,8 +274,9 @@
|
|
x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
|
|
x(BCH_ERR_invalid, invalid_bkey) \
|
|
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
|
|
- x(EIO, journal_shutdown) \
|
|
+ x(EROFS, journal_shutdown) \
|
|
x(EIO, journal_flush_err) \
|
|
+ x(EIO, journal_write_err) \
|
|
x(EIO, btree_node_read_err) \
|
|
x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
|
|
x(EIO, sb_not_downgraded) \
|
|
@@ -258,17 +285,53 @@
|
|
x(EIO, btree_node_read_validate_error) \
|
|
x(EIO, btree_need_topology_repair) \
|
|
x(EIO, bucket_ref_update) \
|
|
+ x(EIO, trigger_alloc) \
|
|
x(EIO, trigger_pointer) \
|
|
x(EIO, trigger_stripe_pointer) \
|
|
x(EIO, metadata_bucket_inconsistency) \
|
|
x(EIO, mark_stripe) \
|
|
x(EIO, stripe_reconstruct) \
|
|
x(EIO, key_type_error) \
|
|
- x(EIO, no_device_to_read_from) \
|
|
+ x(EIO, extent_poisoned) \
|
|
x(EIO, missing_indirect_extent) \
|
|
x(EIO, invalidate_stripe_to_dev) \
|
|
x(EIO, no_encryption_key) \
|
|
x(EIO, insufficient_journal_devices) \
|
|
+ x(EIO, device_offline) \
|
|
+ x(EIO, EIO_fault_injected) \
|
|
+ x(EIO, ec_block_read) \
|
|
+ x(EIO, ec_block_write) \
|
|
+ x(EIO, recompute_checksum) \
|
|
+ x(EIO, decompress) \
|
|
+ x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
|
|
+ x(BCH_ERR_decompress, decompress_lz4) \
|
|
+ x(BCH_ERR_decompress, decompress_gzip) \
|
|
+ x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
|
|
+ x(BCH_ERR_decompress, decompress_zstd) \
|
|
+ x(EIO, data_write) \
|
|
+ x(BCH_ERR_data_write, data_write_io) \
|
|
+ x(BCH_ERR_data_write, data_write_csum) \
|
|
+ x(BCH_ERR_data_write, data_write_invalid_ptr) \
|
|
+ x(BCH_ERR_data_write, data_write_misaligned) \
|
|
+ x(BCH_ERR_decompress, data_read) \
|
|
+ x(BCH_ERR_data_read, no_device_to_read_from) \
|
|
+ x(BCH_ERR_data_read, no_devices_valid) \
|
|
+ x(BCH_ERR_data_read, data_read_io_err) \
|
|
+ x(BCH_ERR_data_read, data_read_csum_err) \
|
|
+ x(BCH_ERR_data_read, data_read_retry) \
|
|
+ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
|
|
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
|
|
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
|
|
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
|
|
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
|
|
+ x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
|
|
+ x(BCH_ERR_data_read, data_read_decompress_err) \
|
|
+ x(BCH_ERR_data_read, data_read_decrypt_err) \
|
|
+ x(BCH_ERR_data_read, data_read_ptr_stale_race) \
|
|
+ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
|
|
+ x(BCH_ERR_data_read, data_read_no_encryption_key) \
|
|
+ x(BCH_ERR_data_read, data_read_buffer_too_small) \
|
|
+ x(BCH_ERR_data_read, data_read_key_overwritten) \
|
|
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
|
|
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
|
|
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
|
|
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
|
|
index 038da6a61f6b..c2cad28635bf 100644
|
|
--- a/fs/bcachefs/error.c
|
|
+++ b/fs/bcachefs/error.c
|
|
@@ -3,15 +3,24 @@
|
|
#include "btree_cache.h"
|
|
#include "btree_iter.h"
|
|
#include "error.h"
|
|
-#include "fs-common.h"
|
|
#include "journal.h"
|
|
+#include "namei.h"
|
|
#include "recovery_passes.h"
|
|
#include "super.h"
|
|
#include "thread_with_file.h"
|
|
|
|
#define FSCK_ERR_RATELIMIT_NR 10
|
|
|
|
-bool bch2_inconsistent_error(struct bch_fs *c)
|
|
+void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
|
|
+{
|
|
+ printbuf_indent_add_nextline(out, 2);
|
|
+
|
|
+#ifdef BCACHEFS_LOG_PREFIX
|
|
+ prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
|
|
+#endif
|
|
+}
|
|
+
|
|
+bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
|
|
{
|
|
set_bit(BCH_FS_error, &c->flags);
|
|
|
|
@@ -20,11 +29,10 @@ bool bch2_inconsistent_error(struct bch_fs *c)
|
|
return false;
|
|
case BCH_ON_ERROR_fix_safe:
|
|
case BCH_ON_ERROR_ro:
|
|
- if (bch2_fs_emergency_read_only(c))
|
|
- bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
|
|
- journal_cur_seq(&c->journal));
|
|
+ bch2_fs_emergency_read_only2(c, out);
|
|
return true;
|
|
case BCH_ON_ERROR_panic:
|
|
+ bch2_print_str(c, KERN_ERR, out->buf);
|
|
panic(bch2_fmt(c, "panic after error"));
|
|
return true;
|
|
default:
|
|
@@ -32,18 +40,91 @@ bool bch2_inconsistent_error(struct bch_fs *c)
|
|
}
|
|
}
|
|
|
|
-int bch2_topology_error(struct bch_fs *c)
|
|
+bool bch2_inconsistent_error(struct bch_fs *c)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ buf.atomic++;
|
|
+
|
|
+ printbuf_indent_add_nextline(&buf, 2);
|
|
+
|
|
+ bool ret = __bch2_inconsistent_error(c, &buf);
|
|
+ if (ret)
|
|
+ bch_err(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+__printf(3, 0)
|
|
+static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans,
|
|
+ const char *fmt, va_list args)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ buf.atomic++;
|
|
+
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_vprintf(&buf, fmt, args);
|
|
+ prt_newline(&buf);
|
|
+
|
|
+ if (trans)
|
|
+ bch2_trans_updates_to_text(&buf, trans);
|
|
+ bool ret = __bch2_inconsistent_error(c, &buf);
|
|
+ bch2_print_str_nonblocking(c, KERN_ERR, buf.buf);
|
|
+
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...)
|
|
+{
|
|
+ va_list args;
|
|
+ va_start(args, fmt);
|
|
+ bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args);
|
|
+ va_end(args);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...)
|
|
+{
|
|
+ va_list args;
|
|
+ va_start(args, fmt);
|
|
+ bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args);
|
|
+ va_end(args);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
|
|
{
|
|
+ prt_printf(out, "btree topology error: ");
|
|
+
|
|
set_bit(BCH_FS_topology_error, &c->flags);
|
|
- if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
|
|
- bch2_inconsistent_error(c);
|
|
+ if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
|
|
+ __bch2_inconsistent_error(c, out);
|
|
return -BCH_ERR_btree_need_topology_repair;
|
|
} else {
|
|
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
|
|
+ return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
|
|
-BCH_ERR_btree_node_read_validate_error;
|
|
}
|
|
}
|
|
|
|
+int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ va_list args;
|
|
+ va_start(args, fmt);
|
|
+ prt_vprintf(&buf, fmt, args);
|
|
+ va_end(args);
|
|
+
|
|
+ int ret = __bch2_topology_error(c, &buf);
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
void bch2_fatal_error(struct bch_fs *c)
|
|
{
|
|
if (bch2_fs_emergency_read_only(c))
|
|
@@ -54,25 +135,44 @@ void bch2_io_error_work(struct work_struct *work)
|
|
{
|
|
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
|
|
struct bch_fs *c = ca->fs;
|
|
- bool dev;
|
|
+
|
|
+ /* XXX: if it's reads or checksums that are failing, set it to failed */
|
|
|
|
down_write(&c->state_lock);
|
|
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
|
|
- BCH_FORCE_IF_DEGRADED);
|
|
- if (dev
|
|
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
|
|
- BCH_FORCE_IF_DEGRADED)
|
|
- : bch2_fs_emergency_read_only(c))
|
|
- bch_err(ca,
|
|
- "too many IO errors, setting %s RO",
|
|
+ unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
|
|
+
|
|
+ if (write_errors_start &&
|
|
+ time_after(jiffies,
|
|
+ write_errors_start + c->opts.write_error_timeout * HZ)) {
|
|
+ if (ca->mi.state >= BCH_MEMBER_STATE_ro)
|
|
+ goto out;
|
|
+
|
|
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
|
|
+ BCH_FORCE_IF_DEGRADED);
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ __bch2_log_msg_start(ca->name, &buf);
|
|
+
|
|
+ prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
|
|
+ c->opts.write_error_timeout,
|
|
dev ? "device" : "filesystem");
|
|
+ if (!dev)
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+out:
|
|
up_write(&c->state_lock);
|
|
}
|
|
|
|
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
|
|
{
|
|
atomic64_inc(&ca->errors[type]);
|
|
- //queue_work(system_long_wq, &ca->io_error_work);
|
|
+
|
|
+ if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
|
|
+ ca->write_errors_start = jiffies;
|
|
+
|
|
+ queue_work(system_long_wq, &ca->io_error_work);
|
|
}
|
|
|
|
enum ask_yn {
|
|
@@ -168,15 +268,13 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
|
|
|
|
#endif
|
|
|
|
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
|
|
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
|
|
+ enum bch_sb_error_id id)
|
|
{
|
|
struct fsck_err_state *s;
|
|
|
|
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
|
|
- return NULL;
|
|
-
|
|
list_for_each_entry(s, &c->fsck_error_msgs, list)
|
|
- if (s->fmt == fmt) {
|
|
+ if (s->id == id) {
|
|
/*
|
|
* move it to the head of the list: repeated fsck errors
|
|
* are common
|
|
@@ -194,7 +292,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
|
|
}
|
|
|
|
INIT_LIST_HEAD(&s->list);
|
|
- s->fmt = fmt;
|
|
+ s->id = id;
|
|
list_add(&s->list, &c->fsck_error_msgs);
|
|
return s;
|
|
}
|
|
@@ -231,7 +329,7 @@ static int do_fsck_ask_yn(struct bch_fs *c,
|
|
if (bch2_fs_stdio_redirect(c))
|
|
bch2_print(c, "%s", question->buf);
|
|
else
|
|
- bch2_print_string_as_lines(KERN_ERR, question->buf);
|
|
+ bch2_print_str(c, KERN_ERR, question->buf);
|
|
|
|
int ask = bch2_fsck_ask_yn(c, trans);
|
|
|
|
@@ -244,15 +342,107 @@ static int do_fsck_ask_yn(struct bch_fs *c,
|
|
return ask;
|
|
}
|
|
|
|
+static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c,
|
|
+ enum bch_sb_error_id id, const char *msg,
|
|
+ bool *repeat, bool *print, bool *suppress)
|
|
+{
|
|
+ bch2_sb_error_count(c, id);
|
|
+
|
|
+ struct fsck_err_state *s = fsck_err_get(c, id);
|
|
+ if (s) {
|
|
+ /*
|
|
+ * We may be called multiple times for the same error on
|
|
+ * transaction restart - this memoizes instead of asking the user
|
|
+ * multiple times for the same error:
|
|
+ */
|
|
+ if (s->last_msg && !strcmp(msg, s->last_msg)) {
|
|
+ *repeat = true;
|
|
+ *print = false;
|
|
+ return s;
|
|
+ }
|
|
+
|
|
+ kfree(s->last_msg);
|
|
+ s->last_msg = kstrdup(msg, GFP_KERNEL);
|
|
+
|
|
+ if (c->opts.ratelimit_errors &&
|
|
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
|
|
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
|
|
+ *suppress = true;
|
|
+ else
|
|
+ *print = false;
|
|
+ }
|
|
+
|
|
+ s->nr++;
|
|
+ }
|
|
+ return s;
|
|
+}
|
|
+
|
|
+bool __bch2_count_fsck_err(struct bch_fs *c,
|
|
+ enum bch_sb_error_id id, struct printbuf *msg)
|
|
+{
|
|
+ bch2_sb_error_count(c, id);
|
|
+
|
|
+ mutex_lock(&c->fsck_error_msgs_lock);
|
|
+ bool print = true, repeat = false, suppress = false;
|
|
+
|
|
+ count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress);
|
|
+ mutex_unlock(&c->fsck_error_msgs_lock);
|
|
+
|
|
+ if (suppress)
|
|
+ prt_printf(msg, "Ratelimiting new instances of previous error\n");
|
|
+
|
|
+ return print && !repeat;
|
|
+}
|
|
+
|
|
+int bch2_fsck_err_opt(struct bch_fs *c,
|
|
+ enum bch_fsck_flags flags,
|
|
+ enum bch_sb_error_id err)
|
|
+{
|
|
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
|
|
+ flags |= fsck_flags_extra[err];
|
|
+
|
|
+ if (test_bit(BCH_FS_in_fsck, &c->flags)) {
|
|
+ if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
|
|
+ return -BCH_ERR_fsck_repair_unimplemented;
|
|
+
|
|
+ switch (c->opts.fix_errors) {
|
|
+ case FSCK_FIX_exit:
|
|
+ return -BCH_ERR_fsck_errors_not_fixed;
|
|
+ case FSCK_FIX_yes:
|
|
+ if (flags & FSCK_CAN_FIX)
|
|
+ return -BCH_ERR_fsck_fix;
|
|
+ fallthrough;
|
|
+ case FSCK_FIX_no:
|
|
+ if (flags & FSCK_CAN_IGNORE)
|
|
+ return -BCH_ERR_fsck_ignore;
|
|
+ return -BCH_ERR_fsck_errors_not_fixed;
|
|
+ case FSCK_FIX_ask:
|
|
+ if (flags & FSCK_AUTOFIX)
|
|
+ return -BCH_ERR_fsck_fix;
|
|
+ return -BCH_ERR_fsck_ask;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+ } else {
|
|
+ if ((flags & FSCK_AUTOFIX) &&
|
|
+ (c->opts.errors == BCH_ON_ERROR_continue ||
|
|
+ c->opts.errors == BCH_ON_ERROR_fix_safe))
|
|
+ return -BCH_ERR_fsck_fix;
|
|
+
|
|
+ if (c->opts.errors == BCH_ON_ERROR_continue &&
|
|
+ (flags & FSCK_CAN_IGNORE))
|
|
+ return -BCH_ERR_fsck_ignore;
|
|
+ return -BCH_ERR_fsck_errors_not_fixed;
|
|
+ }
|
|
+}
|
|
+
|
|
int __bch2_fsck_err(struct bch_fs *c,
|
|
struct btree_trans *trans,
|
|
enum bch_fsck_flags flags,
|
|
enum bch_sb_error_id err,
|
|
const char *fmt, ...)
|
|
{
|
|
- struct fsck_err_state *s = NULL;
|
|
va_list args;
|
|
- bool print = true, suppressing = false, inconsistent = false, exiting = false;
|
|
struct printbuf buf = PRINTBUF, *out = &buf;
|
|
int ret = -BCH_ERR_fsck_ignore;
|
|
const char *action_orig = "fix?", *action = action_orig;
|
|
@@ -287,7 +477,12 @@ int __bch2_fsck_err(struct bch_fs *c,
|
|
? -BCH_ERR_fsck_fix
|
|
: -BCH_ERR_fsck_ignore;
|
|
|
|
- bch2_sb_error_count(c, err);
|
|
+ printbuf_indent_add_nextline(out, 2);
|
|
+
|
|
+#ifdef BCACHEFS_LOG_PREFIX
|
|
+ if (strncmp(fmt, "bcachefs", 8))
|
|
+ prt_printf(out, bch2_log_msg(c, ""));
|
|
+#endif
|
|
|
|
va_start(args, fmt);
|
|
prt_vprintf(out, fmt, args);
|
|
@@ -307,42 +502,15 @@ int __bch2_fsck_err(struct bch_fs *c,
|
|
}
|
|
|
|
mutex_lock(&c->fsck_error_msgs_lock);
|
|
- s = fsck_err_get(c, fmt);
|
|
- if (s) {
|
|
- /*
|
|
- * We may be called multiple times for the same error on
|
|
- * transaction restart - this memoizes instead of asking the user
|
|
- * multiple times for the same error:
|
|
- */
|
|
- if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
|
|
- ret = s->ret;
|
|
- goto err_unlock;
|
|
- }
|
|
-
|
|
- kfree(s->last_msg);
|
|
- s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
|
|
- if (!s->last_msg) {
|
|
- ret = -ENOMEM;
|
|
- goto err_unlock;
|
|
- }
|
|
-
|
|
- if (c->opts.ratelimit_errors &&
|
|
- !(flags & FSCK_NO_RATELIMIT) &&
|
|
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
|
|
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
|
|
- suppressing = true;
|
|
- else
|
|
- print = false;
|
|
- }
|
|
-
|
|
- s->nr++;
|
|
+ bool repeat = false, print = true, suppress = false;
|
|
+ bool inconsistent = false, exiting = false;
|
|
+ struct fsck_err_state *s =
|
|
+ count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress);
|
|
+ if (repeat) {
|
|
+ ret = s->ret;
|
|
+ goto err_unlock;
|
|
}
|
|
|
|
-#ifdef BCACHEFS_LOG_PREFIX
|
|
- if (!strncmp(fmt, "bcachefs:", 9))
|
|
- prt_printf(out, bch2_log_msg(c, ""));
|
|
-#endif
|
|
-
|
|
if ((flags & FSCK_AUTOFIX) &&
|
|
(c->opts.errors == BCH_ON_ERROR_continue ||
|
|
c->opts.errors == BCH_ON_ERROR_fix_safe)) {
|
|
@@ -356,11 +524,14 @@ int __bch2_fsck_err(struct bch_fs *c,
|
|
}
|
|
|
|
goto print;
|
|
- } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
|
|
+ } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) {
|
|
if (c->opts.errors != BCH_ON_ERROR_continue ||
|
|
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
|
|
- prt_str(out, ", shutting down");
|
|
+ prt_str_indented(out, ", shutting down\n"
|
|
+ "error not marked as autofix and not in fsck\n"
|
|
+ "run fsck, and forward to devs so error can be marked for self-healing");
|
|
inconsistent = true;
|
|
+ print = true;
|
|
ret = -BCH_ERR_fsck_errors_not_fixed;
|
|
} else if (flags & FSCK_CAN_FIX) {
|
|
prt_str(out, ", ");
|
|
@@ -412,31 +583,37 @@ int __bch2_fsck_err(struct bch_fs *c,
|
|
!(flags & FSCK_CAN_IGNORE)))
|
|
ret = -BCH_ERR_fsck_errors_not_fixed;
|
|
|
|
- if (test_bit(BCH_FS_fsck_running, &c->flags) &&
|
|
+ if (test_bit(BCH_FS_in_fsck, &c->flags) &&
|
|
(ret != -BCH_ERR_fsck_fix &&
|
|
ret != -BCH_ERR_fsck_ignore)) {
|
|
exiting = true;
|
|
print = true;
|
|
}
|
|
print:
|
|
+ prt_newline(out);
|
|
+
|
|
+ if (inconsistent)
|
|
+ __bch2_inconsistent_error(c, out);
|
|
+ else if (exiting)
|
|
+ prt_printf(out, "Unable to continue, halting\n");
|
|
+ else if (suppress)
|
|
+ prt_printf(out, "Ratelimiting new instances of previous error\n");
|
|
+
|
|
if (print) {
|
|
+ /* possibly strip an empty line, from printbuf_indent_add */
|
|
+ while (out->pos && out->buf[out->pos - 1] == ' ')
|
|
+ --out->pos;
|
|
+ printbuf_nul_terminate(out);
|
|
+
|
|
if (bch2_fs_stdio_redirect(c))
|
|
- bch2_print(c, "%s\n", out->buf);
|
|
+ bch2_print(c, "%s", out->buf);
|
|
else
|
|
- bch2_print_string_as_lines(KERN_ERR, out->buf);
|
|
+ bch2_print_str(c, KERN_ERR, out->buf);
|
|
}
|
|
|
|
- if (exiting)
|
|
- bch_err(c, "Unable to continue, halting");
|
|
- else if (suppressing)
|
|
- bch_err(c, "Ratelimiting new instances of previous error");
|
|
-
|
|
if (s)
|
|
s->ret = ret;
|
|
|
|
- if (inconsistent)
|
|
- bch2_inconsistent_error(c);
|
|
-
|
|
/*
|
|
* We don't yet track whether the filesystem currently has errors, for
|
|
* log_fsck_err()s: that would require us to track for every error type
|
|
@@ -498,29 +675,27 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
|
|
prt_printf(&buf, " level=%u: ", from.level);
|
|
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
- prt_str(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
|
|
va_list args;
|
|
va_start(args, fmt);
|
|
prt_vprintf(&buf, fmt, args);
|
|
va_end(args);
|
|
|
|
- prt_str(&buf, ": delete?");
|
|
-
|
|
- int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
|
|
+ int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf);
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
}
|
|
|
|
-void bch2_flush_fsck_errs(struct bch_fs *c)
|
|
+static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
|
|
{
|
|
struct fsck_err_state *s, *n;
|
|
|
|
mutex_lock(&c->fsck_error_msgs_lock);
|
|
|
|
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
|
|
- if (s->ratelimited && s->last_msg)
|
|
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
|
|
+ if (print && s->ratelimited && s->last_msg)
|
|
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
|
|
|
|
list_del(&s->list);
|
|
kfree(s->last_msg);
|
|
@@ -530,35 +705,53 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
|
|
mutex_unlock(&c->fsck_error_msgs_lock);
|
|
}
|
|
|
|
-int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
|
|
+void bch2_flush_fsck_errs(struct bch_fs *c)
|
|
+{
|
|
+ __bch2_flush_fsck_errs(c, true);
|
|
+}
|
|
+
|
|
+void bch2_free_fsck_errs(struct bch_fs *c)
|
|
+{
|
|
+ __bch2_flush_fsck_errs(c, false);
|
|
+}
|
|
+
|
|
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
|
|
+ subvol_inum inum, u64 offset)
|
|
{
|
|
u32 restart_count = trans->restart_count;
|
|
int ret = 0;
|
|
|
|
- /* XXX: we don't yet attempt to print paths when we don't know the subvol */
|
|
- if (inum.subvol)
|
|
- ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
|
|
+ if (inum.subvol) {
|
|
+ ret = bch2_inum_to_path(trans, inum, out);
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
+ return ret;
|
|
+ }
|
|
if (!inum.subvol || ret)
|
|
prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
|
|
+ prt_printf(out, " offset %llu: ", offset);
|
|
|
|
return trans_was_restarted(trans, restart_count);
|
|
}
|
|
|
|
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
|
|
- subvol_inum inum, u64 offset)
|
|
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
|
|
+ subvol_inum inum, u64 offset)
|
|
{
|
|
- int ret = bch2_inum_err_msg_trans(trans, out, inum);
|
|
- prt_printf(out, " offset %llu: ", offset);
|
|
- return ret;
|
|
+ bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
|
|
}
|
|
|
|
-void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
|
|
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
|
|
+ struct bpos pos)
|
|
{
|
|
- bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
|
|
+ int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ prt_printf(out, " offset %llu: ", pos.offset << 8);
|
|
+ return 0;
|
|
}
|
|
|
|
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
|
|
- subvol_inum inum, u64 offset)
|
|
+void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
|
|
+ struct bpos pos)
|
|
{
|
|
- bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
|
|
+ bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
|
|
}
|
|
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
|
|
index 7acf2a27ca28..5123d4c86770 100644
|
|
--- a/fs/bcachefs/error.h
|
|
+++ b/fs/bcachefs/error.h
|
|
@@ -18,6 +18,13 @@ struct work_struct;
|
|
|
|
/* Error messages: */
|
|
|
|
+void __bch2_log_msg_start(const char *, struct printbuf *);
|
|
+
|
|
+static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
|
|
+{
|
|
+ __bch2_log_msg_start(c->name, out);
|
|
+}
|
|
+
|
|
/*
|
|
* Inconsistency errors: The on disk data is inconsistent. If these occur during
|
|
* initial recovery, they don't indicate a bug in the running code - we walk all
|
|
@@ -29,21 +36,10 @@ struct work_struct;
|
|
* BCH_ON_ERROR_CONTINUE mode
|
|
*/
|
|
|
|
+bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *);
|
|
bool bch2_inconsistent_error(struct bch_fs *);
|
|
-
|
|
-int bch2_topology_error(struct bch_fs *);
|
|
-
|
|
-#define bch2_fs_topology_error(c, ...) \
|
|
-({ \
|
|
- bch_err(c, "btree topology error: " __VA_ARGS__); \
|
|
- bch2_topology_error(c); \
|
|
-})
|
|
-
|
|
-#define bch2_fs_inconsistent(c, ...) \
|
|
-({ \
|
|
- bch_err(c, __VA_ARGS__); \
|
|
- bch2_inconsistent_error(c); \
|
|
-})
|
|
+__printf(2, 3)
|
|
+bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...);
|
|
|
|
#define bch2_fs_inconsistent_on(cond, ...) \
|
|
({ \
|
|
@@ -53,26 +49,21 @@ int bch2_topology_error(struct bch_fs *);
|
|
_ret; \
|
|
})
|
|
|
|
-/*
|
|
- * When a transaction update discovers or is causing a fs inconsistency, it's
|
|
- * helpful to also dump the pending updates:
|
|
- */
|
|
-#define bch2_trans_inconsistent(trans, ...) \
|
|
-({ \
|
|
- bch_err(trans->c, __VA_ARGS__); \
|
|
- bch2_dump_trans_updates(trans); \
|
|
- bch2_inconsistent_error(trans->c); \
|
|
-})
|
|
+__printf(2, 3)
|
|
+bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...);
|
|
|
|
-#define bch2_trans_inconsistent_on(cond, trans, ...) \
|
|
+#define bch2_trans_inconsistent_on(cond, ...) \
|
|
({ \
|
|
bool _ret = unlikely(!!(cond)); \
|
|
- \
|
|
if (_ret) \
|
|
- bch2_trans_inconsistent(trans, __VA_ARGS__); \
|
|
+ bch2_trans_inconsistent(__VA_ARGS__); \
|
|
_ret; \
|
|
})
|
|
|
|
+int __bch2_topology_error(struct bch_fs *, struct printbuf *);
|
|
+__printf(2, 3)
|
|
+int bch2_fs_topology_error(struct bch_fs *, const char *, ...);
|
|
+
|
|
/*
|
|
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
|
|
* be able to repair:
|
|
@@ -80,7 +71,7 @@ int bch2_topology_error(struct bch_fs *);
|
|
|
|
struct fsck_err_state {
|
|
struct list_head list;
|
|
- const char *fmt;
|
|
+ enum bch_sb_error_id id;
|
|
u64 nr;
|
|
bool ratelimited;
|
|
int ret;
|
|
@@ -90,6 +81,14 @@ struct fsck_err_state {
|
|
|
|
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
|
|
|
|
+bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *);
|
|
+#define bch2_count_fsck_err(_c, _err, ...) \
|
|
+ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
|
|
+
|
|
+int bch2_fsck_err_opt(struct bch_fs *,
|
|
+ enum bch_fsck_flags,
|
|
+ enum bch_sb_error_id);
|
|
+
|
|
__printf(5, 6) __cold
|
|
int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
|
|
enum bch_fsck_flags,
|
|
@@ -101,6 +100,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
|
|
_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
|
|
|
|
void bch2_flush_fsck_errs(struct bch_fs *);
|
|
+void bch2_free_fsck_errs(struct bch_fs *);
|
|
|
|
#define fsck_err_wrap(_do) \
|
|
({ \
|
|
@@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *);
|
|
/* Does the error handling without logging a message */
|
|
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
|
|
|
|
-#define bch2_dev_io_err_on(cond, ca, _type, ...) \
|
|
-({ \
|
|
- bool _ret = (cond); \
|
|
- \
|
|
- if (_ret) { \
|
|
- bch_err_dev_ratelimited(ca, __VA_ARGS__); \
|
|
- bch2_io_error(ca, _type); \
|
|
- } \
|
|
- _ret; \
|
|
-})
|
|
-
|
|
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
|
|
-({ \
|
|
- bool _ret = (cond); \
|
|
- \
|
|
- if (_ret) { \
|
|
- bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
|
|
- bch2_io_error(ca, _type); \
|
|
- } \
|
|
- _ret; \
|
|
-})
|
|
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
|
+void bch2_latency_acct(struct bch_dev *, u64, int);
|
|
+#else
|
|
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
|
|
+#endif
|
|
+
|
|
+static inline void bch2_account_io_success_fail(struct bch_dev *ca,
|
|
+ enum bch_member_error_type type,
|
|
+ bool success)
|
|
+{
|
|
+ if (likely(success)) {
|
|
+ if (type == BCH_MEMBER_ERROR_write &&
|
|
+ ca->write_errors_start)
|
|
+ ca->write_errors_start = 0;
|
|
+ } else {
|
|
+ bch2_io_error(ca, type);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void bch2_account_io_completion(struct bch_dev *ca,
|
|
+ enum bch_member_error_type type,
|
|
+ u64 submit_time, bool success)
|
|
+{
|
|
+ if (unlikely(!ca))
|
|
+ return;
|
|
+
|
|
+ if (type != BCH_MEMBER_ERROR_checksum)
|
|
+ bch2_latency_acct(ca, submit_time, type);
|
|
+
|
|
+ bch2_account_io_success_fail(ca, type, success);
|
|
+}
|
|
|
|
-int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
|
|
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
|
|
|
|
-void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
|
|
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
|
|
|
|
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
|
|
+void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
|
|
+
|
|
#endif /* _BCACHEFS_ERROR_H */
|
|
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
|
|
index 6aac579a692a..b899ee75f5b9 100644
|
|
--- a/fs/bcachefs/extent_update.c
|
|
+++ b/fs/bcachefs/extent_update.c
|
|
@@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
|
|
return lru + ret * 2;
|
|
}
|
|
|
|
+#define EXTENT_ITERS_MAX 64
|
|
+
|
|
static int count_iters_for_insert(struct btree_trans *trans,
|
|
struct bkey_s_c k,
|
|
unsigned offset,
|
|
struct bpos *end,
|
|
- unsigned *nr_iters,
|
|
- unsigned max_iters)
|
|
+ unsigned *nr_iters)
|
|
{
|
|
int ret = 0, ret2 = 0;
|
|
|
|
- if (*nr_iters >= max_iters) {
|
|
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
|
|
*end = bpos_min(*end, k.k->p);
|
|
ret = 1;
|
|
}
|
|
@@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
|
|
case KEY_TYPE_reflink_v:
|
|
*nr_iters += bch2_bkey_nr_alloc_ptrs(k);
|
|
|
|
- if (*nr_iters >= max_iters) {
|
|
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
|
|
*end = bpos_min(*end, k.k->p);
|
|
ret = 1;
|
|
}
|
|
@@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
|
|
|
|
*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
|
|
|
|
- if (*nr_iters >= max_iters) {
|
|
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
|
|
struct bpos pos = bkey_start_pos(k.k);
|
|
pos.offset += min_t(u64, k.k->size,
|
|
r_k.k->p.offset - idx);
|
|
@@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans,
|
|
return ret2 ?: ret;
|
|
}
|
|
|
|
-#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
|
|
-
|
|
int bch2_extent_atomic_end(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
- struct bkey_i *insert,
|
|
struct bpos *end)
|
|
{
|
|
- struct btree_iter copy;
|
|
- struct bkey_s_c k;
|
|
unsigned nr_iters = 0;
|
|
- int ret;
|
|
-
|
|
- ret = bch2_btree_iter_traverse(iter);
|
|
- if (ret)
|
|
- return ret;
|
|
-
|
|
- *end = insert->k.p;
|
|
|
|
- /* extent_update_to_keys(): */
|
|
- nr_iters += 1;
|
|
-
|
|
- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
|
|
- &nr_iters, EXTENT_ITERS_MAX / 2);
|
|
- if (ret < 0)
|
|
- return ret;
|
|
+ struct btree_iter copy;
|
|
+ bch2_trans_copy_iter(trans, ©, iter);
|
|
|
|
- bch2_trans_copy_iter(©, iter);
|
|
+ int ret = bch2_btree_iter_traverse(trans, ©);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
|
|
+ struct bkey_s_c k;
|
|
+ for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) {
|
|
unsigned offset = 0;
|
|
|
|
- if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
|
|
- offset = bkey_start_offset(&insert->k) -
|
|
- bkey_start_offset(k.k);
|
|
+ if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
|
|
+ offset = iter->pos.offset - bkey_start_offset(k.k);
|
|
|
|
- /* extent_handle_overwrites(): */
|
|
- switch (bch2_extent_overlap(&insert->k, k.k)) {
|
|
- case BCH_EXTENT_OVERLAP_ALL:
|
|
- case BCH_EXTENT_OVERLAP_FRONT:
|
|
- nr_iters += 1;
|
|
- break;
|
|
- case BCH_EXTENT_OVERLAP_BACK:
|
|
- case BCH_EXTENT_OVERLAP_MIDDLE:
|
|
- nr_iters += 2;
|
|
- break;
|
|
- }
|
|
-
|
|
- ret = count_iters_for_insert(trans, k, offset, end,
|
|
- &nr_iters, EXTENT_ITERS_MAX);
|
|
+ ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
|
|
if (ret)
|
|
break;
|
|
}
|
|
-
|
|
+err:
|
|
bch2_trans_iter_exit(trans, ©);
|
|
return ret < 0 ? ret : 0;
|
|
}
|
|
@@ -161,10 +134,8 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct bkey_i *k)
|
|
{
|
|
- struct bpos end;
|
|
- int ret;
|
|
-
|
|
- ret = bch2_extent_atomic_end(trans, iter, k, &end);
|
|
+ struct bpos end = k->k.p;
|
|
+ int ret = bch2_extent_atomic_end(trans, iter, &end);
|
|
if (ret)
|
|
return ret;
|
|
|
|
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
|
|
index 6f5cf449361a..34467db53f45 100644
|
|
--- a/fs/bcachefs/extent_update.h
|
|
+++ b/fs/bcachefs/extent_update.h
|
|
@@ -5,7 +5,7 @@
|
|
#include "bcachefs.h"
|
|
|
|
int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
|
|
- struct bkey_i *, struct bpos *);
|
|
+ struct bpos *);
|
|
int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
|
|
struct bkey_i *);
|
|
|
|
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
|
|
index 2d8042f853dc..1ac9897f189d 100644
|
|
--- a/fs/bcachefs/extents.c
|
|
+++ b/fs/bcachefs/extents.c
|
|
@@ -28,6 +28,13 @@
|
|
#include "trace.h"
|
|
#include "util.h"
|
|
|
|
+static const char * const bch2_extent_flags_strs[] = {
|
|
+#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
|
|
+ BCH_EXTENT_FLAGS()
|
|
+#undef x
|
|
+ NULL,
|
|
+};
|
|
+
|
|
static unsigned bch2_crc_field_size_max[] = {
|
|
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
|
|
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
|
|
@@ -38,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
|
|
struct bch_extent_crc_unpacked,
|
|
enum bch_extent_entry_type);
|
|
|
|
+void bch2_io_failures_to_text(struct printbuf *out,
|
|
+ struct bch_fs *c,
|
|
+ struct bch_io_failures *failed)
|
|
+{
|
|
+ static const char * const error_types[] = {
|
|
+ "io", "checksum", "ec reconstruct", NULL
|
|
+ };
|
|
+
|
|
+ for (struct bch_dev_io_failures *f = failed->devs;
|
|
+ f < failed->devs + failed->nr;
|
|
+ f++) {
|
|
+ unsigned errflags =
|
|
+ ((!!f->failed_io) << 0) |
|
|
+ ((!!f->failed_csum_nr) << 1) |
|
|
+ ((!!f->failed_ec) << 2);
|
|
+
|
|
+ if (!errflags)
|
|
+ continue;
|
|
+
|
|
+ bch2_printbuf_make_room(out, 1024);
|
|
+ rcu_read_lock();
|
|
+ out->atomic++;
|
|
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
|
|
+ if (ca)
|
|
+ prt_str(out, ca->name);
|
|
+ else
|
|
+ prt_printf(out, "(invalid device %u)", f->dev);
|
|
+ --out->atomic;
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ prt_char(out, ' ');
|
|
+
|
|
+ if (is_power_of_2(errflags)) {
|
|
+ prt_bitflags(out, error_types, errflags);
|
|
+ prt_str(out, " error");
|
|
+ } else {
|
|
+ prt_str(out, "errors: ");
|
|
+ prt_bitflags(out, error_types, errflags);
|
|
+ }
|
|
+ prt_newline(out);
|
|
+ }
|
|
+}
|
|
+
|
|
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
|
|
unsigned dev)
|
|
{
|
|
@@ -51,7 +101,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
|
|
}
|
|
|
|
void bch2_mark_io_failure(struct bch_io_failures *failed,
|
|
- struct extent_ptr_decoded *p)
|
|
+ struct extent_ptr_decoded *p,
|
|
+ bool csum_error)
|
|
{
|
|
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
|
|
|
|
@@ -59,53 +110,73 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
|
|
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
|
|
|
|
f = &failed->devs[failed->nr++];
|
|
- f->dev = p->ptr.dev;
|
|
- f->idx = p->idx;
|
|
- f->nr_failed = 1;
|
|
- f->nr_retries = 0;
|
|
- } else if (p->idx != f->idx) {
|
|
- f->idx = p->idx;
|
|
- f->nr_failed = 1;
|
|
- f->nr_retries = 0;
|
|
- } else {
|
|
- f->nr_failed++;
|
|
+ memset(f, 0, sizeof(*f));
|
|
+ f->dev = p->ptr.dev;
|
|
+ }
|
|
+
|
|
+ if (p->do_ec_reconstruct)
|
|
+ f->failed_ec = true;
|
|
+ else if (!csum_error)
|
|
+ f->failed_io = true;
|
|
+ else
|
|
+ f->failed_csum_nr++;
|
|
+}
|
|
+
|
|
+void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
|
|
+ unsigned dev)
|
|
+{
|
|
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
|
|
+
|
|
+ if (!f) {
|
|
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
|
|
+
|
|
+ f = &failed->devs[failed->nr++];
|
|
+ memset(f, 0, sizeof(*f));
|
|
+ f->dev = dev;
|
|
}
|
|
+
|
|
+ f->failed_btree_validate = true;
|
|
}
|
|
|
|
-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
|
|
+static inline u64 dev_latency(struct bch_dev *ca)
|
|
{
|
|
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
|
|
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
|
|
}
|
|
|
|
+static inline int dev_failed(struct bch_dev *ca)
|
|
+{
|
|
+ return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
|
|
+}
|
|
+
|
|
/*
|
|
* returns true if p1 is better than p2:
|
|
*/
|
|
static inline bool ptr_better(struct bch_fs *c,
|
|
const struct extent_ptr_decoded p1,
|
|
- const struct extent_ptr_decoded p2)
|
|
+ u64 p1_latency,
|
|
+ struct bch_dev *ca1,
|
|
+ const struct extent_ptr_decoded p2,
|
|
+ u64 p2_latency)
|
|
{
|
|
- if (likely(!p1.idx && !p2.idx)) {
|
|
- u64 l1 = dev_latency(c, p1.ptr.dev);
|
|
- u64 l2 = dev_latency(c, p2.ptr.dev);
|
|
+ struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
|
|
|
|
- /*
|
|
- * Square the latencies, to bias more in favor of the faster
|
|
- * device - we never want to stop issuing reads to the slower
|
|
- * device altogether, so that we can update our latency numbers:
|
|
- */
|
|
- l1 *= l1;
|
|
- l2 *= l2;
|
|
+ int failed_delta = dev_failed(ca1) - dev_failed(ca2);
|
|
+ if (unlikely(failed_delta))
|
|
+ return failed_delta < 0;
|
|
|
|
- /* Pick at random, biased in favor of the faster device: */
|
|
+ if (static_branch_unlikely(&bch2_force_reconstruct_read))
|
|
+ return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
|
|
|
|
- return bch2_get_random_u64_below(l1 + l2) > l1;
|
|
- }
|
|
+ if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
|
|
+ return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
|
|
|
|
- if (bch2_force_reconstruct_read)
|
|
- return p1.idx > p2.idx;
|
|
+ int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
|
|
+ if (unlikely(crc_retry_delta))
|
|
+ return crc_retry_delta < 0;
|
|
|
|
- return p1.idx < p2.idx;
|
|
+ /* Pick at random, biased in favor of the faster device: */
|
|
+
|
|
+ return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
|
|
}
|
|
|
|
/*
|
|
@@ -115,64 +186,117 @@ static inline bool ptr_better(struct bch_fs *c,
|
|
*/
|
|
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
|
|
struct bch_io_failures *failed,
|
|
- struct extent_ptr_decoded *pick)
|
|
+ struct extent_ptr_decoded *pick,
|
|
+ int dev)
|
|
{
|
|
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
- const union bch_extent_entry *entry;
|
|
- struct extent_ptr_decoded p;
|
|
- struct bch_dev_io_failures *f;
|
|
- int ret = 0;
|
|
+ bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
|
|
+ bool have_dirty_ptrs = false, have_pick = false;
|
|
|
|
if (k.k->type == KEY_TYPE_error)
|
|
return -BCH_ERR_key_type_error;
|
|
|
|
rcu_read_lock();
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ u64 pick_latency;
|
|
+
|
|
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ have_dirty_ptrs |= !p.ptr.cached;
|
|
+
|
|
/*
|
|
* Unwritten extent: no need to actually read, treat it as a
|
|
* hole and return 0s:
|
|
*/
|
|
if (p.ptr.unwritten) {
|
|
- ret = 0;
|
|
- break;
|
|
+ rcu_read_unlock();
|
|
+ return 0;
|
|
}
|
|
|
|
- /*
|
|
- * If there are any dirty pointers it's an error if we can't
|
|
- * read:
|
|
- */
|
|
- if (!ret && !p.ptr.cached)
|
|
- ret = -BCH_ERR_no_device_to_read_from;
|
|
+ /* Are we being asked to read from a specific device? */
|
|
+ if (dev >= 0 && p.ptr.dev != dev)
|
|
+ continue;
|
|
+
|
|
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
|
|
|
|
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
|
|
+ if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
|
|
+ rcu_read_unlock();
|
|
+ int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ rcu_read_lock();
|
|
+ }
|
|
|
|
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
|
|
continue;
|
|
|
|
- f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
|
|
- if (f)
|
|
- p.idx = f->nr_failed < f->nr_retries
|
|
- ? f->idx
|
|
- : f->idx + 1;
|
|
+ struct bch_dev_io_failures *f =
|
|
+ unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
|
|
+ if (unlikely(f)) {
|
|
+ p.crc_retry_nr = f->failed_csum_nr;
|
|
+ p.has_ec &= ~f->failed_ec;
|
|
|
|
- if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
|
|
- p.idx++;
|
|
+ if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
|
|
+ have_io_errors |= f->failed_io;
|
|
+ have_io_errors |= f->failed_btree_validate;
|
|
+ have_io_errors |= f->failed_ec;
|
|
+ }
|
|
+ have_csum_errors |= !!f->failed_csum_nr;
|
|
+
|
|
+ if (p.has_ec && (f->failed_io || f->failed_csum_nr))
|
|
+ p.do_ec_reconstruct = true;
|
|
+ else if (f->failed_io ||
|
|
+ f->failed_btree_validate ||
|
|
+ f->failed_csum_nr > c->opts.checksum_err_retry_nr)
|
|
+ continue;
|
|
+ }
|
|
|
|
- if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
|
|
- p.idx++;
|
|
+ have_missing_devs |= ca && !bch2_dev_is_online(ca);
|
|
|
|
- if (p.idx > (unsigned) p.has_ec)
|
|
- continue;
|
|
+ if (!ca || !bch2_dev_is_online(ca)) {
|
|
+ if (!p.has_ec)
|
|
+ continue;
|
|
+ p.do_ec_reconstruct = true;
|
|
+ }
|
|
|
|
- if (ret > 0 && !ptr_better(c, p, *pick))
|
|
- continue;
|
|
+ if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
|
|
+ p.do_ec_reconstruct = true;
|
|
|
|
- *pick = p;
|
|
- ret = 1;
|
|
+ u64 p_latency = dev_latency(ca);
|
|
+ /*
|
|
+ * Square the latencies, to bias more in favor of the faster
|
|
+ * device - we never want to stop issuing reads to the slower
|
|
+ * device altogether, so that we can update our latency numbers:
|
|
+ */
|
|
+ p_latency *= p_latency;
|
|
+
|
|
+ if (!have_pick ||
|
|
+ ptr_better(c,
|
|
+ p, p_latency, ca,
|
|
+ *pick, pick_latency)) {
|
|
+ *pick = p;
|
|
+ pick_latency = p_latency;
|
|
+ have_pick = true;
|
|
+ }
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
- return ret;
|
|
+ if (have_pick)
|
|
+ return 1;
|
|
+ if (!have_dirty_ptrs)
|
|
+ return 0;
|
|
+ if (have_missing_devs)
|
|
+ return -BCH_ERR_no_device_to_read_from;
|
|
+ if (have_csum_errors)
|
|
+ return -BCH_ERR_data_read_csum_err;
|
|
+ if (have_io_errors)
|
|
+ return -BCH_ERR_data_read_io_err;
|
|
+
|
|
+ /*
|
|
+ * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
|
|
+ * but they don't point to valid devices:
|
|
+ */
|
|
+ return -BCH_ERR_no_devices_valid;
|
|
}
|
|
|
|
/* KEY_TYPE_btree_ptr: */
|
|
@@ -536,29 +660,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
|
|
struct bch_extent_crc_unpacked src,
|
|
enum bch_extent_entry_type type)
|
|
{
|
|
-#define set_common_fields(_dst, _src) \
|
|
- _dst.type = 1 << type; \
|
|
- _dst.csum_type = _src.csum_type, \
|
|
- _dst.compression_type = _src.compression_type, \
|
|
- _dst._compressed_size = _src.compressed_size - 1, \
|
|
- _dst._uncompressed_size = _src.uncompressed_size - 1, \
|
|
- _dst.offset = _src.offset
|
|
+#define common_fields(_src) \
|
|
+ .type = BIT(type), \
|
|
+ .csum_type = _src.csum_type, \
|
|
+ .compression_type = _src.compression_type, \
|
|
+ ._compressed_size = _src.compressed_size - 1, \
|
|
+ ._uncompressed_size = _src.uncompressed_size - 1, \
|
|
+ .offset = _src.offset
|
|
|
|
switch (type) {
|
|
case BCH_EXTENT_ENTRY_crc32:
|
|
- set_common_fields(dst->crc32, src);
|
|
- dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
|
|
+ dst->crc32 = (struct bch_extent_crc32) {
|
|
+ common_fields(src),
|
|
+ .csum = (u32 __force) *((__le32 *) &src.csum.lo),
|
|
+ };
|
|
break;
|
|
case BCH_EXTENT_ENTRY_crc64:
|
|
- set_common_fields(dst->crc64, src);
|
|
- dst->crc64.nonce = src.nonce;
|
|
- dst->crc64.csum_lo = (u64 __force) src.csum.lo;
|
|
- dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
|
|
+ dst->crc64 = (struct bch_extent_crc64) {
|
|
+ common_fields(src),
|
|
+ .nonce = src.nonce,
|
|
+ .csum_lo = (u64 __force) src.csum.lo,
|
|
+ .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
|
|
+ };
|
|
break;
|
|
case BCH_EXTENT_ENTRY_crc128:
|
|
- set_common_fields(dst->crc128, src);
|
|
- dst->crc128.nonce = src.nonce;
|
|
- dst->crc128.csum = src.csum;
|
|
+ dst->crc128 = (struct bch_extent_crc128) {
|
|
+ common_fields(src),
|
|
+ .nonce = src.nonce,
|
|
+ .csum = src.csum,
|
|
+ };
|
|
break;
|
|
default:
|
|
BUG();
|
|
@@ -991,13 +1121,14 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
|
|
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
|
|
struct bch_extent_ptr *ptr)
|
|
{
|
|
- if (!opts->promote_target ||
|
|
- !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
|
|
+ unsigned target = opts->promote_target ?: opts->foreground_target;
|
|
+
|
|
+ if (target && !bch2_dev_in_target(c, ptr->dev, target))
|
|
return false;
|
|
|
|
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
|
|
|
|
- return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
|
|
+ return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
|
|
}
|
|
|
|
void bch2_extent_ptr_set_cached(struct bch_fs *c,
|
|
@@ -1005,33 +1136,50 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
|
|
struct bkey_s k,
|
|
struct bch_extent_ptr *ptr)
|
|
{
|
|
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
|
|
+ struct bkey_ptrs ptrs;
|
|
union bch_extent_entry *entry;
|
|
struct extent_ptr_decoded p;
|
|
+ bool have_cached_ptr;
|
|
+ unsigned drop_dev = ptr->dev;
|
|
|
|
rcu_read_lock();
|
|
- if (!want_cached_ptr(c, opts, ptr)) {
|
|
- bch2_bkey_drop_ptr_noerror(k, ptr);
|
|
- goto out;
|
|
- }
|
|
+restart_drop_ptrs:
|
|
+ ptrs = bch2_bkey_ptrs(k);
|
|
+ have_cached_ptr = false;
|
|
|
|
- /*
|
|
- * Stripes can't contain cached data, for - reasons.
|
|
- *
|
|
- * Possibly something we can fix in the future?
|
|
- */
|
|
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
- if (&entry->ptr == ptr) {
|
|
- if (p.has_ec)
|
|
- bch2_bkey_drop_ptr_noerror(k, ptr);
|
|
- else
|
|
- ptr->cached = true;
|
|
- goto out;
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ /*
|
|
+ * Check if it's erasure coded - stripes can't contain cached
|
|
+ * data. Possibly something we can fix in the future?
|
|
+ */
|
|
+ if (&entry->ptr == ptr && p.has_ec)
|
|
+ goto drop;
|
|
+
|
|
+ if (p.ptr.cached) {
|
|
+ if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
|
|
+ bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
|
|
+ ptr = NULL;
|
|
+ goto restart_drop_ptrs;
|
|
+ }
|
|
+
|
|
+ have_cached_ptr = true;
|
|
}
|
|
+ }
|
|
+
|
|
+ if (!ptr)
|
|
+ bkey_for_each_ptr(ptrs, ptr2)
|
|
+ if (ptr2->dev == drop_dev)
|
|
+ ptr = ptr2;
|
|
|
|
- BUG();
|
|
-out:
|
|
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
|
|
+ goto drop;
|
|
+
|
|
+ ptr->cached = true;
|
|
+ rcu_read_unlock();
|
|
+ return;
|
|
+drop:
|
|
rcu_read_unlock();
|
|
+ bch2_bkey_drop_ptr_noerror(k, ptr);
|
|
}
|
|
|
|
/*
|
|
@@ -1220,6 +1368,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
|
|
bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
|
|
break;
|
|
|
|
+ case BCH_EXTENT_ENTRY_flags:
|
|
+ prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
|
|
+ break;
|
|
+
|
|
default:
|
|
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
|
|
return;
|
|
@@ -1381,6 +1533,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
#endif
|
|
break;
|
|
}
|
|
+ case BCH_EXTENT_ENTRY_flags:
|
|
+ bkey_fsck_err_on(entry != ptrs.start,
|
|
+ c, extent_flags_not_at_start,
|
|
+ "extent flags entry not at start");
|
|
+ break;
|
|
}
|
|
}
|
|
|
|
@@ -1447,6 +1604,28 @@ void bch2_ptr_swab(struct bkey_s k)
|
|
}
|
|
}
|
|
|
|
+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
|
|
+{
|
|
+ int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
|
|
+
|
|
+ if (ptrs.start != ptrs.end &&
|
|
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
|
|
+ ptrs.start->flags.flags = flags;
|
|
+ } else {
|
|
+ struct bch_extent_flags f = {
|
|
+ .type = BIT(BCH_EXTENT_ENTRY_flags),
|
|
+ .flags = flags,
|
|
+ };
|
|
+ __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/* Generic extent code: */
|
|
|
|
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
|
|
@@ -1492,8 +1671,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
|
|
entry->crc128.offset += sub;
|
|
break;
|
|
case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
- break;
|
|
case BCH_EXTENT_ENTRY_rebalance:
|
|
+ case BCH_EXTENT_ENTRY_flags:
|
|
break;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
|
|
index 204d765dd74c..b8590e51b76e 100644
|
|
--- a/fs/bcachefs/extents.h
|
|
+++ b/fs/bcachefs/extents.h
|
|
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
|
|
({ \
|
|
__label__ out; \
|
|
\
|
|
- (_ptr).idx = 0; \
|
|
- (_ptr).has_ec = false; \
|
|
+ (_ptr).has_ec = false; \
|
|
+ (_ptr).do_ec_reconstruct = false; \
|
|
+ (_ptr).crc_retry_nr = 0; \
|
|
\
|
|
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
|
|
switch (__extent_entry_type(_entry)) { \
|
|
@@ -379,13 +380,6 @@ out: \
|
|
|
|
/* Iterate over pointers in KEY_TYPE_extent: */
|
|
|
|
-#define extent_for_each_entry_from(_e, _entry, _start) \
|
|
- __bkey_extent_entry_for_each_from(_start, \
|
|
- extent_entry_last(_e), _entry)
|
|
-
|
|
-#define extent_for_each_entry(_e, _entry) \
|
|
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
|
|
-
|
|
#define extent_ptr_next(_e, _ptr) \
|
|
__bkey_ptr_next(_ptr, extent_entry_last(_e))
|
|
|
|
@@ -398,13 +392,16 @@ out: \
|
|
|
|
/* utility code common to all keys with pointers: */
|
|
|
|
+void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bch_io_failures *);
|
|
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
|
|
unsigned);
|
|
void bch2_mark_io_failure(struct bch_io_failures *,
|
|
- struct extent_ptr_decoded *);
|
|
+ struct extent_ptr_decoded *, bool);
|
|
+void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
|
|
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
|
|
struct bch_io_failures *,
|
|
- struct extent_ptr_decoded *);
|
|
+ struct extent_ptr_decoded *, int);
|
|
|
|
/* KEY_TYPE_btree_ptr: */
|
|
|
|
@@ -753,4 +750,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
|
|
k->size = new_size;
|
|
}
|
|
|
|
+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
|
|
+{
|
|
+ if (ptrs.start != ptrs.end &&
|
|
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
|
|
+ return ptrs.start->flags.flags;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
|
|
+{
|
|
+ return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
|
|
+}
|
|
+
|
|
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
|
|
+
|
|
#endif /* _BCACHEFS_EXTENTS_H */
|
|
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
|
|
index c198dfc376d6..74c0252cbd98 100644
|
|
--- a/fs/bcachefs/extents_format.h
|
|
+++ b/fs/bcachefs/extents_format.h
|
|
@@ -79,8 +79,9 @@
|
|
x(crc64, 2) \
|
|
x(crc128, 3) \
|
|
x(stripe_ptr, 4) \
|
|
- x(rebalance, 5)
|
|
-#define BCH_EXTENT_ENTRY_MAX 6
|
|
+ x(rebalance, 5) \
|
|
+ x(flags, 6)
|
|
+#define BCH_EXTENT_ENTRY_MAX 7
|
|
|
|
enum bch_extent_entry_type {
|
|
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
|
|
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
|
|
#endif
|
|
};
|
|
|
|
+#define BCH_EXTENT_FLAGS() \
|
|
+ x(poisoned, 0)
|
|
+
|
|
+enum bch_extent_flags_e {
|
|
+#define x(n, v) BCH_EXTENT_FLAG_##n = v,
|
|
+ BCH_EXTENT_FLAGS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+struct bch_extent_flags {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u64 type:7,
|
|
+ flags:57;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u64 flags:57,
|
|
+ type:7;
|
|
+#endif
|
|
+};
|
|
+
|
|
/* bch_extent_rebalance: */
|
|
#include "rebalance_format.h"
|
|
|
|
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
|
|
index 43d6c341ecca..b23ce4a373c0 100644
|
|
--- a/fs/bcachefs/extents_types.h
|
|
+++ b/fs/bcachefs/extents_types.h
|
|
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
|
|
};
|
|
|
|
struct extent_ptr_decoded {
|
|
- unsigned idx;
|
|
bool has_ec;
|
|
+ bool do_ec_reconstruct;
|
|
+ u8 crc_retry_nr;
|
|
struct bch_extent_crc_unpacked crc;
|
|
struct bch_extent_ptr ptr;
|
|
struct bch_extent_stripe_ptr ec;
|
|
@@ -31,10 +32,11 @@ struct bch_io_failures {
|
|
u8 nr;
|
|
struct bch_dev_io_failures {
|
|
u8 dev;
|
|
- u8 idx;
|
|
- u8 nr_failed;
|
|
- u8 nr_retries;
|
|
- } devs[BCH_REPLICAS_MAX];
|
|
+ unsigned failed_csum_nr:6,
|
|
+ failed_io:1,
|
|
+ failed_btree_validate:1,
|
|
+ failed_ec:1;
|
|
+ } devs[BCH_REPLICAS_MAX + 1];
|
|
};
|
|
|
|
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
|
|
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
|
|
index 2eaffe37b5e7..0e742555cb0a 100644
|
|
--- a/fs/bcachefs/eytzinger.c
|
|
+++ b/fs/bcachefs/eytzinger.c
|
|
@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr
|
|
return cmp(a, b, priv);
|
|
}
|
|
|
|
-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
|
|
+static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
|
|
cmp_r_func_t cmp_func, const void *priv,
|
|
size_t l, size_t r)
|
|
{
|
|
- return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
|
|
- base + inorder_to_eytzinger0(r, n) * size,
|
|
+ return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
|
|
+ base1 + inorder_to_eytzinger1(r, n) * size,
|
|
cmp_func, priv);
|
|
}
|
|
|
|
-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
|
|
+static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
|
|
swap_r_func_t swap_func, const void *priv,
|
|
size_t l, size_t r)
|
|
{
|
|
- do_swap(base + inorder_to_eytzinger0(l, n) * size,
|
|
- base + inorder_to_eytzinger0(r, n) * size,
|
|
+ do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
|
|
+ base1 + inorder_to_eytzinger1(r, n) * size,
|
|
size, swap_func, priv);
|
|
}
|
|
|
|
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
|
|
- cmp_r_func_t cmp_func,
|
|
- swap_r_func_t swap_func,
|
|
- const void *priv)
|
|
+static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
|
|
+ cmp_r_func_t cmp_func,
|
|
+ swap_r_func_t swap_func,
|
|
+ const void *priv)
|
|
{
|
|
- int i, j, k;
|
|
+ unsigned i, j, k;
|
|
|
|
/* called from 'sort' without swap function, let's pick the default */
|
|
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
|
|
swap_func = NULL;
|
|
|
|
if (!swap_func) {
|
|
- if (is_aligned(base, size, 8))
|
|
+ if (is_aligned(base1, size, 8))
|
|
swap_func = SWAP_WORDS_64;
|
|
- else if (is_aligned(base, size, 4))
|
|
+ else if (is_aligned(base1, size, 4))
|
|
swap_func = SWAP_WORDS_32;
|
|
else
|
|
swap_func = SWAP_BYTES;
|
|
}
|
|
|
|
/* heapify */
|
|
- for (i = n / 2 - 1; i >= 0; --i) {
|
|
+ for (i = n / 2; i >= 1; --i) {
|
|
/* Find the sift-down path all the way to the leaves. */
|
|
- for (j = i; k = j * 2 + 1, k + 1 < n;)
|
|
- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
|
|
+ for (j = i; k = j * 2, k < n;)
|
|
+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
|
|
|
|
/* Special case for the last leaf with no sibling. */
|
|
- if (j * 2 + 2 == n)
|
|
- j = j * 2 + 1;
|
|
+ if (j * 2 == n)
|
|
+ j *= 2;
|
|
|
|
/* Backtrack to the correct location. */
|
|
- while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
|
|
- j = (j - 1) / 2;
|
|
+ while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
|
|
+ j /= 2;
|
|
|
|
/* Shift the element into its correct place. */
|
|
for (k = j; j != i;) {
|
|
- j = (j - 1) / 2;
|
|
- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
|
|
+ j /= 2;
|
|
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
|
|
}
|
|
}
|
|
|
|
/* sort */
|
|
- for (i = n - 1; i > 0; --i) {
|
|
- eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
|
|
+ for (i = n; i > 1; --i) {
|
|
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
|
|
|
|
/* Find the sift-down path all the way to the leaves. */
|
|
- for (j = 0; k = j * 2 + 1, k + 1 < i;)
|
|
- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
|
|
+ for (j = 1; k = j * 2, k + 1 < i;)
|
|
+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
|
|
|
|
/* Special case for the last leaf with no sibling. */
|
|
- if (j * 2 + 2 == i)
|
|
- j = j * 2 + 1;
|
|
+ if (j * 2 + 1 == i)
|
|
+ j *= 2;
|
|
|
|
/* Backtrack to the correct location. */
|
|
- while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
|
|
- j = (j - 1) / 2;
|
|
+ while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
|
|
+ j /= 2;
|
|
|
|
/* Shift the element into its correct place. */
|
|
- for (k = j; j;) {
|
|
- j = (j - 1) / 2;
|
|
- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
|
|
+ for (k = j; j > 1;) {
|
|
+ j /= 2;
|
|
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
|
|
}
|
|
}
|
|
}
|
|
|
|
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
|
|
+ cmp_r_func_t cmp_func,
|
|
+ swap_r_func_t swap_func,
|
|
+ const void *priv)
|
|
+{
|
|
+ void *base1 = base - size;
|
|
+
|
|
+ return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
|
|
+}
|
|
+
|
|
void eytzinger0_sort(void *base, size_t n, size_t size,
|
|
cmp_func_t cmp_func,
|
|
swap_func_t swap_func)
|
|
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
|
|
index 0541192d7bc0..643c1f716061 100644
|
|
--- a/fs/bcachefs/eytzinger.h
|
|
+++ b/fs/bcachefs/eytzinger.h
|
|
@@ -6,6 +6,7 @@
|
|
#include <linux/log2.h>
|
|
|
|
#ifdef EYTZINGER_DEBUG
|
|
+#include <linux/bug.h>
|
|
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
|
|
#else
|
|
#define EYTZINGER_BUG_ON(cond)
|
|
@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size)
|
|
return rounddown_pow_of_two(size + 1) - 1;
|
|
}
|
|
|
|
-/*
|
|
- * eytzinger1_next() and eytzinger1_prev() have the nice properties that
|
|
- *
|
|
- * eytzinger1_next(0) == eytzinger1_first())
|
|
- * eytzinger1_prev(0) == eytzinger1_last())
|
|
- *
|
|
- * eytzinger1_prev(eytzinger1_first()) == 0
|
|
- * eytzinger1_next(eytzinger1_last()) == 0
|
|
- */
|
|
-
|
|
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
|
|
{
|
|
- EYTZINGER_BUG_ON(i > size);
|
|
+ EYTZINGER_BUG_ON(i == 0 || i > size);
|
|
|
|
if (eytzinger1_right_child(i) <= size) {
|
|
i = eytzinger1_right_child(i);
|
|
|
|
- i <<= __fls(size + 1) - __fls(i);
|
|
+ i <<= __fls(size) - __fls(i);
|
|
i >>= i > size;
|
|
} else {
|
|
i >>= ffz(i) + 1;
|
|
@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
|
|
|
|
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
|
|
{
|
|
- EYTZINGER_BUG_ON(i > size);
|
|
+ EYTZINGER_BUG_ON(i == 0 || i > size);
|
|
|
|
if (eytzinger1_left_child(i) <= size) {
|
|
i = eytzinger1_left_child(i) + 1;
|
|
|
|
- i <<= __fls(size + 1) - __fls(i);
|
|
+ i <<= __fls(size) - __fls(i);
|
|
i -= 1;
|
|
i >>= i > size;
|
|
} else {
|
|
@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
|
|
(_i) != -1; \
|
|
(_i) = eytzinger0_next((_i), (_size)))
|
|
|
|
+#define eytzinger0_for_each_prev(_i, _size) \
|
|
+ for (unsigned (_i) = eytzinger0_last((_size)); \
|
|
+ (_i) != -1; \
|
|
+ (_i) = eytzinger0_prev((_i), (_size)))
|
|
+
|
|
/* return greatest node <= @search, or -1 if not found */
|
|
static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
|
|
cmp_func_t cmp, const void *search)
|
|
{
|
|
- unsigned i, n = 0;
|
|
-
|
|
- if (!nr)
|
|
- return -1;
|
|
-
|
|
- do {
|
|
- i = n;
|
|
- n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
|
|
- } while (n < nr);
|
|
-
|
|
- if (n & 1) {
|
|
- /*
|
|
- * @i was greater than @search, return previous node:
|
|
- *
|
|
- * if @i was leftmost/smallest element,
|
|
- * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
|
|
- */
|
|
- return eytzinger0_prev(i, nr);
|
|
- } else {
|
|
- return i;
|
|
- }
|
|
+ void *base1 = base - size;
|
|
+ unsigned n = 1;
|
|
+
|
|
+ while (n <= nr)
|
|
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
|
|
+ n >>= __ffs(n) + 1;
|
|
+ return n - 1;
|
|
}
|
|
|
|
+/* return smallest node > @search, or -1 if not found */
|
|
static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
|
|
cmp_func_t cmp, const void *search)
|
|
{
|
|
- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
|
|
+ void *base1 = base - size;
|
|
+ unsigned n = 1;
|
|
|
|
- /*
|
|
- * if eytitzinger0_find_le() returned -1 - no element was <= search - we
|
|
- * want to return the first element; next/prev identities mean this work
|
|
- * as expected
|
|
- *
|
|
- * similarly if find_le() returns last element, we should return -1;
|
|
- * identities mean this all works out:
|
|
- */
|
|
- return eytzinger0_next(idx, nr);
|
|
+ while (n <= nr)
|
|
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
|
|
+ n >>= __ffs(n + 1) + 1;
|
|
+ return n - 1;
|
|
}
|
|
|
|
+/* return smallest node >= @search, or -1 if not found */
|
|
static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
|
|
cmp_func_t cmp, const void *search)
|
|
{
|
|
- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
|
|
-
|
|
- if (idx < nr && !cmp(base + idx * size, search))
|
|
- return idx;
|
|
+ void *base1 = base - size;
|
|
+ unsigned n = 1;
|
|
|
|
- return eytzinger0_next(idx, nr);
|
|
+ while (n <= nr)
|
|
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
|
|
+ n >>= __ffs(n + 1) + 1;
|
|
+ return n - 1;
|
|
}
|
|
|
|
#define eytzinger0_find(base, nr, size, _cmp, search) \
|
|
({ \
|
|
- void *_base = (base); \
|
|
+ size_t _size = (size); \
|
|
+ void *_base1 = (void *)(base) - _size; \
|
|
const void *_search = (search); \
|
|
size_t _nr = (nr); \
|
|
- size_t _size = (size); \
|
|
- size_t _i = 0; \
|
|
+ size_t _i = 1; \
|
|
int _res; \
|
|
\
|
|
- while (_i < _nr && \
|
|
- (_res = _cmp(_search, _base + _i * _size))) \
|
|
- _i = eytzinger0_child(_i, _res > 0); \
|
|
- _i; \
|
|
+ while (_i <= _nr && \
|
|
+ (_res = _cmp(_search, _base1 + _i * _size))) \
|
|
+ _i = eytzinger1_child(_i, _res > 0); \
|
|
+ _i - 1; \
|
|
})
|
|
|
|
void eytzinger0_sort_r(void *, size_t, size_t,
|
|
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
|
|
new file mode 100644
|
|
index 000000000000..2faec143eb31
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fast_list.c
|
|
@@ -0,0 +1,156 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+/*
|
|
+ * Fast, unordered lists
|
|
+ *
|
|
+ * Supports add, remove, and iterate
|
|
+ *
|
|
+ * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot
|
|
+ * allocation and freeing.
|
|
+ *
|
|
+ * This means that adding, removing, and iterating over items is lockless,
|
|
+ * except when refilling/emptying the percpu slot buffers.
|
|
+ */
|
|
+
|
|
+#include "fast_list.h"
|
|
+
|
|
+struct fast_list_pcpu {
|
|
+ u32 nr;
|
|
+ u32 entries[31];
|
|
+};
|
|
+
|
|
+static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp)
|
|
+{
|
|
+ int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp);
|
|
+ if (unlikely(idx < 0))
|
|
+ return 0;
|
|
+
|
|
+ if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) {
|
|
+ ida_free(&l->slots_allocated, idx);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ return idx;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * fast_list_get_idx - get a slot in a fast_list
|
|
+ * @l: list to get slot in
|
|
+ *
|
|
+ * This allocates a slot in the radix tree without storing to it, so that we can
|
|
+ * take the potential memory allocation failure early and do the list add later
|
|
+ * when we can't take an allocation failure.
|
|
+ *
|
|
+ * Returns: positive integer on success, -ENOMEM on failure
|
|
+ */
|
|
+int fast_list_get_idx(struct fast_list *l)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ int idx;
|
|
+retry:
|
|
+ local_irq_save(flags);
|
|
+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
|
|
+
|
|
+ if (unlikely(!lp->nr)) {
|
|
+ u32 entries[16], nr = 0;
|
|
+
|
|
+ local_irq_restore(flags);
|
|
+ while (nr < ARRAY_SIZE(entries) &&
|
|
+ (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
|
|
+ entries[nr++] = idx;
|
|
+ local_irq_save(flags);
|
|
+
|
|
+ lp = this_cpu_ptr(l->buffer);
|
|
+
|
|
+ while (nr && lp->nr < ARRAY_SIZE(lp->entries))
|
|
+ lp->entries[lp->nr++] = entries[--nr];
|
|
+
|
|
+ if (unlikely(nr)) {
|
|
+ local_irq_restore(flags);
|
|
+ while (nr)
|
|
+ ida_free(&l->slots_allocated, entries[--nr]);
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ if (unlikely(!lp->nr)) {
|
|
+ local_irq_restore(flags);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ idx = lp->entries[--lp->nr];
|
|
+ local_irq_restore(flags);
|
|
+
|
|
+ return idx;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * fast_list_add - add an item to a fast_list
|
|
+ * @l: list
|
|
+ * @item: item to add
|
|
+ *
|
|
+ * Allocates a slot in the radix tree and stores to it and then returns the
|
|
+ * slot index, which must be passed to fast_list_remove().
|
|
+ *
|
|
+ * Returns: positive integer on success, -ENOMEM on failure
|
|
+ */
|
|
+int fast_list_add(struct fast_list *l, void *item)
|
|
+{
|
|
+ int idx = fast_list_get_idx(l);
|
|
+ if (idx < 0)
|
|
+ return idx;
|
|
+
|
|
+ *genradix_ptr_inlined(&l->items, idx) = item;
|
|
+ return idx;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * fast_list_remove - remove an item from a fast_list
|
|
+ * @l: list
|
|
+ * @idx: item's slot index
|
|
+ *
|
|
+ * Zeroes out the slot in the radix tree and frees the slot for future
|
|
+ * fast_list_add() operations.
|
|
+ */
|
|
+void fast_list_remove(struct fast_list *l, unsigned idx)
|
|
+{
|
|
+ u32 entries[16], nr = 0;
|
|
+ unsigned long flags;
|
|
+
|
|
+ if (!idx)
|
|
+ return;
|
|
+
|
|
+ *genradix_ptr_inlined(&l->items, idx) = NULL;
|
|
+
|
|
+ local_irq_save(flags);
|
|
+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
|
|
+
|
|
+ if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
|
|
+ while (nr < ARRAY_SIZE(entries))
|
|
+ entries[nr++] = lp->entries[--lp->nr];
|
|
+
|
|
+ lp->entries[lp->nr++] = idx;
|
|
+ local_irq_restore(flags);
|
|
+
|
|
+ if (unlikely(nr))
|
|
+ while (nr)
|
|
+ ida_free(&l->slots_allocated, entries[--nr]);
|
|
+}
|
|
+
|
|
+void fast_list_exit(struct fast_list *l)
|
|
+{
|
|
+ /* XXX: warn if list isn't empty */
|
|
+ free_percpu(l->buffer);
|
|
+ ida_destroy(&l->slots_allocated);
|
|
+ genradix_free(&l->items);
|
|
+}
|
|
+
|
|
+int fast_list_init(struct fast_list *l)
|
|
+{
|
|
+ genradix_init(&l->items);
|
|
+ ida_init(&l->slots_allocated);
|
|
+ l->buffer = alloc_percpu(*l->buffer);
|
|
+ if (!l->buffer)
|
|
+ return -ENOMEM;
|
|
+ return 0;
|
|
+}
|
|
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
|
|
new file mode 100644
|
|
index 000000000000..73c9bf591fd6
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fast_list.h
|
|
@@ -0,0 +1,41 @@
|
|
+#ifndef _LINUX_FAST_LIST_H
|
|
+#define _LINUX_FAST_LIST_H
|
|
+
|
|
+#include <linux/generic-radix-tree.h>
|
|
+#include <linux/idr.h>
|
|
+#include <linux/percpu.h>
|
|
+
|
|
+struct fast_list_pcpu;
|
|
+
|
|
+struct fast_list {
|
|
+ GENRADIX(void *) items;
|
|
+ struct ida slots_allocated;
|
|
+ struct fast_list_pcpu __percpu
|
|
+ *buffer;
|
|
+};
|
|
+
|
|
+static inline void *fast_list_iter_peek(struct genradix_iter *iter,
|
|
+ struct fast_list *list)
|
|
+{
|
|
+ void **p;
|
|
+ while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
|
|
+ genradix_iter_advance(iter, &list->items);
|
|
+
|
|
+ return p ? *p : NULL;
|
|
+}
|
|
+
|
|
+#define fast_list_for_each_from(_list, _iter, _i, _start) \
|
|
+ for (_iter = genradix_iter_init(&(_list)->items, _start); \
|
|
+ (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \
|
|
+ genradix_iter_advance(&(_iter), &(_list)->items))
|
|
+
|
|
+#define fast_list_for_each(_list, _iter, _i) \
|
|
+ fast_list_for_each_from(_list, _iter, _i, 0)
|
|
+
|
|
+int fast_list_get_idx(struct fast_list *l);
|
|
+int fast_list_add(struct fast_list *l, void *item);
|
|
+void fast_list_remove(struct fast_list *l, unsigned idx);
|
|
+void fast_list_exit(struct fast_list *l);
|
|
+int fast_list_init(struct fast_list *l);
|
|
+
|
|
+#endif /* _LINUX_FAST_LIST_H */
|
|
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
|
|
index ab1d5db2fa56..e3a75dcca60c 100644
|
|
--- a/fs/bcachefs/fs-io-buffered.c
|
|
+++ b/fs/bcachefs/fs-io-buffered.c
|
|
@@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans,
|
|
if (!get_more)
|
|
break;
|
|
|
|
+ unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
|
|
+
|
|
+ if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
|
|
+ break;
|
|
+
|
|
+ unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
|
|
+
|
|
+ /* ensure proper alignment */
|
|
+ order = min(order, __ffs(folio_offset|BIT(31)));
|
|
+
|
|
folio = xa_load(&iter->mapping->i_pages, folio_offset);
|
|
if (folio && !xa_is_value(folio))
|
|
break;
|
|
|
|
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
|
|
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
|
|
if (!folio)
|
|
break;
|
|
|
|
@@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans,
|
|
struct bch_fs *c = trans->c;
|
|
struct btree_iter iter;
|
|
struct bkey_buf sk;
|
|
- int flags = BCH_READ_RETRY_IF_STALE|
|
|
- BCH_READ_MAY_PROMOTE;
|
|
+ int flags = BCH_READ_retry_if_stale|
|
|
+ BCH_READ_may_promote;
|
|
int ret = 0;
|
|
|
|
- rbio->c = c;
|
|
- rbio->start_time = local_clock();
|
|
rbio->subvol = inum.subvol;
|
|
|
|
bch2_bkey_buf_init(&sk);
|
|
@@ -175,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
|
|
|
|
- bch2_btree_iter_set_pos(&iter,
|
|
+ bch2_btree_iter_set_pos(trans, &iter,
|
|
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
|
|
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -211,14 +219,29 @@ static void bchfs_read(struct btree_trans *trans,
|
|
swap(rbio->bio.bi_iter.bi_size, bytes);
|
|
|
|
if (rbio->bio.bi_iter.bi_size == bytes)
|
|
- flags |= BCH_READ_LAST_FRAGMENT;
|
|
+ flags |= BCH_READ_last_fragment;
|
|
|
|
bch2_bio_page_state_set(&rbio->bio, k);
|
|
|
|
bch2_read_extent(trans, rbio, iter.pos,
|
|
data_btree, k, offset_into_extent, flags);
|
|
+ /*
|
|
+ * Careful there's a landmine here if bch2_read_extent() ever
|
|
+ * starts returning transaction restarts here.
|
|
+ *
|
|
+ * We've changed rbio->bi_iter.bi_size to be "bytes we can read
|
|
+ * from this extent" with the swap call, and we restore it
|
|
+ * below. That restore needs to come before checking for
|
|
+ * errors.
|
|
+ *
|
|
+ * But unlike __bch2_read(), we use the rbio bvec iter, not one
|
|
+ * on the stack, so we can't do the restore right after the
|
|
+ * bch2_read_extent() call: we don't own that iterator anymore
|
|
+ * if BCH_READ_last_fragment is set, since we may have submitted
|
|
+ * that rbio instead of cloning it.
|
|
+ */
|
|
|
|
- if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ if (flags & BCH_READ_last_fragment)
|
|
break;
|
|
|
|
swap(rbio->bio.bi_iter.bi_size, bytes);
|
|
@@ -232,7 +255,8 @@ static void bchfs_read(struct btree_trans *trans,
|
|
|
|
if (ret) {
|
|
struct printbuf buf = PRINTBUF;
|
|
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
|
|
+ lockrestart_do(trans,
|
|
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
|
|
prt_printf(&buf, "read error %i from btree lookup", ret);
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
@@ -280,12 +304,13 @@ void bch2_readahead(struct readahead_control *ractl)
|
|
struct bch_read_bio *rbio =
|
|
rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
|
|
GFP_KERNEL, &c->bio_read),
|
|
- opts);
|
|
+ c,
|
|
+ opts,
|
|
+ bch2_readpages_end_io);
|
|
|
|
readpage_iter_advance(&readpages_iter);
|
|
|
|
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
|
|
- rbio->bio.bi_end_io = bch2_readpages_end_io;
|
|
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
|
|
|
|
bchfs_read(trans, rbio, inode_inum(inode),
|
|
@@ -323,10 +348,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
|
|
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
|
|
|
|
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
|
|
- opts);
|
|
+ c,
|
|
+ opts,
|
|
+ bch2_read_single_folio_end_io);
|
|
rbio->bio.bi_private = &done;
|
|
- rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
|
|
-
|
|
rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
|
|
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
|
|
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
|
|
@@ -420,7 +445,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
|
|
}
|
|
}
|
|
|
|
- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
|
|
+ if (io->op.flags & BCH_WRITE_wrote_data_inline) {
|
|
bio_for_each_folio_all(fi, bio) {
|
|
struct bch_folio *s;
|
|
|
|
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
|
|
index 2089c36b5866..1f5154d9676b 100644
|
|
--- a/fs/bcachefs/fs-io-direct.c
|
|
+++ b/fs/bcachefs/fs-io-direct.c
|
|
@@ -3,6 +3,7 @@
|
|
|
|
#include "bcachefs.h"
|
|
#include "alloc_foreground.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "fs.h"
|
|
#include "fs-io.h"
|
|
#include "fs-io-direct.h"
|
|
@@ -73,6 +74,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
|
|
struct blk_plug plug;
|
|
loff_t offset = req->ki_pos;
|
|
bool sync = is_sync_kiocb(req);
|
|
+ bool split = false;
|
|
size_t shorten;
|
|
ssize_t ret;
|
|
|
|
@@ -99,8 +101,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
|
|
GFP_KERNEL,
|
|
&c->dio_read_bioset);
|
|
|
|
- bio->bi_end_io = bch2_direct_IO_read_endio;
|
|
-
|
|
dio = container_of(bio, struct dio_read, rbio.bio);
|
|
closure_init(&dio->cl, NULL);
|
|
|
|
@@ -133,12 +133,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
|
|
|
|
goto start;
|
|
while (iter->count) {
|
|
+ split = true;
|
|
+
|
|
bio = bio_alloc_bioset(NULL,
|
|
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
|
|
REQ_OP_READ,
|
|
GFP_KERNEL,
|
|
&c->bio_read);
|
|
- bio->bi_end_io = bch2_direct_IO_read_split_endio;
|
|
start:
|
|
bio->bi_opf = REQ_OP_READ|REQ_SYNC;
|
|
bio->bi_iter.bi_sector = offset >> 9;
|
|
@@ -160,7 +161,15 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
|
|
if (iter->count)
|
|
closure_get(&dio->cl);
|
|
|
|
- bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
|
|
+ struct bch_read_bio *rbio =
|
|
+ rbio_init(bio,
|
|
+ c,
|
|
+ opts,
|
|
+ split
|
|
+ ? bch2_direct_IO_read_split_endio
|
|
+ : bch2_direct_IO_read_endio);
|
|
+
|
|
+ bch2_read(c, rbio, inode_inum(inode));
|
|
}
|
|
|
|
blk_finish_plug(&plug);
|
|
@@ -393,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
|
|
ret = dio->op.error ?: ((long) dio->written << 9);
|
|
bio_put(&dio->op.wbio.bio);
|
|
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
|
|
|
|
/* inode->i_dio_count is our ref on inode and thus bch_fs */
|
|
inode_dio_end(&inode->v);
|
|
@@ -511,8 +520,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
|
|
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
|
|
|
|
if (sync)
|
|
- dio->op.flags |= BCH_WRITE_SYNC;
|
|
- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
|
|
+ dio->op.flags |= BCH_WRITE_sync;
|
|
+ dio->op.flags |= BCH_WRITE_check_enospc;
|
|
|
|
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
|
|
bio_sectors(bio), true);
|
|
@@ -598,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
|
|
prefetch(&inode->ei_inode);
|
|
prefetch((void *) &inode->ei_inode + 64);
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
|
|
return -EROFS;
|
|
|
|
inode_lock(&inode->v);
|
|
@@ -667,7 +676,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
|
|
bio_put(bio);
|
|
inode_dio_end(&inode->v);
|
|
err_put_write_ref:
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
|
|
goto out;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
|
|
index e072900e6a5b..fbae9c1de746 100644
|
|
--- a/fs/bcachefs/fs-io-pagecache.c
|
|
+++ b/fs/bcachefs/fs-io-pagecache.c
|
|
@@ -605,10 +605,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
|
|
struct address_space *mapping = file->f_mapping;
|
|
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
struct bch2_folio_reservation res;
|
|
- unsigned len;
|
|
- loff_t isize;
|
|
vm_fault_t ret;
|
|
|
|
+ loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
|
|
+ unsigned offset = file_offset - folio_pos(folio);
|
|
+ unsigned len = max(PAGE_SIZE, block_bytes(c));
|
|
+
|
|
+ BUG_ON(offset + len > folio_size(folio));
|
|
+
|
|
bch2_folio_reservation_init(c, inode, &res);
|
|
|
|
sb_start_pagefault(inode->v.i_sb);
|
|
@@ -623,24 +627,24 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
|
|
bch2_pagecache_add_get(inode);
|
|
|
|
folio_lock(folio);
|
|
- isize = i_size_read(&inode->v);
|
|
+ u64 isize = i_size_read(&inode->v);
|
|
|
|
- if (folio->mapping != mapping || folio_pos(folio) >= isize) {
|
|
+ if (folio->mapping != mapping || file_offset >= isize) {
|
|
folio_unlock(folio);
|
|
ret = VM_FAULT_NOPAGE;
|
|
goto out;
|
|
}
|
|
|
|
- len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
|
|
+ len = min_t(unsigned, len, isize - file_offset);
|
|
|
|
if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
|
|
- bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
|
|
+ bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) {
|
|
folio_unlock(folio);
|
|
ret = VM_FAULT_SIGBUS;
|
|
goto out;
|
|
}
|
|
|
|
- bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
|
|
+ bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
|
|
bch2_folio_reservation_put(c, inode, &res);
|
|
|
|
folio_wait_stable(folio);
|
|
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
|
|
index 717e7b94c66f..b1e9ee28fc0f 100644
|
|
--- a/fs/bcachefs/fs-io.c
|
|
+++ b/fs/bcachefs/fs-io.c
|
|
@@ -7,6 +7,7 @@
|
|
#include "btree_update.h"
|
|
#include "buckets.h"
|
|
#include "clock.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
#include "extent_update.h"
|
|
@@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio)
|
|
struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
|
|
|
|
closure_put(bio->cl);
|
|
- percpu_ref_put(&bio->ca->io_ref);
|
|
+ enumerated_ref_put(&bio->ca->io_ref[WRITE],
|
|
+ BCH_DEV_WRITE_REF_nocow_flush);
|
|
bio_put(&bio->bio);
|
|
}
|
|
|
|
@@ -71,7 +73,8 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
|
|
for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
|
|
rcu_read_lock();
|
|
ca = rcu_dereference(c->devs[dev]);
|
|
- if (ca && !percpu_ref_tryget(&ca->io_ref))
|
|
+ if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
|
|
+ BCH_DEV_WRITE_REF_nocow_flush))
|
|
ca = NULL;
|
|
rcu_read_unlock();
|
|
|
|
@@ -144,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c,
|
|
void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
|
|
struct quota_res *quota_res, s64 sectors)
|
|
{
|
|
- bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
|
|
- "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
|
|
- inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
|
|
- inode->ei_inode.bi_sectors);
|
|
+ if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
|
|
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
|
|
+ inode->ei_inode.bi_sectors);
|
|
+
|
|
+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
|
|
+ if (print)
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+
|
|
+ if (sectors < 0)
|
|
+ sectors = -inode->v.i_blocks;
|
|
+ else
|
|
+ sectors = 0;
|
|
+ }
|
|
+
|
|
inode->v.i_blocks += sectors;
|
|
|
|
#ifdef CONFIG_BCACHEFS_QUOTA
|
|
@@ -205,7 +222,7 @@ static int bch2_flush_inode(struct bch_fs *c,
|
|
if (c->opts.journal_flush_disabled)
|
|
return 0;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
|
|
return -EROFS;
|
|
|
|
u64 seq;
|
|
@@ -213,7 +230,7 @@ static int bch2_flush_inode(struct bch_fs *c,
|
|
bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
|
|
bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
|
|
bch2_inode_flush_nocow_writes(c, inode);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
|
|
return ret;
|
|
}
|
|
|
|
@@ -502,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap,
|
|
goto err;
|
|
}
|
|
|
|
- bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
|
|
- !bch2_journal_error(&c->journal), c,
|
|
- "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
|
|
- inode->v.i_ino, (u64) inode->v.i_blocks,
|
|
- inode->ei_inode.bi_sectors);
|
|
+ if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
|
|
+ !bch2_journal_error(&c->journal))) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf,
|
|
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
|
|
+ inode->v.i_ino, (u64) inode->v.i_blocks,
|
|
+ inode->ei_inode.bi_sectors);
|
|
+
|
|
+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
|
|
+ if (print)
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
|
|
ret = bch2_setattr_nonsize(idmap, inode, iattr);
|
|
err:
|
|
@@ -636,9 +662,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
|
|
if (ret)
|
|
goto bkey_err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
|
|
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
if ((ret = bkey_err(k)))
|
|
goto bkey_err;
|
|
|
|
@@ -649,13 +675,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
|
|
/* already reserved */
|
|
if (bkey_extent_is_reservation(k) &&
|
|
bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
continue;
|
|
}
|
|
|
|
if (bkey_extent_is_data(k.k) &&
|
|
!(mode & FALLOC_FL_ZERO_RANGE)) {
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
continue;
|
|
}
|
|
|
|
@@ -676,7 +702,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
|
|
if (ret)
|
|
goto bkey_err;
|
|
}
|
|
- bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
|
|
|
|
if (ret)
|
|
goto bkey_err;
|
|
@@ -795,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
|
|
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
long ret;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
|
|
return -EROFS;
|
|
|
|
inode_lock(&inode->v);
|
|
@@ -819,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
|
|
err:
|
|
bch2_pagecache_block_put(inode);
|
|
inode_unlock(&inode->v);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
|
|
|
|
return bch2_err_class(ret);
|
|
}
|
|
@@ -999,17 +1025,28 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
|
|
POS(inode->v.i_ino, offset >> 9),
|
|
POS(inode->v.i_ino, U64_MAX),
|
|
inum.subvol, BTREE_ITER_slots, k, ({
|
|
- if (k.k->p.inode != inode->v.i_ino) {
|
|
- next_hole = bch2_seek_pagecache_hole(&inode->v,
|
|
- offset, MAX_LFS_FILESIZE, 0, false);
|
|
- break;
|
|
- } else if (!bkey_extent_is_data(k.k)) {
|
|
- next_hole = bch2_seek_pagecache_hole(&inode->v,
|
|
- max(offset, bkey_start_offset(k.k) << 9),
|
|
- k.k->p.offset << 9, 0, false);
|
|
-
|
|
- if (next_hole < k.k->p.offset << 9)
|
|
+ if (k.k->p.inode != inode->v.i_ino ||
|
|
+ !bkey_extent_is_data(k.k)) {
|
|
+ loff_t start_offset = k.k->p.inode == inode->v.i_ino
|
|
+ ? max(offset, bkey_start_offset(k.k) << 9)
|
|
+ : offset;
|
|
+ loff_t end_offset = k.k->p.inode == inode->v.i_ino
|
|
+ ? MAX_LFS_FILESIZE
|
|
+ : k.k->p.offset << 9;
|
|
+
|
|
+ /*
|
|
+ * Found a hole in the btree, now make sure it's
|
|
+ * a hole in the pagecache. We might have to
|
|
+ * keep searching if this hole is entirely dirty
|
|
+ * in the page cache:
|
|
+ */
|
|
+ bch2_trans_unlock(trans);
|
|
+ loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
|
|
+ start_offset, end_offset, 0, false);
|
|
+ if (pagecache_hole < end_offset) {
|
|
+ next_hole = pagecache_hole;
|
|
break;
|
|
+ }
|
|
} else {
|
|
offset = max(offset, bkey_start_offset(k.k) << 9);
|
|
}
|
|
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
|
|
index 15725b4ce393..0e99d940a320 100644
|
|
--- a/fs/bcachefs/fs-ioctl.c
|
|
+++ b/fs/bcachefs/fs-ioctl.c
|
|
@@ -5,8 +5,8 @@
|
|
#include "chardev.h"
|
|
#include "dirent.h"
|
|
#include "fs.h"
|
|
-#include "fs-common.h"
|
|
#include "fs-ioctl.h"
|
|
+#include "namei.h"
|
|
#include "quota.h"
|
|
|
|
#include <linux/compat.h>
|
|
@@ -21,180 +21,6 @@
|
|
#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
|
|
#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
|
|
|
|
-struct flags_set {
|
|
- unsigned mask;
|
|
- unsigned flags;
|
|
-
|
|
- unsigned projid;
|
|
-
|
|
- bool set_projinherit;
|
|
- bool projinherit;
|
|
-};
|
|
-
|
|
-static int bch2_inode_flags_set(struct btree_trans *trans,
|
|
- struct bch_inode_info *inode,
|
|
- struct bch_inode_unpacked *bi,
|
|
- void *p)
|
|
-{
|
|
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
- /*
|
|
- * We're relying on btree locking here for exclusion with other ioctl
|
|
- * calls - use the flags in the btree (@bi), not inode->i_flags:
|
|
- */
|
|
- struct flags_set *s = p;
|
|
- unsigned newflags = s->flags;
|
|
- unsigned oldflags = bi->bi_flags & s->mask;
|
|
-
|
|
- if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
|
|
- !capable(CAP_LINUX_IMMUTABLE))
|
|
- return -EPERM;
|
|
-
|
|
- if (!S_ISREG(bi->bi_mode) &&
|
|
- !S_ISDIR(bi->bi_mode) &&
|
|
- (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
|
|
- return -EINVAL;
|
|
-
|
|
- if (s->set_projinherit) {
|
|
- bi->bi_fields_set &= ~(1 << Inode_opt_project);
|
|
- bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
|
|
- }
|
|
-
|
|
- bi->bi_flags &= ~s->mask;
|
|
- bi->bi_flags |= newflags;
|
|
-
|
|
- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
|
|
-{
|
|
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
|
|
-
|
|
- return put_user(flags, arg);
|
|
-}
|
|
-
|
|
-static int bch2_ioc_setflags(struct bch_fs *c,
|
|
- struct file *file,
|
|
- struct bch_inode_info *inode,
|
|
- void __user *arg)
|
|
-{
|
|
- struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
|
|
- unsigned uflags;
|
|
- int ret;
|
|
-
|
|
- if (get_user(uflags, (int __user *) arg))
|
|
- return -EFAULT;
|
|
-
|
|
- s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
|
|
- if (uflags)
|
|
- return -EOPNOTSUPP;
|
|
-
|
|
- ret = mnt_want_write_file(file);
|
|
- if (ret)
|
|
- return ret;
|
|
-
|
|
- inode_lock(&inode->v);
|
|
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
|
|
- ret = -EACCES;
|
|
- goto setflags_out;
|
|
- }
|
|
-
|
|
- mutex_lock(&inode->ei_update_lock);
|
|
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
|
|
- bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
|
|
- ATTR_CTIME);
|
|
- mutex_unlock(&inode->ei_update_lock);
|
|
-
|
|
-setflags_out:
|
|
- inode_unlock(&inode->v);
|
|
- mnt_drop_write_file(file);
|
|
- return ret;
|
|
-}
|
|
-
|
|
-static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
|
|
- struct fsxattr __user *arg)
|
|
-{
|
|
- struct fsxattr fa = { 0 };
|
|
-
|
|
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
|
|
-
|
|
- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
|
|
- fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
|
|
-
|
|
- fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
|
|
-
|
|
- if (copy_to_user(arg, &fa, sizeof(fa)))
|
|
- return -EFAULT;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
|
|
- struct bch_inode_info *inode,
|
|
- struct bch_inode_unpacked *bi,
|
|
- void *p)
|
|
-{
|
|
- struct flags_set *s = p;
|
|
-
|
|
- if (s->projid != bi->bi_project) {
|
|
- bi->bi_fields_set |= 1U << Inode_opt_project;
|
|
- bi->bi_project = s->projid;
|
|
- }
|
|
-
|
|
- return bch2_inode_flags_set(trans, inode, bi, p);
|
|
-}
|
|
-
|
|
-static int bch2_ioc_fssetxattr(struct bch_fs *c,
|
|
- struct file *file,
|
|
- struct bch_inode_info *inode,
|
|
- struct fsxattr __user *arg)
|
|
-{
|
|
- struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
|
|
- struct fsxattr fa;
|
|
- int ret;
|
|
-
|
|
- if (copy_from_user(&fa, arg, sizeof(fa)))
|
|
- return -EFAULT;
|
|
-
|
|
- s.set_projinherit = true;
|
|
- s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
|
|
- fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
|
|
-
|
|
- s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
|
|
- if (fa.fsx_xflags)
|
|
- return -EOPNOTSUPP;
|
|
-
|
|
- if (fa.fsx_projid >= U32_MAX)
|
|
- return -EINVAL;
|
|
-
|
|
- /*
|
|
- * inode fields accessible via the xattr interface are stored with a +1
|
|
- * bias, so that 0 means unset:
|
|
- */
|
|
- s.projid = fa.fsx_projid + 1;
|
|
-
|
|
- ret = mnt_want_write_file(file);
|
|
- if (ret)
|
|
- return ret;
|
|
-
|
|
- inode_lock(&inode->v);
|
|
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
|
|
- ret = -EACCES;
|
|
- goto err;
|
|
- }
|
|
-
|
|
- mutex_lock(&inode->ei_update_lock);
|
|
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
|
|
- bch2_set_projid(c, inode, fa.fsx_projid) ?:
|
|
- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
|
|
- ATTR_CTIME);
|
|
- mutex_unlock(&inode->ei_update_lock);
|
|
-err:
|
|
- inode_unlock(&inode->v);
|
|
- mnt_drop_write_file(file);
|
|
- return ret;
|
|
-}
|
|
-
|
|
static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
|
|
struct bch_inode_info *inode,
|
|
struct bch_inode_unpacked *bi,
|
|
@@ -218,7 +44,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
|
|
int ret = 0;
|
|
subvol_inum inum;
|
|
|
|
- kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
|
|
+ kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
|
|
if (!kname)
|
|
return -ENOMEM;
|
|
|
|
@@ -346,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
|
|
if (get_user(flags, arg))
|
|
return -EFAULT;
|
|
|
|
- bch_notice(c, "shutdown by ioctl type %u", flags);
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "shutdown by ioctl type %u", flags);
|
|
|
|
switch (flags) {
|
|
case FSOP_GOING_FLAGS_DEFAULT:
|
|
@@ -354,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
|
|
if (ret)
|
|
break;
|
|
bch2_journal_flush(&c->journal);
|
|
- bch2_fs_emergency_read_only(c);
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
bdev_thaw(c->vfs_sb->s_bdev);
|
|
break;
|
|
case FSOP_GOING_FLAGS_LOGFLUSH:
|
|
bch2_journal_flush(&c->journal);
|
|
fallthrough;
|
|
case FSOP_GOING_FLAGS_NOLOGFLUSH:
|
|
- bch2_fs_emergency_read_only(c);
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
- break;
|
|
+ goto noprint;
|
|
}
|
|
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+noprint:
|
|
+ printbuf_exit(&buf);
|
|
return ret;
|
|
}
|
|
|
|
@@ -515,10 +347,12 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
|
|
ret = -ENOENT;
|
|
goto err;
|
|
}
|
|
- ret = __bch2_unlink(dir, victim, true);
|
|
+
|
|
+ ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?:
|
|
+ __bch2_unlink(dir, victim, true);
|
|
if (!ret) {
|
|
fsnotify_rmdir(dir, victim);
|
|
- d_delete(victim);
|
|
+ d_invalidate(victim);
|
|
}
|
|
err:
|
|
inode_unlock(dir);
|
|
@@ -534,23 +368,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
|
long ret;
|
|
|
|
switch (cmd) {
|
|
- case FS_IOC_GETFLAGS:
|
|
- ret = bch2_ioc_getflags(inode, (int __user *) arg);
|
|
- break;
|
|
-
|
|
- case FS_IOC_SETFLAGS:
|
|
- ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
|
|
- break;
|
|
-
|
|
- case FS_IOC_FSGETXATTR:
|
|
- ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
|
|
- break;
|
|
-
|
|
- case FS_IOC_FSSETXATTR:
|
|
- ret = bch2_ioc_fssetxattr(c, file, inode,
|
|
- (void __user *) arg);
|
|
- break;
|
|
-
|
|
case BCHFS_IOC_REINHERIT_ATTRS:
|
|
ret = bch2_ioc_reinherit_attrs(c, file, inode,
|
|
(void __user *) arg);
|
|
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
|
|
index d30f9bb056fd..a657e4994b71 100644
|
|
--- a/fs/bcachefs/fs-ioctl.h
|
|
+++ b/fs/bcachefs/fs-ioctl.h
|
|
@@ -2,79 +2,6 @@
|
|
#ifndef _BCACHEFS_FS_IOCTL_H
|
|
#define _BCACHEFS_FS_IOCTL_H
|
|
|
|
-/* Inode flags: */
|
|
-
|
|
-/* bcachefs inode flags -> vfs inode flags: */
|
|
-static const __maybe_unused unsigned bch_flags_to_vfs[] = {
|
|
- [__BCH_INODE_sync] = S_SYNC,
|
|
- [__BCH_INODE_immutable] = S_IMMUTABLE,
|
|
- [__BCH_INODE_append] = S_APPEND,
|
|
- [__BCH_INODE_noatime] = S_NOATIME,
|
|
-};
|
|
-
|
|
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
|
|
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
|
|
- [__BCH_INODE_sync] = FS_SYNC_FL,
|
|
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
|
|
- [__BCH_INODE_append] = FS_APPEND_FL,
|
|
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
|
|
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
|
|
-};
|
|
-
|
|
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
|
|
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
|
|
- [__BCH_INODE_sync] = FS_XFLAG_SYNC,
|
|
- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
|
|
- [__BCH_INODE_append] = FS_XFLAG_APPEND,
|
|
- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
|
|
- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
|
|
- //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
|
|
-};
|
|
-
|
|
-#define set_flags(_map, _in, _out) \
|
|
-do { \
|
|
- unsigned _i; \
|
|
- \
|
|
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
|
|
- if ((_in) & (1 << _i)) \
|
|
- (_out) |= _map[_i]; \
|
|
- else \
|
|
- (_out) &= ~_map[_i]; \
|
|
-} while (0)
|
|
-
|
|
-#define map_flags(_map, _in) \
|
|
-({ \
|
|
- unsigned _out = 0; \
|
|
- \
|
|
- set_flags(_map, _in, _out); \
|
|
- _out; \
|
|
-})
|
|
-
|
|
-#define map_flags_rev(_map, _in) \
|
|
-({ \
|
|
- unsigned _i, _out = 0; \
|
|
- \
|
|
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
|
|
- if ((_in) & _map[_i]) { \
|
|
- (_out) |= 1 << _i; \
|
|
- (_in) &= ~_map[_i]; \
|
|
- } \
|
|
- (_out); \
|
|
-})
|
|
-
|
|
-#define map_defined(_map) \
|
|
-({ \
|
|
- unsigned _in = ~0; \
|
|
- \
|
|
- map_flags_rev(_map, _in); \
|
|
-})
|
|
-
|
|
-/* Set VFS inode flags from bcachefs inode: */
|
|
-static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
|
|
-{
|
|
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
|
|
-}
|
|
-
|
|
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
|
|
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
|
|
|
|
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
|
|
index 90ade8f648d9..3813658e72ad 100644
|
|
--- a/fs/bcachefs/fs.c
|
|
+++ b/fs/bcachefs/fs.c
|
|
@@ -11,7 +11,6 @@
|
|
#include "errcode.h"
|
|
#include "extents.h"
|
|
#include "fs.h"
|
|
-#include "fs-common.h"
|
|
#include "fs-io.h"
|
|
#include "fs-ioctl.h"
|
|
#include "fs-io-buffered.h"
|
|
@@ -22,6 +21,7 @@
|
|
#include "io_read.h"
|
|
#include "journal.h"
|
|
#include "keylist.h"
|
|
+#include "namei.h"
|
|
#include "quota.h"
|
|
#include "rebalance.h"
|
|
#include "snapshot.h"
|
|
@@ -33,6 +33,7 @@
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/exportfs.h>
|
|
#include <linux/fiemap.h>
|
|
+#include <linux/fileattr.h>
|
|
#include <linux/fs_context.h>
|
|
#include <linux/module.h>
|
|
#include <linux/pagemap.h>
|
|
@@ -51,6 +52,29 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
|
|
struct bch_inode_unpacked *,
|
|
struct bch_subvolume *);
|
|
|
|
+/* Set VFS inode flags from bcachefs inode: */
|
|
+static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
|
|
+{
|
|
+ static const __maybe_unused unsigned bch_flags_to_vfs[] = {
|
|
+ [__BCH_INODE_sync] = S_SYNC,
|
|
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
|
|
+ [__BCH_INODE_append] = S_APPEND,
|
|
+ [__BCH_INODE_noatime] = S_NOATIME,
|
|
+ };
|
|
+
|
|
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
|
|
+
|
|
+ if (bch2_inode_casefold(c, &inode->ei_inode))
|
|
+ inode->v.i_flags |= S_CASEFOLD;
|
|
+ else
|
|
+ inode->v.i_flags &= ~S_CASEFOLD;
|
|
+
|
|
+ if (inode->ei_inode.bi_flags & BCH_INODE_has_case_insensitive)
|
|
+ inode->v.i_flags &= ~S_NO_CASEFOLD;
|
|
+ else
|
|
+ inode->v.i_flags |= S_NO_CASEFOLD;
|
|
+}
|
|
+
|
|
void bch2_inode_update_after_write(struct btree_trans *trans,
|
|
struct bch_inode_info *inode,
|
|
struct bch_inode_unpacked *bi,
|
|
@@ -79,7 +103,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
|
|
|
|
inode->ei_inode = *bi;
|
|
|
|
- bch2_inode_flags_to_vfs(inode);
|
|
+ bch2_inode_flags_to_vfs(c, inode);
|
|
}
|
|
|
|
int __must_check bch2_write_inode(struct bch_fs *c,
|
|
@@ -88,7 +112,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
|
|
void *p, unsigned fields)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct bch_inode_unpacked inode_u;
|
|
int ret;
|
|
retry:
|
|
@@ -172,11 +196,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
|
|
return ret;
|
|
}
|
|
|
|
-static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
|
|
-{
|
|
- return a.subvol == b.subvol && a.inum == b.inum;
|
|
-}
|
|
-
|
|
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
|
|
{
|
|
const subvol_inum *inum = data;
|
|
@@ -333,9 +352,8 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btre
|
|
if (!trans) {
|
|
__wait_on_freeing_inode(c, inode, inum);
|
|
} else {
|
|
- bch2_trans_unlock(trans);
|
|
- __wait_on_freeing_inode(c, inode, inum);
|
|
- int ret = bch2_trans_relock(trans);
|
|
+ int ret = drop_locks_do(trans,
|
|
+ (__wait_on_freeing_inode(c, inode, inum), 0));
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
}
|
|
@@ -631,17 +649,24 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
|
|
const struct qstr *name)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter dirent_iter = {};
|
|
subvol_inum inum = {};
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
+ struct qstr lookup_name;
|
|
+ int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+
|
|
+ struct btree_iter dirent_iter = {};
|
|
struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
|
|
- dir_hash_info, dir, name, 0);
|
|
- int ret = bkey_err(k);
|
|
+ dir_hash_info, dir, &lookup_name, 0);
|
|
+ ret = bkey_err(k);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
|
|
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
|
|
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
+
|
|
+ ret = bch2_dirent_read_target(trans, dir, d, &inum);
|
|
if (ret > 0)
|
|
ret = -ENOENT;
|
|
if (ret)
|
|
@@ -651,30 +676,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
|
|
if (inode)
|
|
goto out;
|
|
|
|
+ /*
|
|
+ * Note: if check/repair needs it, we commit before
|
|
+ * bch2_inode_hash_init_insert(), as after that point we can't take a
|
|
+ * restart - not in the top level loop with a commit_do(), like we
|
|
+ * usually do:
|
|
+ */
|
|
+
|
|
struct bch_subvolume subvol;
|
|
struct bch_inode_unpacked inode_u;
|
|
ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
|
|
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
|
|
+ bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
|
|
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
|
|
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
|
|
|
|
+ /*
|
|
+ * don't remove it: check_inodes might find another inode that points
|
|
+ * back to this dirent
|
|
+ */
|
|
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
|
|
- c, "dirent to missing inode:\n %s",
|
|
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
|
|
+ c, "dirent to missing inode:\n%s",
|
|
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
|
|
if (ret)
|
|
goto err;
|
|
-
|
|
- /* regular files may have hardlinks: */
|
|
- if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
|
|
- !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
|
|
- c,
|
|
- "dirent points to inode that does not point back:\n %s",
|
|
- (bch2_bkey_val_to_text(&buf, c, k),
|
|
- prt_printf(&buf, "\n "),
|
|
- bch2_inode_unpacked_to_text(&buf, &inode_u),
|
|
- buf.buf))) {
|
|
- ret = -ENOENT;
|
|
- goto err;
|
|
- }
|
|
out:
|
|
bch2_trans_iter_exit(trans, &dirent_iter);
|
|
printbuf_exit(&buf);
|
|
@@ -698,6 +723,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
|
|
if (IS_ERR(inode))
|
|
inode = NULL;
|
|
|
|
+#ifdef CONFIG_UNICODE
|
|
+ if (!inode && IS_CASEFOLDED(vdir)) {
|
|
+ /*
|
|
+ * Do not cache a negative dentry in casefolded directories
|
|
+ * as it would need to be invalidated in the following situation:
|
|
+ * - Lookup file "blAH" in a casefolded directory
|
|
+ * - Creation of file "BLAH" in a casefolded directory
|
|
+ * - Lookup file "blAH" in a casefolded directory
|
|
+ * which would fail if we had a negative dentry.
|
|
+ *
|
|
+ * We should come back to this when VFS has a method to handle
|
|
+ * this edgecase.
|
|
+ */
|
|
+ return NULL;
|
|
+ }
|
|
+#endif
|
|
+
|
|
return d_splice_alias(&inode->v, dentry);
|
|
}
|
|
|
|
@@ -806,6 +848,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
|
|
*/
|
|
set_nlink(&inode->v, 0);
|
|
}
|
|
+
|
|
+ if (IS_CASEFOLDED(vdir))
|
|
+ d_invalidate(dentry);
|
|
err:
|
|
bch2_trans_put(trans);
|
|
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
|
|
@@ -876,6 +921,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
|
|
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
|
|
struct bch_inode_unpacked dst_dir_u, src_dir_u;
|
|
struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
|
|
+ struct d_casefold_enable casefold_enable_src = {};
|
|
+ struct d_casefold_enable casefold_enable_dst = {};
|
|
struct btree_trans *trans;
|
|
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
|
|
? BCH_RENAME_EXCHANGE
|
|
@@ -900,6 +947,21 @@ static int bch2_rename2(struct mnt_idmap *idmap,
|
|
src_inode,
|
|
dst_inode);
|
|
|
|
+ if (src_dir != dst_dir) {
|
|
+ if (bch2_inode_casefold(c, &src_inode->ei_inode)) {
|
|
+ ret = d_casefold_enable(dst_dentry, &casefold_enable_dst, true);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (mode == BCH_RENAME_EXCHANGE &&
|
|
+ bch2_inode_casefold(c, &dst_inode->ei_inode)) {
|
|
+ ret = d_casefold_enable(src_dentry, &casefold_enable_src, true);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
trans = bch2_trans_get(c);
|
|
|
|
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
|
|
@@ -1004,6 +1066,9 @@ static int bch2_rename2(struct mnt_idmap *idmap,
|
|
src_inode,
|
|
dst_inode);
|
|
|
|
+ d_casefold_enable_commit(&casefold_enable_dst, ret);
|
|
+ d_casefold_enable_commit(&casefold_enable_src, ret);
|
|
+
|
|
return bch2_err_class(ret);
|
|
}
|
|
|
|
@@ -1056,7 +1121,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
|
|
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
struct bch_qid qid;
|
|
struct btree_trans *trans;
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter inode_iter = {};
|
|
struct bch_inode_unpacked inode_u;
|
|
struct posix_acl *acl = NULL;
|
|
kuid_t kuid;
|
|
@@ -1216,10 +1281,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
|
|
return finish_open_simple(file, 0);
|
|
}
|
|
|
|
+struct bch_fiemap_extent {
|
|
+ struct bkey_buf kbuf;
|
|
+ unsigned flags;
|
|
+};
|
|
+
|
|
static int bch2_fill_extent(struct bch_fs *c,
|
|
struct fiemap_extent_info *info,
|
|
- struct bkey_s_c k, unsigned flags)
|
|
+ struct bch_fiemap_extent *fe)
|
|
{
|
|
+ struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
|
|
+ unsigned flags = fe->flags;
|
|
+
|
|
+ BUG_ON(!k.k->size);
|
|
+
|
|
if (bkey_extent_is_direct_data(k.k)) {
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
const union bch_extent_entry *entry;
|
|
@@ -1272,110 +1347,225 @@ static int bch2_fill_extent(struct bch_fs *c,
|
|
}
|
|
}
|
|
|
|
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
|
|
- u64 start, u64 len)
|
|
+/*
|
|
+ * Scan a range of an inode for data in pagecache.
|
|
+ *
|
|
+ * Intended to be retryable, so don't modify the output params until success is
|
|
+ * imminent.
|
|
+ */
|
|
+static int
|
|
+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
|
|
+ bool nonblock)
|
|
{
|
|
- struct bch_fs *c = vinode->i_sb->s_fs_info;
|
|
- struct bch_inode_info *ei = to_bch_ei(vinode);
|
|
- struct btree_trans *trans;
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- struct bkey_buf cur, prev;
|
|
- bool have_extent = false;
|
|
- int ret = 0;
|
|
+ loff_t dstart, dend;
|
|
|
|
- ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
|
|
- if (ret)
|
|
+ dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
|
|
+ if (dstart < 0)
|
|
+ return dstart;
|
|
+
|
|
+ if (dstart == *end) {
|
|
+ *start = dstart;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
|
|
+ if (dend < 0)
|
|
+ return dend;
|
|
+
|
|
+ /* race */
|
|
+ BUG_ON(dstart == dend);
|
|
+
|
|
+ *start = dstart;
|
|
+ *end = dend;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
|
|
+ * extent btree. If data is found, fake up an extent key so it looks like a
|
|
+ * delalloc extent to the rest of the fiemap processing code.
|
|
+ */
|
|
+static int
|
|
+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
|
|
+ u64 start, u64 end, struct bch_fiemap_extent *cur)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct bkey_i_extent *delextent;
|
|
+ struct bch_extent_ptr ptr = {};
|
|
+ loff_t dstart = start << 9, dend = end << 9;
|
|
+ int ret;
|
|
+
|
|
+ /*
|
|
+ * We hold btree locks here so we cannot block on folio locks without
|
|
+ * dropping trans locks first. Run a nonblocking scan for the common
|
|
+ * case of no folios over holes and fall back on failure.
|
|
+ *
|
|
+ * Note that dropping locks like this is technically racy against
|
|
+ * writeback inserting to the extent tree, but a non-sync fiemap scan is
|
|
+ * fundamentally racy with writeback anyways. Therefore, just report the
|
|
+ * range as delalloc regardless of whether we have to cycle trans locks.
|
|
+ */
|
|
+ ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
|
|
+ if (ret == -EAGAIN)
|
|
+ ret = drop_locks_do(trans,
|
|
+ bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
|
|
+ if (ret < 0)
|
|
return ret;
|
|
|
|
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
|
|
- if (start + len < start)
|
|
- return -EINVAL;
|
|
+ /*
|
|
+ * Create a fake extent key in the buffer. We have to add a dummy extent
|
|
+ * pointer for the fill code to add an extent entry. It's explicitly
|
|
+ * zeroed to reflect delayed allocation (i.e. phys offset 0).
|
|
+ */
|
|
+ bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
|
|
+ delextent = bkey_extent_init(cur->kbuf.k);
|
|
+ delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
|
|
+ delextent->k.size = (dend - dstart) >> 9;
|
|
+ bch2_bkey_append_ptr(&delextent->k_i, ptr);
|
|
|
|
- start >>= 9;
|
|
+ cur->flags = FIEMAP_EXTENT_DELALLOC;
|
|
|
|
- bch2_bkey_buf_init(&cur);
|
|
- bch2_bkey_buf_init(&prev);
|
|
- trans = bch2_trans_get(c);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_next_fiemap_extent(struct btree_trans *trans,
|
|
+ struct bch_inode_info *inode,
|
|
+ u64 start, u64 end,
|
|
+ struct bch_fiemap_extent *cur)
|
|
+{
|
|
+ u32 snapshot;
|
|
+ int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
|
|
+ if (ret)
|
|
+ return ret;
|
|
|
|
+ struct btree_iter iter;
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
|
|
- POS(ei->v.i_ino, start), 0);
|
|
+ SPOS(inode->ei_inum.inum, start, snapshot), 0);
|
|
|
|
- while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
|
|
- enum btree_id data_btree = BTREE_ID_extents;
|
|
+ struct bkey_s_c k =
|
|
+ bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- bch2_trans_begin(trans);
|
|
+ u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
|
|
|
|
- u32 snapshot;
|
|
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
|
|
- if (ret)
|
|
- continue;
|
|
+ ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter, snapshot);
|
|
+ struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
|
|
|
|
- k = bch2_btree_iter_peek_max(&iter, end);
|
|
- ret = bkey_err(k);
|
|
+ /*
|
|
+ * Does the pagecache or the btree take precedence?
|
|
+ *
|
|
+ * It _should_ be the pagecache, so that we correctly report delalloc
|
|
+ * extents when dirty in the pagecache (we're COW, after all).
|
|
+ *
|
|
+ * But we'd have to add per-sector writeback tracking to
|
|
+ * bch_folio_state, otherwise we report delalloc extents for clean
|
|
+ * cached data in the pagecache.
|
|
+ *
|
|
+ * We should do this, but even then fiemap won't report stable mappings:
|
|
+ * on bcachefs data moves around in the background (copygc, rebalance)
|
|
+ * and we don't provide a way for userspace to lock that out.
|
|
+ */
|
|
+ if (k.k &&
|
|
+ bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
|
|
+ pagecache_start)) {
|
|
+ bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
|
|
+ bch2_cut_front(iter.pos, cur->kbuf.k);
|
|
+ bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
|
|
+ cur->flags = 0;
|
|
+ } else if (k.k) {
|
|
+ bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
|
|
+ }
|
|
+
|
|
+ if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
|
|
+ unsigned sectors = cur->kbuf.k->k.size;
|
|
+ s64 offset_into_extent = 0;
|
|
+ enum btree_id data_btree = BTREE_ID_extents;
|
|
+ ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
|
|
+ &cur->kbuf);
|
|
if (ret)
|
|
- continue;
|
|
+ goto err;
|
|
|
|
- if (!k.k)
|
|
- break;
|
|
+ struct bkey_i *k = cur->kbuf.k;
|
|
+ sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
|
|
|
|
- if (!bkey_extent_is_data(k.k) &&
|
|
- k.k->type != KEY_TYPE_reservation) {
|
|
- bch2_btree_iter_advance(&iter);
|
|
- continue;
|
|
- }
|
|
+ bch2_cut_front(POS(k->k.p.inode,
|
|
+ bkey_start_offset(&k->k) + offset_into_extent),
|
|
+ k);
|
|
+ bch2_key_resize(&k->k, sectors);
|
|
+ k->k.p = iter.pos;
|
|
+ k->k.p.offset += k->k.size;
|
|
+ }
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
|
|
+ u64 start, u64 len)
|
|
+{
|
|
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *ei = to_bch_ei(vinode);
|
|
+ struct btree_trans *trans;
|
|
+ struct bch_fiemap_extent cur, prev;
|
|
+ int ret = 0;
|
|
|
|
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
|
|
- unsigned sectors = k.k->size - offset_into_extent;
|
|
+ ret = fiemap_prep(&ei->v, info, start, &len, 0);
|
|
+ if (ret)
|
|
+ return ret;
|
|
|
|
- bch2_bkey_buf_reassemble(&cur, c, k);
|
|
+ if (start + len < start)
|
|
+ return -EINVAL;
|
|
|
|
- ret = bch2_read_indirect_extent(trans, &data_btree,
|
|
- &offset_into_extent, &cur);
|
|
+ start >>= 9;
|
|
+ u64 end = (start + len) >> 9;
|
|
+
|
|
+ bch2_bkey_buf_init(&cur.kbuf);
|
|
+ bch2_bkey_buf_init(&prev.kbuf);
|
|
+ bkey_init(&prev.kbuf.k->k);
|
|
+
|
|
+ trans = bch2_trans_get(c);
|
|
+
|
|
+ while (start < end) {
|
|
+ ret = lockrestart_do(trans,
|
|
+ bch2_next_fiemap_extent(trans, ei, start, end, &cur));
|
|
if (ret)
|
|
- continue;
|
|
+ goto err;
|
|
|
|
- k = bkey_i_to_s_c(cur.k);
|
|
- bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
|
|
+ BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
|
|
+ BUG_ON(cur.kbuf.k->k.p.offset > end);
|
|
|
|
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
|
|
+ if (bkey_start_offset(&cur.kbuf.k->k) == end)
|
|
+ break;
|
|
|
|
- bch2_cut_front(POS(k.k->p.inode,
|
|
- bkey_start_offset(k.k) +
|
|
- offset_into_extent),
|
|
- cur.k);
|
|
- bch2_key_resize(&cur.k->k, sectors);
|
|
- cur.k->k.p = iter.pos;
|
|
- cur.k->k.p.offset += cur.k->k.size;
|
|
+ start = cur.kbuf.k->k.p.offset;
|
|
|
|
- if (have_extent) {
|
|
+ if (!bkey_deleted(&prev.kbuf.k->k)) {
|
|
bch2_trans_unlock(trans);
|
|
- ret = bch2_fill_extent(c, info,
|
|
- bkey_i_to_s_c(prev.k), 0);
|
|
+ ret = bch2_fill_extent(c, info, &prev);
|
|
if (ret)
|
|
- break;
|
|
+ goto err;
|
|
}
|
|
|
|
- bkey_copy(prev.k, cur.k);
|
|
- have_extent = true;
|
|
-
|
|
- bch2_btree_iter_set_pos(&iter,
|
|
- POS(iter.pos.inode, iter.pos.offset + sectors));
|
|
+ bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
|
|
+ prev.flags = cur.flags;
|
|
}
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
|
|
- if (!ret && have_extent) {
|
|
+ if (!bkey_deleted(&prev.kbuf.k->k)) {
|
|
bch2_trans_unlock(trans);
|
|
- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
|
|
- FIEMAP_EXTENT_LAST);
|
|
+ prev.flags |= FIEMAP_EXTENT_LAST;
|
|
+ ret = bch2_fill_extent(c, info, &prev);
|
|
}
|
|
-
|
|
+err:
|
|
bch2_trans_put(trans);
|
|
- bch2_bkey_buf_exit(&cur, c);
|
|
- bch2_bkey_buf_exit(&prev, c);
|
|
- return ret < 0 ? ret : 0;
|
|
+ bch2_bkey_buf_exit(&cur.kbuf, c);
|
|
+ bch2_bkey_buf_exit(&prev.kbuf, c);
|
|
+
|
|
+ return bch2_err_class(ret < 0 ? ret : 0);
|
|
}
|
|
|
|
static const struct vm_operations_struct bch_vm_ops = {
|
|
@@ -1430,6 +1620,153 @@ static int bch2_open(struct inode *vinode, struct file *file)
|
|
return generic_file_open(vinode, file);
|
|
}
|
|
|
|
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
|
|
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
|
|
+ [__BCH_INODE_sync] = FS_SYNC_FL,
|
|
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
|
|
+ [__BCH_INODE_append] = FS_APPEND_FL,
|
|
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
|
|
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
|
|
+};
|
|
+
|
|
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
|
|
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
|
|
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
|
|
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
|
|
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
|
|
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
|
|
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
|
|
+};
|
|
+
|
|
+static int bch2_fileattr_get(struct dentry *dentry,
|
|
+ struct fileattr *fa)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
|
|
+
|
|
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
|
|
+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
|
|
+
|
|
+ if (bch2_inode_casefold(c, &inode->ei_inode))
|
|
+ fa->flags |= FS_CASEFOLD_FL;
|
|
+
|
|
+ fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+struct flags_set {
|
|
+ unsigned mask;
|
|
+ unsigned flags;
|
|
+ unsigned projid;
|
|
+ bool set_project;
|
|
+ bool set_casefold;
|
|
+ bool casefold;
|
|
+};
|
|
+
|
|
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
|
|
+ struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct flags_set *s = p;
|
|
+
|
|
+ /*
|
|
+ * We're relying on btree locking here for exclusion with other ioctl
|
|
+ * calls - use the flags in the btree (@bi), not inode->i_flags:
|
|
+ */
|
|
+ if (!S_ISREG(bi->bi_mode) &&
|
|
+ !S_ISDIR(bi->bi_mode) &&
|
|
+ (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (s->casefold != bch2_inode_casefold(c, bi)) {
|
|
+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ if (s->set_project) {
|
|
+ bi->bi_project = s->projid;
|
|
+ bi->bi_fields_set |= BIT(Inode_opt_project);
|
|
+ }
|
|
+
|
|
+ bi->bi_flags &= ~s->mask;
|
|
+ bi->bi_flags |= s->flags;
|
|
+
|
|
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_fileattr_set(struct mnt_idmap *idmap,
|
|
+ struct dentry *dentry,
|
|
+ struct fileattr *fa)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct flags_set s = {};
|
|
+ struct d_casefold_enable casefold_enable = {};
|
|
+ int ret;
|
|
+
|
|
+ guard(mutex)(&inode->ei_update_lock);
|
|
+
|
|
+ if (fa->fsx_valid) {
|
|
+ fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
|
|
+
|
|
+ s.mask = map_defined(bch_flags_to_xflags);
|
|
+ s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
|
|
+ if (fa->fsx_xflags)
|
|
+ return -EOPNOTSUPP;
|
|
+
|
|
+ if (fa->fsx_projid >= U32_MAX)
|
|
+ return -EINVAL;
|
|
+
|
|
+ /*
|
|
+ * inode fields accessible via the xattr interface are stored with a +1
|
|
+ * bias, so that 0 means unset:
|
|
+ */
|
|
+ if ((inode->ei_inode.bi_project ||
|
|
+ fa->fsx_projid) &&
|
|
+ inode->ei_inode.bi_project != fa->fsx_projid + 1) {
|
|
+ s.projid = fa->fsx_projid + 1;
|
|
+ s.set_project = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (fa->flags_valid) {
|
|
+ s.mask = map_defined(bch_flags_to_uflags);
|
|
+
|
|
+ s.set_casefold = true;
|
|
+ s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
|
|
+ fa->flags &= ~FS_CASEFOLD_FL;
|
|
+
|
|
+ if (s.casefold && s.casefold != bch2_inode_casefold(c, &inode->ei_inode)) {
|
|
+ ret = d_casefold_enable(dentry, &casefold_enable, false);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
|
|
+ if (fa->flags) {
|
|
+ ret = -EOPNOTSUPP;
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
|
|
+ (s.set_project
|
|
+ ? bch2_set_projid(c, inode, fa->fsx_projid)
|
|
+ : 0) ?:
|
|
+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
|
|
+ ATTR_CTIME);
|
|
+err:
|
|
+ d_casefold_enable_commit(&casefold_enable, ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static const struct file_operations bch_file_operations = {
|
|
.open = bch2_open,
|
|
.llseek = bch2_llseek,
|
|
@@ -1457,6 +1794,8 @@ static const struct inode_operations bch_file_inode_operations = {
|
|
.get_inode_acl = bch2_get_acl,
|
|
.set_acl = bch2_set_acl,
|
|
#endif
|
|
+ .fileattr_get = bch2_fileattr_get,
|
|
+ .fileattr_set = bch2_fileattr_set,
|
|
};
|
|
|
|
static const struct inode_operations bch_dir_inode_operations = {
|
|
@@ -1477,6 +1816,8 @@ static const struct inode_operations bch_dir_inode_operations = {
|
|
.get_inode_acl = bch2_get_acl,
|
|
.set_acl = bch2_set_acl,
|
|
#endif
|
|
+ .fileattr_get = bch2_fileattr_get,
|
|
+ .fileattr_set = bch2_fileattr_set,
|
|
};
|
|
|
|
static const struct file_operations bch_dir_file_operations = {
|
|
@@ -1499,6 +1840,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
|
|
.get_inode_acl = bch2_get_acl,
|
|
.set_acl = bch2_set_acl,
|
|
#endif
|
|
+ .fileattr_get = bch2_fileattr_get,
|
|
+ .fileattr_set = bch2_fileattr_set,
|
|
};
|
|
|
|
static const struct inode_operations bch_special_inode_operations = {
|
|
@@ -1509,6 +1852,8 @@ static const struct inode_operations bch_special_inode_operations = {
|
|
.get_inode_acl = bch2_get_acl,
|
|
.set_acl = bch2_set_acl,
|
|
#endif
|
|
+ .fileattr_get = bch2_fileattr_get,
|
|
+ .fileattr_set = bch2_fileattr_set,
|
|
};
|
|
|
|
static const struct address_space_operations bch_address_space_operations = {
|
|
@@ -1678,17 +2023,17 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter1, snapshot);
|
|
- bch2_btree_iter_set_snapshot(&iter2, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
|
|
|
|
ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
|
|
if (ret)
|
|
goto err;
|
|
|
|
if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
|
|
- bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
|
|
+ bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
|
|
|
|
- k = bch2_btree_iter_peek_slot(&iter1);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter1);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1712,7 +2057,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
|
|
* File with multiple hardlinks and our backref is to the wrong
|
|
* directory - linear search:
|
|
*/
|
|
- for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
|
|
+ for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
|
|
if (k.k->p.inode > dir->ei_inode.bi_inum)
|
|
break;
|
|
|
|
@@ -1802,7 +2147,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans,
|
|
break;
|
|
}
|
|
|
|
- mapping_set_large_folios(inode->v.i_mapping);
|
|
+ mapping_set_folio_min_order(inode->v.i_mapping,
|
|
+ get_order(trans->c->opts.block_size));
|
|
}
|
|
|
|
static void bch2_free_inode(struct inode *vinode)
|
|
@@ -2008,55 +2354,19 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
|
|
return c ?: ERR_PTR(-ENOENT);
|
|
}
|
|
|
|
-static int bch2_remount(struct super_block *sb, int *flags,
|
|
- struct bch_opts opts)
|
|
-{
|
|
- struct bch_fs *c = sb->s_fs_info;
|
|
- int ret = 0;
|
|
-
|
|
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
|
|
-
|
|
- if (opts.read_only != c->opts.read_only) {
|
|
- down_write(&c->state_lock);
|
|
-
|
|
- if (opts.read_only) {
|
|
- bch2_fs_read_only(c);
|
|
-
|
|
- sb->s_flags |= SB_RDONLY;
|
|
- } else {
|
|
- ret = bch2_fs_read_write(c);
|
|
- if (ret) {
|
|
- bch_err(c, "error going rw: %i", ret);
|
|
- up_write(&c->state_lock);
|
|
- ret = -EINVAL;
|
|
- goto err;
|
|
- }
|
|
-
|
|
- sb->s_flags &= ~SB_RDONLY;
|
|
- }
|
|
-
|
|
- c->opts.read_only = opts.read_only;
|
|
-
|
|
- up_write(&c->state_lock);
|
|
- }
|
|
-
|
|
- if (opt_defined(opts, errors))
|
|
- c->opts.errors = opts.errors;
|
|
-err:
|
|
- return bch2_err_class(ret);
|
|
-}
|
|
-
|
|
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
|
|
{
|
|
struct bch_fs *c = root->d_sb->s_fs_info;
|
|
bool first = true;
|
|
|
|
- for_each_online_member(c, ca) {
|
|
+ rcu_read_lock();
|
|
+ for_each_online_member_rcu(c, ca) {
|
|
if (!first)
|
|
seq_putc(seq, ':');
|
|
first = false;
|
|
seq_puts(seq, ca->disk_sb.sb_name);
|
|
}
|
|
+ rcu_read_unlock();
|
|
|
|
return 0;
|
|
}
|
|
@@ -2163,7 +2473,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
|
|
struct inode *vinode;
|
|
struct bch2_opts_parse *opts_parse = fc->fs_private;
|
|
struct bch_opts opts = opts_parse->opts;
|
|
- darray_str devs;
|
|
+ darray_const_str devs;
|
|
darray_fs devs_to_fs = {};
|
|
int ret;
|
|
|
|
@@ -2187,14 +2497,17 @@ static int bch2_fs_get_tree(struct fs_context *fc)
|
|
if (!IS_ERR(sb))
|
|
goto got_sb;
|
|
|
|
- c = bch2_fs_open(devs.data, devs.nr, opts);
|
|
+ c = bch2_fs_open(&devs, &opts);
|
|
ret = PTR_ERR_OR_ZERO(c);
|
|
if (ret)
|
|
goto err;
|
|
|
|
+ if (opt_defined(opts, discard))
|
|
+ set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
|
|
+
|
|
/* Some options can't be parsed until after the fs is started: */
|
|
opts = bch2_opts_empty();
|
|
- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
|
|
+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
|
|
if (ret)
|
|
goto err_stop_fs;
|
|
|
|
@@ -2234,7 +2547,12 @@ static int bch2_fs_get_tree(struct fs_context *fc)
|
|
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
|
|
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
|
|
super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
|
|
- super_set_sysfs_name_uuid(sb);
|
|
+
|
|
+ if (c->sb.multi_device)
|
|
+ super_set_sysfs_name_uuid(sb);
|
|
+ else
|
|
+ strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name));
|
|
+
|
|
sb->s_shrink->seeks = 0;
|
|
c->vfs_sb = sb;
|
|
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
|
|
@@ -2245,15 +2563,16 @@ static int bch2_fs_get_tree(struct fs_context *fc)
|
|
|
|
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
|
|
|
|
- for_each_online_member(c, ca) {
|
|
+ rcu_read_lock();
|
|
+ for_each_online_member_rcu(c, ca) {
|
|
struct block_device *bdev = ca->disk_sb.bdev;
|
|
|
|
/* XXX: create an anonymous device for multi device filesystems */
|
|
sb->s_bdev = bdev;
|
|
sb->s_dev = bdev->bd_dev;
|
|
- percpu_ref_put(&ca->io_ref);
|
|
break;
|
|
}
|
|
+ rcu_read_unlock();
|
|
|
|
c->dev = sb->s_dev;
|
|
|
|
@@ -2264,6 +2583,11 @@ static int bch2_fs_get_tree(struct fs_context *fc)
|
|
|
|
sb->s_shrink->seeks = 0;
|
|
|
|
+#ifdef CONFIG_UNICODE
|
|
+ sb->s_encoding = c->cf_encoding;
|
|
+#endif
|
|
+ generic_set_sb_d_ops(sb);
|
|
+
|
|
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
|
|
ret = PTR_ERR_OR_ZERO(vinode);
|
|
bch_err_msg(c, ret, "mounting: error getting root inode");
|
|
@@ -2300,7 +2624,8 @@ static int bch2_fs_get_tree(struct fs_context *fc)
|
|
goto err;
|
|
|
|
err_put_super:
|
|
- __bch2_fs_stop(c);
|
|
+ if (!sb->s_root)
|
|
+ __bch2_fs_stop(c);
|
|
deactivate_locked_super(sb);
|
|
goto err;
|
|
}
|
|
@@ -2343,6 +2668,8 @@ static int bch2_fs_parse_param(struct fs_context *fc,
|
|
int ret = bch2_parse_one_mount_opt(c, &opts->opts,
|
|
&opts->parse_later, param->key,
|
|
param->string);
|
|
+ if (ret)
|
|
+ pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));
|
|
|
|
return bch2_err_class(ret);
|
|
}
|
|
@@ -2351,8 +2678,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
|
|
{
|
|
struct super_block *sb = fc->root->d_sb;
|
|
struct bch2_opts_parse *opts = fc->fs_private;
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+ int ret = 0;
|
|
|
|
- return bch2_remount(sb, &fc->sb_flags, opts->opts);
|
|
+ opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
|
|
+
|
|
+ if (opts->opts.read_only != c->opts.read_only) {
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ if (opts->opts.read_only) {
|
|
+ bch2_fs_read_only(c);
|
|
+
|
|
+ sb->s_flags |= SB_RDONLY;
|
|
+ } else {
|
|
+ ret = bch2_fs_read_write(c);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error going rw: %i", ret);
|
|
+ up_write(&c->state_lock);
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ sb->s_flags &= ~SB_RDONLY;
|
|
+ }
|
|
+
|
|
+ c->opts.read_only = opts->opts.read_only;
|
|
+
|
|
+ up_write(&c->state_lock);
|
|
+ }
|
|
+
|
|
+ if (opt_defined(opts->opts, errors))
|
|
+ c->opts.errors = opts->opts.errors;
|
|
+err:
|
|
+ return bch2_err_class(ret);
|
|
}
|
|
|
|
static const struct fs_context_operations bch2_context_ops = {
|
|
@@ -2396,7 +2754,7 @@ static struct file_system_type bcache_fs_type = {
|
|
.name = "bcachefs",
|
|
.init_fs_context = bch2_init_fs_context,
|
|
.kill_sb = bch2_kill_sb,
|
|
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
|
|
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
|
|
};
|
|
|
|
MODULE_ALIAS_FS("bcachefs");
|
|
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
|
|
index 0e85131d0af8..fd3178189565 100644
|
|
--- a/fs/bcachefs/fsck.c
|
|
+++ b/fs/bcachefs/fsck.c
|
|
@@ -6,14 +6,13 @@
|
|
#include "btree_cache.h"
|
|
#include "btree_update.h"
|
|
#include "buckets.h"
|
|
-#include "darray.h"
|
|
#include "dirent.h"
|
|
#include "error.h"
|
|
#include "fs.h"
|
|
-#include "fs-common.h"
|
|
#include "fsck.h"
|
|
#include "inode.h"
|
|
#include "keylist.h"
|
|
+#include "namei.h"
|
|
#include "recovery_passes.h"
|
|
#include "snapshot.h"
|
|
#include "super.h"
|
|
@@ -21,15 +20,9 @@
|
|
#include "xattr.h"
|
|
|
|
#include <linux/bsearch.h>
|
|
+#include <linux/darray.h>
|
|
#include <linux/dcache.h> /* struct qstr */
|
|
|
|
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
|
|
- struct bkey_s_c_dirent d)
|
|
-{
|
|
- return inode->bi_dir == d.k->p.inode &&
|
|
- inode->bi_dir_offset == d.k->p.offset;
|
|
-}
|
|
-
|
|
static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
|
|
struct bch_inode_unpacked *inode)
|
|
{
|
|
@@ -116,50 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
|
|
return ret;
|
|
}
|
|
|
|
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
|
|
- struct bch_inode_unpacked *inode)
|
|
-{
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- int ret;
|
|
-
|
|
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
|
|
- BTREE_ITER_all_snapshots, k, ret) {
|
|
- if (k.k->p.offset != inode_nr)
|
|
- break;
|
|
- if (!bkey_is_inode(k.k))
|
|
- continue;
|
|
- ret = bch2_inode_unpack(k, inode);
|
|
- goto found;
|
|
- }
|
|
- ret = -BCH_ERR_ENOENT_inode;
|
|
-found:
|
|
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- return ret;
|
|
-}
|
|
-
|
|
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
|
|
- struct bch_inode_unpacked *inode)
|
|
-{
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- int ret;
|
|
-
|
|
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
|
|
- SPOS(0, inode_nr, snapshot), 0);
|
|
- ret = bkey_err(k);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- ret = bkey_is_inode(k.k)
|
|
- ? bch2_inode_unpack(k, inode)
|
|
- : -BCH_ERR_ENOENT_inode;
|
|
-err:
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- return ret;
|
|
-}
|
|
-
|
|
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
|
|
struct bch_hash_info hash_info,
|
|
subvol_inum dir, struct qstr *name,
|
|
@@ -179,32 +128,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
-static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
|
|
-{
|
|
- struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter;
|
|
- struct bch_inode_unpacked dir_inode;
|
|
- struct bch_hash_info dir_hash_info;
|
|
- int ret;
|
|
-
|
|
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
|
|
-
|
|
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
|
|
-
|
|
- ret = bch2_btree_iter_traverse(&iter) ?:
|
|
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
|
|
- &dir_hash_info, &iter,
|
|
- BTREE_UPDATE_internal_snapshot_node);
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
-err:
|
|
- bch_err_fn(c, ret);
|
|
- return ret;
|
|
-}
|
|
-
|
|
/*
|
|
* Find any subvolume associated with a tree of snapshots
|
|
* We can't rely on master_subvol - it might have been deleted.
|
|
@@ -242,7 +165,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct qstr lostfound_str = QSTR("lost+found");
|
|
- struct btree_iter lostfound_iter = { NULL };
|
|
+ struct btree_iter lostfound_iter = {};
|
|
u64 inum = 0;
|
|
unsigned d_type = 0;
|
|
int ret;
|
|
@@ -287,7 +210,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
|
|
|
|
struct bch_inode_unpacked root_inode;
|
|
struct bch_hash_info root_hash_info;
|
|
- ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
|
|
+ ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
|
|
bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
|
|
root_inum.inum, subvolid);
|
|
if (ret)
|
|
@@ -313,7 +236,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
|
|
* The bch2_check_dirents pass has already run, dangling dirents
|
|
* shouldn't exist here:
|
|
*/
|
|
- ret = lookup_inode(trans, inum, snapshot, lostfound);
|
|
+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
|
|
bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
|
|
inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
|
|
return ret;
|
|
@@ -341,7 +264,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
|
|
u64 cpu = raw_smp_processor_id();
|
|
|
|
bch2_inode_init_early(c, lostfound);
|
|
- bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
|
|
+ bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
|
|
lostfound->bi_dir = root_inode.bi_inum;
|
|
lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
|
|
|
|
@@ -351,8 +274,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
|
|
- ret = bch2_btree_iter_traverse(&lostfound_iter);
|
|
+ bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
|
|
+ ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -362,6 +285,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
|
|
&lostfound_str,
|
|
lostfound->bi_inum,
|
|
&lostfound->bi_dir_offset,
|
|
+ BTREE_UPDATE_internal_snapshot_node|
|
|
STR_HASH_must_create) ?:
|
|
bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
|
|
BTREE_UPDATE_internal_snapshot_node);
|
|
@@ -377,6 +301,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
|
|
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
|
|
return false;
|
|
|
|
+ /*
|
|
+ * Subvolume roots are special: older versions of subvolume roots may be
|
|
+ * disconnected, it's only the newest version that matters.
|
|
+ *
|
|
+ * We only keep a single dirent pointing to a subvolume root, i.e.
|
|
+ * older versions of snapshots will not have a different dirent pointing
|
|
+ * to the same subvolume root.
|
|
+ *
|
|
+ * This is because dirents that point to subvolumes are only visible in
|
|
+ * the parent subvolume - versioning is not needed - and keeping them
|
|
+ * around would break fsck, because when we're crossing subvolumes we
|
|
+ * don't have a consistent snapshot ID to do check the inode <-> dirent
|
|
+ * relationships.
|
|
+ *
|
|
+ * Thus, a subvolume root that's been renamed after a snapshot will have
|
|
+ * a disconnected older version - that's expected.
|
|
+ *
|
|
+ * Note that taking a snapshot always updates the root inode (to update
|
|
+ * the dirent backpointer), so a subvolume root inode with
|
|
+ * BCH_INODE_has_child_snapshot is never visible.
|
|
+ */
|
|
+ if (inode->bi_subvol &&
|
|
+ (inode->bi_flags & BCH_INODE_has_child_snapshot))
|
|
+ return false;
|
|
+
|
|
return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
|
|
}
|
|
|
|
@@ -462,6 +411,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
|
|
&name,
|
|
inode->bi_subvol ?: inode->bi_inum,
|
|
&inode->bi_dir_offset,
|
|
+ BTREE_UPDATE_internal_snapshot_node|
|
|
STR_HASH_must_create);
|
|
if (ret) {
|
|
bch_err_msg(c, ret, "error creating dirent");
|
|
@@ -548,7 +498,7 @@ static int remove_backpointer(struct btree_trans *trans,
|
|
SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
|
|
int ret = bkey_err(d) ?:
|
|
dirent_points_to_inode(c, d, inode) ?:
|
|
- __remove_dirent(trans, d.k->p);
|
|
+ bch2_fsck_remove_dirent(trans, d.k->p);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
@@ -595,12 +545,12 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
|
|
u64 cpu = raw_smp_processor_id();
|
|
|
|
bch2_inode_init_early(c, &new_inode);
|
|
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
|
|
+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
|
|
|
|
new_inode.bi_subvol = subvolid;
|
|
|
|
int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
|
|
- bch2_btree_iter_traverse(&inode_iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &inode_iter) ?:
|
|
bch2_inode_write(trans, &inode_iter, &new_inode);
|
|
bch2_trans_iter_exit(trans, &inode_iter);
|
|
if (ret)
|
|
@@ -665,7 +615,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
|
|
struct btree_iter iter = {};
|
|
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
|
|
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
int ret = bkey_err(k);
|
|
if (ret)
|
|
@@ -685,7 +635,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
|
|
|
|
struct bch_inode_unpacked new_inode;
|
|
bch2_inode_init_early(c, &new_inode);
|
|
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
|
|
+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
|
|
new_inode.bi_size = i_size;
|
|
new_inode.bi_inum = inum;
|
|
new_inode.bi_snapshot = snapshot;
|
|
@@ -816,12 +766,12 @@ static int ref_visible2(struct bch_fs *c,
|
|
|
|
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
|
|
for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
|
|
- (_i)->snapshot <= (_snapshot); _i++) \
|
|
- if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
|
|
+ (_i)->inode.bi_snapshot <= (_snapshot); _i++) \
|
|
+ if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
|
|
|
|
struct inode_walker_entry {
|
|
struct bch_inode_unpacked inode;
|
|
- u32 snapshot;
|
|
+ bool whiteout;
|
|
u64 count;
|
|
u64 i_size;
|
|
};
|
|
@@ -850,13 +800,20 @@ static struct inode_walker inode_walker_init(void)
|
|
static int add_inode(struct bch_fs *c, struct inode_walker *w,
|
|
struct bkey_s_c inode)
|
|
{
|
|
- struct bch_inode_unpacked u;
|
|
-
|
|
- return bch2_inode_unpack(inode, &u) ?:
|
|
- darray_push(&w->inodes, ((struct inode_walker_entry) {
|
|
- .inode = u,
|
|
- .snapshot = inode.k->p.snapshot,
|
|
+ int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
|
|
+ .whiteout = !bkey_is_inode(inode.k),
|
|
}));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ struct inode_walker_entry *n = &darray_last(w->inodes);
|
|
+ if (!n->whiteout) {
|
|
+ return bch2_inode_unpack(inode, &n->inode);
|
|
+ } else {
|
|
+ n->inode.bi_inum = inode.k->p.inode;
|
|
+ n->inode.bi_snapshot = inode.k->p.snapshot;
|
|
+ return 0;
|
|
+ }
|
|
}
|
|
|
|
static int get_inodes_all_snapshots(struct btree_trans *trans,
|
|
@@ -876,13 +833,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
|
|
w->recalculate_sums = false;
|
|
w->inodes.nr = 0;
|
|
|
|
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
|
|
- BTREE_ITER_all_snapshots, k, ret) {
|
|
- if (k.k->p.offset != inum)
|
|
+ for_each_btree_key_max_norestart(trans, iter,
|
|
+ BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
|
|
+ BTREE_ITER_all_snapshots, k, ret) {
|
|
+ ret = add_inode(c, w, k);
|
|
+ if (ret)
|
|
break;
|
|
-
|
|
- if (bkey_is_inode(k.k))
|
|
- add_inode(c, w, k);
|
|
}
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
@@ -894,48 +850,112 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
+static int get_visible_inodes(struct btree_trans *trans,
|
|
+ struct inode_walker *w,
|
|
+ struct snapshots_seen *s,
|
|
+ u64 inum)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ w->inodes.nr = 0;
|
|
+ w->deletes.nr = 0;
|
|
+
|
|
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
|
|
+ BTREE_ITER_all_snapshots, k, ret) {
|
|
+ if (k.k->p.offset != inum)
|
|
+ break;
|
|
+
|
|
+ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
|
|
+ continue;
|
|
+
|
|
+ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
|
|
+ continue;
|
|
+
|
|
+ ret = bkey_is_inode(k.k)
|
|
+ ? add_inode(c, w, k)
|
|
+ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static struct inode_walker_entry *
|
|
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
|
|
+lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
|
|
{
|
|
- bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
|
|
+ struct bch_fs *c = trans->c;
|
|
|
|
struct inode_walker_entry *i;
|
|
__darray_for_each(w->inodes, i)
|
|
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
|
|
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
|
|
goto found;
|
|
|
|
return NULL;
|
|
found:
|
|
- BUG_ON(k.k->p.snapshot > i->snapshot);
|
|
+ BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
|
|
|
|
- if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
|
|
- struct inode_walker_entry new = *i;
|
|
-
|
|
- new.snapshot = k.k->p.snapshot;
|
|
- new.count = 0;
|
|
- new.i_size = 0;
|
|
-
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_bkey_val_to_text(&buf, c, k);
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ int ret = 0;
|
|
|
|
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
|
|
+ if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
|
|
+ trans, snapshot_key_missing_inode_snapshot,
|
|
+ "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
|
|
"unexpected because we should always update the inode when we update a key in that inode\n"
|
|
"%s",
|
|
- w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
|
|
- printbuf_exit(&buf);
|
|
+ w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
|
|
+ (bch2_bkey_val_to_text(&buf, c, k),
|
|
+ buf.buf))) {
|
|
+ struct bch_inode_unpacked new = i->inode;
|
|
+ struct bkey_i whiteout;
|
|
+
|
|
+ new.bi_snapshot = k.k->p.snapshot;
|
|
+
|
|
+ if (!i->whiteout) {
|
|
+ ret = __bch2_fsck_write_inode(trans, &new);
|
|
+ } else {
|
|
+ bkey_init(&whiteout.k);
|
|
+ whiteout.k.type = KEY_TYPE_whiteout;
|
|
+ whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot);
|
|
+ ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
|
|
+ &whiteout,
|
|
+ BTREE_UPDATE_internal_snapshot_node);
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
|
|
- while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
|
|
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
+
|
|
+ struct inode_walker_entry new_entry = *i;
|
|
+
|
|
+ new_entry.inode.bi_snapshot = k.k->p.snapshot;
|
|
+ new_entry.count = 0;
|
|
+ new_entry.i_size = 0;
|
|
+
|
|
+ while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
|
|
--i;
|
|
|
|
size_t pos = i - w->inodes.data;
|
|
- int ret = darray_insert_item(&w->inodes, pos, new);
|
|
+ ret = darray_insert_item(&w->inodes, pos, new_entry);
|
|
if (ret)
|
|
- return ERR_PTR(ret);
|
|
+ goto fsck_err;
|
|
|
|
- i = w->inodes.data + pos;
|
|
+ ret = -BCH_ERR_transaction_restart_nested;
|
|
+ goto fsck_err;
|
|
}
|
|
|
|
+ printbuf_exit(&buf);
|
|
return i;
|
|
+fsck_err:
|
|
+ printbuf_exit(&buf);
|
|
+ return ERR_PTR(ret);
|
|
}
|
|
|
|
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
|
|
@@ -950,42 +970,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
|
|
|
|
w->last_pos = k.k->p;
|
|
|
|
- return lookup_inode_for_snapshot(trans->c, w, k);
|
|
-}
|
|
-
|
|
-static int get_visible_inodes(struct btree_trans *trans,
|
|
- struct inode_walker *w,
|
|
- struct snapshots_seen *s,
|
|
- u64 inum)
|
|
-{
|
|
- struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- int ret;
|
|
-
|
|
- w->inodes.nr = 0;
|
|
- w->deletes.nr = 0;
|
|
-
|
|
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
|
|
- BTREE_ITER_all_snapshots, k, ret) {
|
|
- if (k.k->p.offset != inum)
|
|
- break;
|
|
-
|
|
- if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
|
|
- continue;
|
|
-
|
|
- if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
|
|
- continue;
|
|
-
|
|
- ret = bkey_is_inode(k.k)
|
|
- ? add_inode(c, w, k)
|
|
- : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
|
|
- if (ret)
|
|
- break;
|
|
- }
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
-
|
|
- return ret;
|
|
+ return lookup_inode_for_snapshot(trans, w, k);
|
|
}
|
|
|
|
/*
|
|
@@ -1063,6 +1048,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
|
|
if (ret && !bch2_err_matches(ret, ENOENT))
|
|
return ret;
|
|
|
|
+ if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
|
|
+ inode->bi_subvol &&
|
|
+ (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
|
|
+ /* Older version of a renamed subvolume root: we won't have a
|
|
+ * correct dirent for it. That's expected, see
|
|
+ * inode_should_reattach().
|
|
+ *
|
|
+ * We don't clear the backpointer field when doing the rename
|
|
+ * because there might be arbitrarily many versions in older
|
|
+ * snapshots.
|
|
+ */
|
|
+ inode->bi_dir = 0;
|
|
+ inode->bi_dir_offset = 0;
|
|
+ *write_inode = true;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
if (fsck_err_on(ret,
|
|
trans, inode_points_to_missing_dirent,
|
|
"inode points to missing dirent\n%s",
|
|
@@ -1083,7 +1085,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
|
|
inode->bi_dir_offset = 0;
|
|
*write_inode = true;
|
|
}
|
|
-
|
|
+out:
|
|
ret = 0;
|
|
fsck_err:
|
|
bch2_trans_iter_exit(trans, &dirent_iter);
|
|
@@ -1092,32 +1094,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
-static int get_snapshot_root_inode(struct btree_trans *trans,
|
|
- struct bch_inode_unpacked *root,
|
|
- u64 inum)
|
|
-{
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- int ret = 0;
|
|
-
|
|
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
|
|
- SPOS(0, inum, U32_MAX),
|
|
- BTREE_ITER_all_snapshots, k, ret) {
|
|
- if (k.k->p.offset != inum)
|
|
- break;
|
|
- if (bkey_is_inode(k.k))
|
|
- goto found_root;
|
|
- }
|
|
- if (ret)
|
|
- goto err;
|
|
- BUG();
|
|
-found_root:
|
|
- ret = bch2_inode_unpack(k, root);
|
|
-err:
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- return ret;
|
|
-}
|
|
-
|
|
static int check_inode(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct bkey_s_c k,
|
|
@@ -1148,20 +1124,23 @@ static int check_inode(struct btree_trans *trans,
|
|
goto err;
|
|
|
|
if (snapshot_root->bi_inum != u.bi_inum) {
|
|
- ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
|
|
+ ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
|
|
if (ret)
|
|
goto err;
|
|
}
|
|
|
|
- if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
|
|
- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
|
|
- trans, inode_snapshot_mismatch,
|
|
- "inode hash info in different snapshots don't match")) {
|
|
- u.bi_hash_seed = snapshot_root->bi_hash_seed;
|
|
- SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
|
|
- do_update = true;
|
|
+ if (u.bi_hash_seed != snapshot_root->bi_hash_seed ||
|
|
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) {
|
|
+ ret = bch2_repair_inode_hash_info(trans, snapshot_root);
|
|
+ BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
|
|
+ if (ret)
|
|
+ goto err;
|
|
}
|
|
|
|
+ ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
if (u.bi_dir || u.bi_dir_offset) {
|
|
ret = check_inode_dirent_inode(trans, &u, &do_update);
|
|
if (ret)
|
|
@@ -1464,7 +1443,9 @@ static int check_key_has_inode(struct btree_trans *trans,
|
|
if (k.k->type == KEY_TYPE_whiteout)
|
|
goto out;
|
|
|
|
- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
|
|
+ bool have_inode = i && !i->whiteout;
|
|
+
|
|
+ if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
|
|
ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
|
|
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
if (ret)
|
|
@@ -1475,16 +1456,16 @@ static int check_key_has_inode(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- if (fsck_err_on(!i,
|
|
+ if (fsck_err_on(!have_inode,
|
|
trans, key_in_missing_inode,
|
|
- "key in missing inode:\n %s",
|
|
+ "key in missing inode:\n%s",
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
|
|
goto delete;
|
|
|
|
- if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
|
|
+ if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
|
|
trans, key_in_wrong_inode_type,
|
|
- "key for wrong inode mode %o:\n %s",
|
|
+ "key for wrong inode mode %o:\n%s",
|
|
i->inode.bi_mode,
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
|
|
@@ -1510,21 +1491,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
|
|
if (i->inode.bi_sectors == i->count)
|
|
continue;
|
|
|
|
- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
|
|
+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
|
|
|
|
if (w->recalculate_sums)
|
|
i->count = count2;
|
|
|
|
if (i->count != count2) {
|
|
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
|
|
- w->last_pos.inode, i->snapshot, i->count, count2);
|
|
+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
|
|
i->count = count2;
|
|
}
|
|
|
|
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
|
|
trans, inode_i_sectors_wrong,
|
|
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
|
|
- w->last_pos.inode, i->snapshot,
|
|
+ w->last_pos.inode, i->inode.bi_snapshot,
|
|
i->inode.bi_sectors, i->count)) {
|
|
i->inode.bi_sectors = i->count;
|
|
ret = bch2_fsck_write_inode(trans, &i->inode);
|
|
@@ -1613,7 +1594,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct printbuf buf = PRINTBUF;
|
|
- struct btree_iter iter1, iter2 = { NULL };
|
|
+ struct btree_iter iter1, iter2 = {};
|
|
struct bkey_s_c k1, k2;
|
|
int ret;
|
|
|
|
@@ -1622,18 +1603,18 @@ static int overlapping_extents_found(struct btree_trans *trans,
|
|
bch2_trans_iter_init(trans, &iter1, btree, pos1,
|
|
BTREE_ITER_all_snapshots|
|
|
BTREE_ITER_not_extents);
|
|
- k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
|
|
+ k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
|
|
ret = bkey_err(k1);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- prt_str(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, k1);
|
|
|
|
if (!bpos_eq(pos1, k1.k->p)) {
|
|
- prt_str(&buf, "\n wanted\n ");
|
|
+ prt_str(&buf, "\nwanted\n ");
|
|
bch2_bpos_to_text(&buf, pos1);
|
|
- prt_str(&buf, "\n ");
|
|
+ prt_str(&buf, "\n");
|
|
bch2_bkey_to_text(&buf, &pos2);
|
|
|
|
bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
|
|
@@ -1642,12 +1623,12 @@ static int overlapping_extents_found(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- bch2_trans_copy_iter(&iter2, &iter1);
|
|
+ bch2_trans_copy_iter(trans, &iter2, &iter1);
|
|
|
|
while (1) {
|
|
- bch2_btree_iter_advance(&iter2);
|
|
+ bch2_btree_iter_advance(trans, &iter2);
|
|
|
|
- k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
|
|
+ k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
|
|
ret = bkey_err(k2);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1656,7 +1637,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
|
|
break;
|
|
}
|
|
|
|
- prt_str(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, k2);
|
|
|
|
if (bpos_gt(k2.k->p, pos2.p) ||
|
|
@@ -1667,7 +1648,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- prt_printf(&buf, "\n overwriting %s extent",
|
|
+ prt_printf(&buf, "\noverwriting %s extent",
|
|
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
|
|
|
|
if (fsck_err(trans, extent_overlapping,
|
|
@@ -1688,6 +1669,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
|
|
bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
bch2_disk_reservation_put(c, &res);
|
|
|
|
+ bch_info(c, "repair ret %s", bch2_err_str(ret));
|
|
+
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -1833,21 +1816,21 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
|
|
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
|
|
inode->inodes.data && i >= inode->inodes.data;
|
|
--i) {
|
|
- if (i->snapshot > k.k->p.snapshot ||
|
|
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
|
|
+ if (i->inode.bi_snapshot > k.k->p.snapshot ||
|
|
+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
|
|
continue;
|
|
|
|
if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
|
|
!bkey_extent_is_reservation(k),
|
|
trans, extent_past_end_of_inode,
|
|
- "extent type past end of inode %llu:%u, i_size %llu\n %s",
|
|
- i->inode.bi_inum, i->snapshot, i->inode.bi_size,
|
|
+ "extent type past end of inode %llu:%u, i_size %llu\n%s",
|
|
+ i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
struct btree_iter iter2;
|
|
|
|
- bch2_trans_copy_iter(&iter2, iter);
|
|
- bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
|
|
- ret = bch2_btree_iter_traverse(&iter2) ?:
|
|
+ bch2_trans_copy_iter(trans, &iter2, iter);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot);
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
|
|
bch2_btree_delete_at(trans, &iter2,
|
|
BTREE_UPDATE_internal_snapshot_node);
|
|
bch2_trans_iter_exit(trans, &iter2);
|
|
@@ -1868,8 +1851,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
|
|
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
|
|
inode->inodes.data && i >= inode->inodes.data;
|
|
--i) {
|
|
- if (i->snapshot > k.k->p.snapshot ||
|
|
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
|
|
+ if (i->whiteout ||
|
|
+ i->inode.bi_snapshot > k.k->p.snapshot ||
|
|
+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
|
|
continue;
|
|
|
|
i->count += k.k->size;
|
|
@@ -1951,13 +1935,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
|
|
if (i->inode.bi_nlink == i->count)
|
|
continue;
|
|
|
|
- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
|
|
+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
|
|
if (count2 < 0)
|
|
return count2;
|
|
|
|
if (i->count != count2) {
|
|
bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
|
|
- w->last_pos.inode, i->snapshot, i->count, count2);
|
|
+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
|
|
i->count = count2;
|
|
if (i->inode.bi_nlink == i->count)
|
|
continue;
|
|
@@ -1966,7 +1950,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
|
|
if (fsck_err_on(i->inode.bi_nlink != i->count,
|
|
trans, inode_dir_wrong_nlink,
|
|
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
|
|
- w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
|
|
+ w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) {
|
|
i->inode.bi_nlink = i->count;
|
|
ret = bch2_fsck_write_inode(trans, &i->inode);
|
|
if (ret)
|
|
@@ -1985,169 +1969,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa
|
|
trans_was_restarted(trans, restart_count);
|
|
}
|
|
|
|
-noinline_for_stack
|
|
-static int check_dirent_inode_dirent(struct btree_trans *trans,
|
|
- struct btree_iter *iter,
|
|
- struct bkey_s_c_dirent d,
|
|
- struct bch_inode_unpacked *target)
|
|
-{
|
|
- struct bch_fs *c = trans->c;
|
|
- struct printbuf buf = PRINTBUF;
|
|
- struct btree_iter bp_iter = { NULL };
|
|
- int ret = 0;
|
|
-
|
|
- if (inode_points_to_dirent(target, d))
|
|
- return 0;
|
|
-
|
|
- if (!target->bi_dir &&
|
|
- !target->bi_dir_offset) {
|
|
- fsck_err_on(S_ISDIR(target->bi_mode),
|
|
- trans, inode_dir_missing_backpointer,
|
|
- "directory with missing backpointer\n%s",
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, d.s_c),
|
|
- prt_printf(&buf, "\n"),
|
|
- bch2_inode_unpacked_to_text(&buf, target),
|
|
- buf.buf));
|
|
-
|
|
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
|
|
- trans, inode_unlinked_but_has_dirent,
|
|
- "inode unlinked but has dirent\n%s",
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, d.s_c),
|
|
- prt_printf(&buf, "\n"),
|
|
- bch2_inode_unpacked_to_text(&buf, target),
|
|
- buf.buf));
|
|
-
|
|
- target->bi_flags &= ~BCH_INODE_unlinked;
|
|
- target->bi_dir = d.k->p.inode;
|
|
- target->bi_dir_offset = d.k->p.offset;
|
|
- return __bch2_fsck_write_inode(trans, target);
|
|
- }
|
|
-
|
|
- if (bch2_inode_should_have_single_bp(target) &&
|
|
- !fsck_err(trans, inode_wrong_backpointer,
|
|
- "dirent points to inode that does not point back:\n %s",
|
|
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
|
|
- prt_printf(&buf, "\n "),
|
|
- bch2_inode_unpacked_to_text(&buf, target),
|
|
- buf.buf)))
|
|
- goto err;
|
|
-
|
|
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
|
|
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
|
|
- ret = bkey_err(bp_dirent);
|
|
- if (ret && !bch2_err_matches(ret, ENOENT))
|
|
- goto err;
|
|
-
|
|
- bool backpointer_exists = !ret;
|
|
- ret = 0;
|
|
-
|
|
- if (fsck_err_on(!backpointer_exists,
|
|
- trans, inode_wrong_backpointer,
|
|
- "inode %llu:%u has wrong backpointer:\n"
|
|
- "got %llu:%llu\n"
|
|
- "should be %llu:%llu",
|
|
- target->bi_inum, target->bi_snapshot,
|
|
- target->bi_dir,
|
|
- target->bi_dir_offset,
|
|
- d.k->p.inode,
|
|
- d.k->p.offset)) {
|
|
- target->bi_dir = d.k->p.inode;
|
|
- target->bi_dir_offset = d.k->p.offset;
|
|
- ret = __bch2_fsck_write_inode(trans, target);
|
|
- goto out;
|
|
- }
|
|
-
|
|
- bch2_bkey_val_to_text(&buf, c, d.s_c);
|
|
- prt_newline(&buf);
|
|
- if (backpointer_exists)
|
|
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
|
|
-
|
|
- if (fsck_err_on(backpointer_exists &&
|
|
- (S_ISDIR(target->bi_mode) ||
|
|
- target->bi_subvol),
|
|
- trans, inode_dir_multiple_links,
|
|
- "%s %llu:%u with multiple links\n%s",
|
|
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
|
|
- target->bi_inum, target->bi_snapshot, buf.buf)) {
|
|
- ret = __remove_dirent(trans, d.k->p);
|
|
- goto out;
|
|
- }
|
|
-
|
|
- /*
|
|
- * hardlinked file with nlink 0:
|
|
- * We're just adjusting nlink here so check_nlinks() will pick
|
|
- * it up, it ignores inodes with nlink 0
|
|
- */
|
|
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
|
|
- trans, inode_multiple_links_but_nlink_0,
|
|
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
|
|
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
|
|
- target->bi_nlink++;
|
|
- target->bi_flags &= ~BCH_INODE_unlinked;
|
|
- ret = __bch2_fsck_write_inode(trans, target);
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
-out:
|
|
-err:
|
|
-fsck_err:
|
|
- bch2_trans_iter_exit(trans, &bp_iter);
|
|
- printbuf_exit(&buf);
|
|
- bch_err_fn(c, ret);
|
|
- return ret;
|
|
-}
|
|
-
|
|
-noinline_for_stack
|
|
-static int check_dirent_target(struct btree_trans *trans,
|
|
- struct btree_iter *iter,
|
|
- struct bkey_s_c_dirent d,
|
|
- struct bch_inode_unpacked *target)
|
|
-{
|
|
- struct bch_fs *c = trans->c;
|
|
- struct bkey_i_dirent *n;
|
|
- struct printbuf buf = PRINTBUF;
|
|
- int ret = 0;
|
|
-
|
|
- ret = check_dirent_inode_dirent(trans, iter, d, target);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
|
|
- trans, dirent_d_type_wrong,
|
|
- "incorrect d_type: got %s, should be %s:\n%s",
|
|
- bch2_d_type_str(d.v->d_type),
|
|
- bch2_d_type_str(inode_d_type(target)),
|
|
- (printbuf_reset(&buf),
|
|
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
|
|
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
|
|
- ret = PTR_ERR_OR_ZERO(n);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- bkey_reassemble(&n->k_i, d.s_c);
|
|
- n->v.d_type = inode_d_type(target);
|
|
- if (n->v.d_type == DT_SUBVOL) {
|
|
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
|
|
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
|
|
- } else {
|
|
- n->v.d_inum = cpu_to_le64(target->bi_inum);
|
|
- }
|
|
-
|
|
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- d = dirent_i_to_s_c(n);
|
|
- }
|
|
-err:
|
|
-fsck_err:
|
|
- printbuf_exit(&buf);
|
|
- bch_err_fn(c, ret);
|
|
- return ret;
|
|
-}
|
|
-
|
|
/* find a subvolume that's a descendent of @snapshot: */
|
|
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
|
|
{
|
|
@@ -2241,35 +2062,46 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
|
|
0, subvolume);
|
|
ret = bkey_err(s.s_c);
|
|
if (ret && !bch2_err_matches(ret, ENOENT))
|
|
- return ret;
|
|
+ goto err;
|
|
|
|
if (ret) {
|
|
if (fsck_err(trans, dirent_to_missing_subvol,
|
|
"dirent points to missing subvolume\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
|
|
- return __remove_dirent(trans, d.k->p);
|
|
+ return bch2_fsck_remove_dirent(trans, d.k->p);
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
- if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
|
|
- trans, subvol_fs_path_parent_wrong,
|
|
- "subvol with wrong fs_path_parent, should be be %u\n%s",
|
|
- parent_subvol,
|
|
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
|
- struct bkey_i_subvolume *n =
|
|
- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
|
|
- ret = PTR_ERR_OR_ZERO(n);
|
|
+ if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
|
|
+ printbuf_reset(&buf);
|
|
+
|
|
+ prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
|
|
+ parent_subvol);
|
|
+
|
|
+ ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
|
|
+ le64_to_cpu(s.v->inode) }, &buf);
|
|
if (ret)
|
|
goto err;
|
|
+ prt_newline(&buf);
|
|
+ bch2_bkey_val_to_text(&buf, c, s.s_c);
|
|
|
|
- n->v.fs_path_parent = cpu_to_le32(parent_subvol);
|
|
+ if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
|
|
+ struct bkey_i_subvolume *n =
|
|
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
|
|
+ ret = PTR_ERR_OR_ZERO(n);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
|
|
+ }
|
|
}
|
|
|
|
u64 target_inum = le64_to_cpu(s.v->inode);
|
|
u32 target_snapshot = le32_to_cpu(s.v->snapshot);
|
|
|
|
- ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
|
|
+ ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
|
|
+ &subvol_root, 0);
|
|
if (ret && !bch2_err_matches(ret, ENOENT))
|
|
goto err;
|
|
|
|
@@ -2291,7 +2123,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
|
|
goto err;
|
|
}
|
|
|
|
- ret = check_dirent_target(trans, iter, d, &subvol_root);
|
|
+ ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
|
|
if (ret)
|
|
goto err;
|
|
out:
|
|
@@ -2342,7 +2174,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (!i)
|
|
+ if (!i || i->whiteout)
|
|
goto out;
|
|
|
|
if (dir->first_this_inode)
|
|
@@ -2363,6 +2195,41 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
|
|
|
|
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
|
|
+ /* check casefold */
|
|
+ if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
|
|
+ trans, dirent_casefold_mismatch,
|
|
+ "dirent casefold does not match dir casefold\n%s",
|
|
+ (printbuf_reset(&buf),
|
|
+ bch2_bkey_val_to_text(&buf, c, k),
|
|
+ buf.buf))) {
|
|
+ struct qstr name = bch2_dirent_get_name(d);
|
|
+ u32 subvol = d.v->d_type == DT_SUBVOL
|
|
+ ? d.v->d_parent_subvol
|
|
+ : 0;
|
|
+ u64 target = d.v->d_type == DT_SUBVOL
|
|
+ ? d.v->d_child_subvol
|
|
+ : d.v->d_inum;
|
|
+ u64 dir_offset;
|
|
+
|
|
+ ret = bch2_hash_delete_at(trans,
|
|
+ bch2_dirent_hash_desc, hash_info, iter,
|
|
+ BTREE_UPDATE_internal_snapshot_node) ?:
|
|
+ bch2_dirent_create_snapshot(trans, subvol,
|
|
+ d.k->p.inode, d.k->p.snapshot,
|
|
+ hash_info,
|
|
+ d.v->d_type,
|
|
+ &name,
|
|
+ target,
|
|
+ &dir_offset,
|
|
+ BTREE_ITER_with_updates|
|
|
+ BTREE_UPDATE_internal_snapshot_node|
|
|
+ STR_HASH_must_create) ?:
|
|
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
|
+
|
|
+ /* might need another check_dirents pass */
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
if (d.v->d_type == DT_SUBVOL) {
|
|
ret = check_dirent_to_subvol(trans, iter, d);
|
|
if (ret)
|
|
@@ -2378,13 +2245,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, k),
|
|
buf.buf))) {
|
|
- ret = __remove_dirent(trans, d.k->p);
|
|
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
|
|
if (ret)
|
|
goto err;
|
|
}
|
|
|
|
darray_for_each(target->inodes, i) {
|
|
- ret = check_dirent_target(trans, iter, d, &i->inode);
|
|
+ ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
|
|
if (ret)
|
|
goto err;
|
|
}
|
|
@@ -2402,7 +2269,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
|
|
BTREE_ID_dirents,
|
|
SPOS(k.k->p.inode, k.k->p.offset, *i),
|
|
BTREE_ITER_intent);
|
|
- ret = bch2_btree_iter_traverse(&delete_iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
|
|
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
|
|
hash_info,
|
|
&delete_iter,
|
|
@@ -2482,7 +2349,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
|
|
if (ret)
|
|
return ret;
|
|
|
|
- if (!i)
|
|
+ if (!i || i->whiteout)
|
|
return 0;
|
|
|
|
if (inode->first_this_inode)
|
|
@@ -2551,7 +2418,8 @@ static int check_root_trans(struct btree_trans *trans)
|
|
goto err;
|
|
}
|
|
|
|
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
|
|
+ ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
|
|
+ &root_inode, 0);
|
|
if (ret && !bch2_err_matches(ret, ENOENT))
|
|
return ret;
|
|
|
|
@@ -2583,8 +2451,6 @@ int bch2_check_root(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
-typedef DARRAY(u32) darray_u32;
|
|
-
|
|
static bool darray_u32_has(darray_u32 *d, u32 v)
|
|
{
|
|
darray_for_each(*d, i)
|
|
@@ -2621,7 +2487,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
|
|
u32 parent = le32_to_cpu(s.v->fs_path_parent);
|
|
|
|
if (darray_u32_has(&subvol_path, parent)) {
|
|
- if (fsck_err(c, subvol_loop, "subvolume loop"))
|
|
+ printbuf_reset(&buf);
|
|
+ prt_printf(&buf, "subvolume loop:\n");
|
|
+
|
|
+ darray_for_each_reverse(subvol_path, i)
|
|
+ prt_printf(&buf, "%u ", *i);
|
|
+ prt_printf(&buf, "%u", parent);
|
|
+
|
|
+ if (fsck_err(trans, subvol_loop, "%s", buf.buf))
|
|
ret = reattach_subvol(trans, s);
|
|
break;
|
|
}
|
|
@@ -2629,7 +2502,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
|
|
bch2_trans_iter_exit(trans, &parent_iter);
|
|
bch2_trans_iter_init(trans, &parent_iter,
|
|
BTREE_ID_subvolumes, POS(0, parent), 0);
|
|
- k = bch2_btree_iter_peek_slot(&parent_iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &parent_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -2637,7 +2510,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
|
|
if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
|
|
trans, subvol_unreachable,
|
|
"unreachable subvolume %s",
|
|
- (bch2_bkey_val_to_text(&buf, c, s.s_c),
|
|
+ (printbuf_reset(&buf),
|
|
+ bch2_bkey_val_to_text(&buf, c, s.s_c),
|
|
buf.buf))) {
|
|
ret = reattach_subvol(trans, s);
|
|
break;
|
|
@@ -2793,14 +2667,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
|
|
redo_bi_depth = true;
|
|
|
|
if (path_is_dup(&path, inode.bi_inum, snapshot)) {
|
|
- /* XXX print path */
|
|
- bch_err(c, "directory structure loop");
|
|
-
|
|
- darray_for_each(path, i)
|
|
- pr_err("%llu:%u", i->inum, i->snapshot);
|
|
- pr_err("%llu:%u", inode.bi_inum, snapshot);
|
|
+ printbuf_reset(&buf);
|
|
+ prt_printf(&buf, "directory structure loop:\n");
|
|
+ darray_for_each_reverse(path, i)
|
|
+ prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot);
|
|
+ prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot);
|
|
|
|
- if (fsck_err(trans, dir_loop, "directory structure loop")) {
|
|
+ if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
|
|
ret = remove_backpointer(trans, &inode);
|
|
bch_err_msg(c, ret, "removing dirent");
|
|
if (ret)
|
|
@@ -3199,7 +3072,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
|
|
{
|
|
struct bch_ioctl_fsck_offline arg;
|
|
struct fsck_thread *thr = NULL;
|
|
- darray_str(devs) = {};
|
|
+ darray_const_str devs = {};
|
|
long ret = 0;
|
|
|
|
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
|
@@ -3240,7 +3113,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
|
|
if (arg.opts) {
|
|
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
|
|
ret = PTR_ERR_OR_ZERO(optstr) ?:
|
|
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
|
|
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
|
|
if (!IS_ERR(optstr))
|
|
kfree(optstr);
|
|
|
|
@@ -3257,7 +3130,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
|
|
|
|
bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
|
|
|
|
- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
|
|
+ thr->c = bch2_fs_open(&devs, &thr->opts);
|
|
|
|
if (!IS_ERR(thr->c) &&
|
|
thr->c->opts.errors == BCH_ON_ERROR_panic)
|
|
@@ -3294,19 +3167,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
|
|
c->opts.fix_errors = FSCK_FIX_ask;
|
|
|
|
c->opts.fsck = true;
|
|
- set_bit(BCH_FS_fsck_running, &c->flags);
|
|
+ set_bit(BCH_FS_in_fsck, &c->flags);
|
|
|
|
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
|
|
- int ret = bch2_run_online_recovery_passes(c);
|
|
+ int ret = bch2_run_online_recovery_passes(c, ~0ULL);
|
|
|
|
- clear_bit(BCH_FS_fsck_running, &c->flags);
|
|
+ clear_bit(BCH_FS_in_fsck, &c->flags);
|
|
bch_err_fn(c, ret);
|
|
|
|
c->stdio = NULL;
|
|
c->stdio_filter = NULL;
|
|
c->opts.fix_errors = old_fix_errors;
|
|
|
|
- up(&c->online_fsck_mutex);
|
|
+ up(&c->recovery.run_lock);
|
|
bch2_ro_ref_put(c);
|
|
return ret;
|
|
}
|
|
@@ -3330,7 +3202,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
|
|
if (!bch2_ro_ref_tryget(c))
|
|
return -EROFS;
|
|
|
|
- if (down_trylock(&c->online_fsck_mutex)) {
|
|
+ if (down_trylock(&c->recovery.run_lock)) {
|
|
bch2_ro_ref_put(c);
|
|
return -EAGAIN;
|
|
}
|
|
@@ -3348,7 +3220,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
|
|
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
|
|
|
|
ret = PTR_ERR_OR_ZERO(optstr) ?:
|
|
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
|
|
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
|
|
if (!IS_ERR(optstr))
|
|
kfree(optstr);
|
|
|
|
@@ -3362,7 +3234,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
|
|
bch_err_fn(c, ret);
|
|
if (thr)
|
|
bch2_fsck_thread_exit(&thr->thr);
|
|
- up(&c->online_fsck_mutex);
|
|
+ up(&c->recovery.run_lock);
|
|
bch2_ro_ref_put(c);
|
|
}
|
|
return ret;
|
|
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
|
|
index 339b80770f1d..5cf70108ae2f 100644
|
|
--- a/fs/bcachefs/inode.c
|
|
+++ b/fs/bcachefs/inode.c
|
|
@@ -14,6 +14,7 @@
|
|
#include "extent_update.h"
|
|
#include "fs.h"
|
|
#include "inode.h"
|
|
+#include "namei.h"
|
|
#include "opts.h"
|
|
#include "str_hash.h"
|
|
#include "snapshot.h"
|
|
@@ -240,6 +241,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
|
|
u64 v[2];
|
|
|
|
unpacked->bi_inum = inode.k->p.offset;
|
|
+ unpacked->bi_snapshot = inode.k->p.snapshot;
|
|
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
|
|
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
|
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
|
|
@@ -284,13 +286,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
|
|
{
|
|
memset(unpacked, 0, sizeof(*unpacked));
|
|
|
|
- unpacked->bi_snapshot = k.k->p.snapshot;
|
|
-
|
|
switch (k.k->type) {
|
|
case KEY_TYPE_inode: {
|
|
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
|
|
|
|
unpacked->bi_inum = inode.k->p.offset;
|
|
+ unpacked->bi_snapshot = inode.k->p.snapshot;
|
|
unpacked->bi_journal_seq= 0;
|
|
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
|
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
|
|
@@ -309,6 +310,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
|
|
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
|
|
|
|
unpacked->bi_inum = inode.k->p.offset;
|
|
+ unpacked->bi_snapshot = inode.k->p.snapshot;
|
|
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
|
|
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
|
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
|
|
@@ -326,8 +328,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
|
|
int bch2_inode_unpack(struct bkey_s_c k,
|
|
struct bch_inode_unpacked *unpacked)
|
|
{
|
|
- unpacked->bi_snapshot = k.k->p.snapshot;
|
|
-
|
|
return likely(k.k->type == KEY_TYPE_inode_v3)
|
|
? bch2_inode_unpack_v3(k, unpacked)
|
|
: bch2_inode_unpack_slowpath(k, unpacked);
|
|
@@ -367,6 +367,82 @@ int __bch2_inode_peek(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
|
|
+ u64 inode_nr, u32 snapshot,
|
|
+ struct bch_inode_unpacked *inode,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
|
|
+ SPOS(0, inode_nr, snapshot), flags);
|
|
+ int ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = bkey_is_inode(k.k)
|
|
+ ? bch2_inode_unpack(k, inode)
|
|
+ : -BCH_ERR_ENOENT_inode;
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
|
|
+ subvol_inum inum,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
|
|
+ if (!ret)
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
|
|
+ subvol_inum inum,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
|
|
+ if (!ret)
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
|
|
+}
|
|
+
|
|
+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
|
|
+ struct bch_inode_unpacked *root)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
|
|
+ SPOS(0, inum, U32_MAX),
|
|
+ BTREE_ITER_all_snapshots, k, ret) {
|
|
+ if (k.k->p.offset != inum)
|
|
+ break;
|
|
+ if (bkey_is_inode(k.k)) {
|
|
+ ret = bch2_inode_unpack(k, root);
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ /* We're only called when we know we have an inode for @inum */
|
|
+ BUG_ON(!ret);
|
|
+out:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
int bch2_inode_write_flags(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct bch_inode_unpacked *inode,
|
|
@@ -731,10 +807,9 @@ int bch2_trigger_inode(struct btree_trans *trans,
|
|
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
|
|
}
|
|
|
|
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
|
|
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
|
|
- struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
|
|
- int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
|
|
+ s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
|
|
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
|
|
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -833,7 +908,8 @@ void bch2_inode_init_early(struct bch_fs *c,
|
|
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
|
|
}
|
|
|
|
-void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
|
|
+void bch2_inode_init_late(struct bch_fs *c,
|
|
+ struct bch_inode_unpacked *inode_u, u64 now,
|
|
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
|
struct bch_inode_unpacked *parent)
|
|
{
|
|
@@ -857,6 +933,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
|
|
BCH_INODE_OPTS()
|
|
#undef x
|
|
}
|
|
+
|
|
+ if (!S_ISDIR(mode))
|
|
+ inode_u->bi_casefold = 0;
|
|
+
|
|
+ if (bch2_inode_casefold(c, inode_u))
|
|
+ inode_u->bi_flags |= BCH_INODE_has_case_insensitive;
|
|
}
|
|
|
|
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
|
@@ -864,23 +946,10 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
|
struct bch_inode_unpacked *parent)
|
|
{
|
|
bch2_inode_init_early(c, inode_u);
|
|
- bch2_inode_init_late(inode_u, bch2_current_time(c),
|
|
+ bch2_inode_init_late(c, inode_u, bch2_current_time(c),
|
|
uid, gid, mode, rdev, parent);
|
|
}
|
|
|
|
-static inline u32 bkey_generation(struct bkey_s_c k)
|
|
-{
|
|
- switch (k.k->type) {
|
|
- case KEY_TYPE_inode:
|
|
- case KEY_TYPE_inode_v2:
|
|
- BUG();
|
|
- case KEY_TYPE_inode_generation:
|
|
- return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
|
|
- default:
|
|
- return 0;
|
|
- }
|
|
-}
|
|
-
|
|
static struct bkey_i_inode_alloc_cursor *
|
|
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
|
|
{
|
|
@@ -954,7 +1023,7 @@ int bch2_inode_create(struct btree_trans *trans,
|
|
BTREE_ITER_intent);
|
|
struct bkey_s_c k;
|
|
again:
|
|
- while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ while ((k = bch2_btree_iter_peek(trans, iter)).k &&
|
|
!(ret = bkey_err(k)) &&
|
|
bkey_lt(k.k->p, POS(0, max))) {
|
|
if (pos < iter->pos.offset)
|
|
@@ -965,7 +1034,7 @@ int bch2_inode_create(struct btree_trans *trans,
|
|
* we've found just one:
|
|
*/
|
|
pos = iter->pos.offset + 1;
|
|
- bch2_btree_iter_set_pos(iter, POS(0, pos));
|
|
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
|
|
}
|
|
|
|
if (!ret && pos < max)
|
|
@@ -981,12 +1050,12 @@ int bch2_inode_create(struct btree_trans *trans,
|
|
|
|
/* Retry from start */
|
|
pos = start = min;
|
|
- bch2_btree_iter_set_pos(iter, POS(0, pos));
|
|
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
|
|
le32_add_cpu(&cursor->v.gen, 1);
|
|
goto again;
|
|
found_slot:
|
|
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
|
|
- k = bch2_btree_iter_peek_slot(iter);
|
|
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
|
|
+ k = bch2_btree_iter_peek_slot(trans, iter);
|
|
ret = bkey_err(k);
|
|
if (ret) {
|
|
bch2_trans_iter_exit(trans, iter);
|
|
@@ -1023,9 +1092,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
|
|
|
|
- k = bch2_btree_iter_peek_max(&iter, end);
|
|
+ k = bch2_btree_iter_peek_max(trans, &iter, end);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1056,7 +1125,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
|
|
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct bkey_s_c k;
|
|
u32 snapshot;
|
|
int ret;
|
|
@@ -1092,7 +1161,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
|
|
bch2_fs_inconsistent(c,
|
|
"inode %llu:%u not found when deleting",
|
|
inum.inum, snapshot);
|
|
- ret = -EIO;
|
|
+ ret = -BCH_ERR_ENOENT_inode;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1113,38 +1182,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
|
|
- subvol_inum inum,
|
|
- struct bch_inode_unpacked *inode)
|
|
-{
|
|
- struct btree_iter iter;
|
|
- int ret;
|
|
-
|
|
- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
|
|
- if (!ret)
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- return ret;
|
|
-}
|
|
-
|
|
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
|
|
- subvol_inum inum,
|
|
- struct bch_inode_unpacked *inode)
|
|
-{
|
|
- struct btree_iter iter;
|
|
- int ret;
|
|
-
|
|
- ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
|
|
- if (!ret)
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- return ret;
|
|
-}
|
|
-
|
|
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
|
|
- struct bch_inode_unpacked *inode)
|
|
-{
|
|
- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
|
|
-}
|
|
-
|
|
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
|
|
{
|
|
if (bi->bi_flags & BCH_INODE_unlinked)
|
|
@@ -1218,10 +1255,45 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
|
|
return 0;
|
|
}
|
|
|
|
+int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
|
|
+ struct bch_inode_unpacked *bi, unsigned v)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+
|
|
+#ifdef CONFIG_UNICODE
|
|
+ int ret = 0;
|
|
+ /* Not supported on individual files. */
|
|
+ if (!S_ISDIR(bi->bi_mode))
|
|
+ return -EOPNOTSUPP;
|
|
+
|
|
+ /*
|
|
+ * Make sure the dir is empty, as otherwise we'd need to
|
|
+ * rehash everything and update the dirent keys.
|
|
+ */
|
|
+ ret = bch2_empty_dir_trans(trans, inum);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bch2_check_set_feature(c, BCH_FEATURE_casefolding);
|
|
+
|
|
+ bi->bi_casefold = v + 1;
|
|
+ bi->bi_fields_set |= BIT(Inode_opt_casefold);
|
|
+
|
|
+ return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
|
|
+#else
|
|
+ bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
|
|
+ return -EOPNOTSUPP;
|
|
+#endif
|
|
+}
|
|
+
|
|
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct bkey_i_inode_generation delete;
|
|
struct bch_inode_unpacked inode_u;
|
|
struct bkey_s_c k;
|
|
@@ -1256,7 +1328,7 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum
|
|
bch2_fs_inconsistent(c,
|
|
"inode %llu:%u not found when deleting",
|
|
inum, snapshot);
|
|
- ret = -EIO;
|
|
+ ret = -BCH_ERR_ENOENT_inode;
|
|
goto err;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
|
|
index 428b9be6af34..77ad2d549541 100644
|
|
--- a/fs/bcachefs/inode.h
|
|
+++ b/fs/bcachefs/inode.h
|
|
@@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans,
|
|
subvol_inum inum, unsigned flags)
|
|
{
|
|
return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
|
|
- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
|
|
- return ret;
|
|
}
|
|
|
|
+int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32,
|
|
+ struct bch_inode_unpacked *, unsigned);
|
|
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
|
|
+ subvol_inum,
|
|
+ struct bch_inode_unpacked *);
|
|
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
|
|
+ struct bch_inode_unpacked *);
|
|
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
|
|
+ struct bch_inode_unpacked *);
|
|
+
|
|
+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
|
|
+ struct bch_inode_unpacked *root);
|
|
+
|
|
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
|
|
struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
|
|
|
|
@@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
|
|
|
|
void bch2_inode_init_early(struct bch_fs *,
|
|
struct bch_inode_unpacked *);
|
|
-void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
|
|
+void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64,
|
|
uid_t, gid_t, umode_t, dev_t,
|
|
struct bch_inode_unpacked *);
|
|
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
|
|
@@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *,
|
|
|
|
int bch2_inode_rm(struct bch_fs *, subvol_inum);
|
|
|
|
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
|
|
- subvol_inum,
|
|
- struct bch_inode_unpacked *);
|
|
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
|
|
- struct bch_inode_unpacked *);
|
|
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
|
|
- struct bch_inode_unpacked *);
|
|
-
|
|
#define inode_opt_get(_c, _inode, _name) \
|
|
((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
|
|
|
|
@@ -243,6 +246,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)
|
|
}
|
|
}
|
|
|
|
+static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
|
|
+{
|
|
+ /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
|
|
+ return bi->bi_casefold
|
|
+ ? bi->bi_casefold - 1
|
|
+ : c->opts.casefold;
|
|
+}
|
|
+
|
|
/* i_nlink: */
|
|
|
|
static inline unsigned nlink_bias(umode_t mode)
|
|
@@ -277,13 +288,16 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i
|
|
bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
|
|
|
|
return S_ISDIR(inode->bi_mode) ||
|
|
+ inode->bi_subvol ||
|
|
(!inode->bi_nlink && inode_has_bp);
|
|
}
|
|
|
|
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
|
|
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
|
|
struct bch_inode_unpacked *);
|
|
-int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
|
|
+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
|
|
+int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
|
|
+ struct bch_inode_unpacked *, unsigned);
|
|
|
|
#include "rebalance.h"
|
|
|
|
@@ -295,6 +309,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode
|
|
return io_opts_to_rebalance_opts(c, &io_opts);
|
|
}
|
|
|
|
+#define BCACHEFS_ROOT_SUBVOL_INUM \
|
|
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
|
|
+
|
|
+static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b)
|
|
+{
|
|
+ return a.subvol == b.subvol && a.inum == b.inum;
|
|
+}
|
|
+
|
|
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
|
|
int bch2_delete_dead_inodes(struct bch_fs *);
|
|
|
|
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
|
|
index b99a5bf1a75e..1f00938b1bdc 100644
|
|
--- a/fs/bcachefs/inode_format.h
|
|
+++ b/fs/bcachefs/inode_format.h
|
|
@@ -103,7 +103,8 @@ struct bch_inode_generation {
|
|
x(bi_parent_subvol, 32) \
|
|
x(bi_nocow, 8) \
|
|
x(bi_depth, 32) \
|
|
- x(bi_inodes_32bit, 8)
|
|
+ x(bi_inodes_32bit, 8) \
|
|
+ x(bi_casefold, 8)
|
|
|
|
/* subset of BCH_INODE_FIELDS */
|
|
#define BCH_INODE_OPTS() \
|
|
@@ -117,7 +118,8 @@ struct bch_inode_generation {
|
|
x(background_target, 16) \
|
|
x(erasure_code, 16) \
|
|
x(nocow, 8) \
|
|
- x(inodes_32bit, 8)
|
|
+ x(inodes_32bit, 8) \
|
|
+ x(casefold, 8)
|
|
|
|
enum inode_opt_id {
|
|
#define x(name, ...) \
|
|
@@ -127,6 +129,10 @@ enum inode_opt_id {
|
|
Inode_opt_nr,
|
|
};
|
|
|
|
+/*
|
|
+ * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive -
|
|
+ * for overlayfs
|
|
+ */
|
|
#define BCH_INODE_FLAGS() \
|
|
x(sync, 0) \
|
|
x(immutable, 1) \
|
|
@@ -137,7 +143,8 @@ enum inode_opt_id {
|
|
x(i_sectors_dirty, 6) \
|
|
x(unlinked, 7) \
|
|
x(backptr_untrusted, 8) \
|
|
- x(has_child_snapshot, 9)
|
|
+ x(has_child_snapshot, 9) \
|
|
+ x(has_case_insensitive, 10)
|
|
|
|
/* bits 20+ reserved for packed fields below: */
|
|
|
|
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
|
|
index 5353979117b0..cc07729a4b62 100644
|
|
--- a/fs/bcachefs/io_misc.c
|
|
+++ b/fs/bcachefs/io_misc.c
|
|
@@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
|
|
bch2_bkey_buf_init(&new);
|
|
closure_init_stack(&cl);
|
|
|
|
- k = bch2_btree_iter_peek_slot(iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
return ret;
|
|
@@ -115,7 +115,8 @@ int bch2_extent_fallocate(struct btree_trans *trans,
|
|
bch2_increment_clock(c, sectors_allocated, WRITE);
|
|
if (should_print_err(ret)) {
|
|
struct printbuf buf = PRINTBUF;
|
|
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
|
|
+ lockrestart_do(trans,
|
|
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
|
|
prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
@@ -163,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
|
|
if (ret)
|
|
continue;
|
|
|
|
- bch2_btree_iter_set_snapshot(iter, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
|
|
|
|
/*
|
|
* peek_max() doesn't have ideal semantics for extents:
|
|
*/
|
|
- k = bch2_btree_iter_peek_max(iter, end_pos);
|
|
+ k = bch2_btree_iter_peek_max(trans, iter, end_pos);
|
|
if (!k.k)
|
|
break;
|
|
|
|
@@ -229,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans,
|
|
u64 new_i_size,
|
|
bool warn)
|
|
{
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct bch_inode_unpacked inode_u;
|
|
int ret;
|
|
|
|
@@ -398,7 +399,7 @@ case LOGGED_OP_FINSERT_start:
|
|
if (ret)
|
|
goto err;
|
|
} else {
|
|
- bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
|
|
|
|
ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
|
|
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
@@ -424,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents:
|
|
if (ret)
|
|
goto btree_err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter, snapshot);
|
|
- bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
|
|
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
|
|
|
|
k = insert
|
|
- ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
|
|
- : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
|
|
+ ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
|
|
+ : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
|
|
if ((ret = bkey_err(k)))
|
|
goto btree_err;
|
|
|
|
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
|
|
index aa91fcf51eec..885c5f71a341 100644
|
|
--- a/fs/bcachefs/io_read.c
|
|
+++ b/fs/bcachefs/io_read.c
|
|
@@ -9,6 +9,7 @@
|
|
#include "bcachefs.h"
|
|
#include "alloc_background.h"
|
|
#include "alloc_foreground.h"
|
|
+#include "async_objs.h"
|
|
#include "btree_update.h"
|
|
#include "buckets.h"
|
|
#include "checksum.h"
|
|
@@ -17,6 +18,7 @@
|
|
#include "data_update.h"
|
|
#include "disk_groups.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "io_read.h"
|
|
#include "io_misc.h"
|
|
@@ -25,8 +27,16 @@
|
|
#include "subvolume.h"
|
|
#include "trace.h"
|
|
|
|
+#include <linux/moduleparam.h>
|
|
+#include <linux/random.h>
|
|
#include <linux/sched/mm.h>
|
|
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+static unsigned bch2_read_corrupt_ratio;
|
|
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
|
|
+MODULE_PARM_DESC(read_corrupt_ratio, "");
|
|
+#endif
|
|
+
|
|
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
|
|
|
static bool bch2_target_congested(struct bch_fs *c, u16 target)
|
|
@@ -73,17 +83,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
|
|
|
|
/* Cache promotion on read */
|
|
|
|
-struct promote_op {
|
|
- struct rcu_head rcu;
|
|
- u64 start_time;
|
|
-
|
|
- struct rhash_head hash;
|
|
- struct bpos pos;
|
|
-
|
|
- struct data_update write;
|
|
- struct bio_vec bi_inline_vecs[]; /* must be last */
|
|
-};
|
|
-
|
|
static const struct rhashtable_params bch_promote_params = {
|
|
.head_offset = offsetof(struct promote_op, hash),
|
|
.key_offset = offsetof(struct promote_op, pos),
|
|
@@ -96,6 +95,33 @@ static inline bool have_io_error(struct bch_io_failures *failed)
|
|
return failed && failed->nr;
|
|
}
|
|
|
|
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
|
|
+{
|
|
+ EBUG_ON(rbio->split);
|
|
+
|
|
+ return rbio->data_update
|
|
+ ? container_of(rbio, struct data_update, rbio)
|
|
+ : NULL;
|
|
+}
|
|
+
|
|
+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
|
|
+{
|
|
+ struct data_update *u = rbio_data_update(orig);
|
|
+ if (!u)
|
|
+ return false;
|
|
+
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
|
|
+ unsigned i = 0;
|
|
+ bkey_for_each_ptr(ptrs, ptr) {
|
|
+ if (ptr->dev == dev &&
|
|
+ u->data_opts.rewrite_ptrs & BIT(i))
|
|
+ return true;
|
|
+ i++;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
|
|
struct bpos pos,
|
|
struct bch_io_opts opts,
|
|
@@ -105,7 +131,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
|
|
if (!have_io_error(failed)) {
|
|
BUG_ON(!opts.promote_target);
|
|
|
|
- if (!(flags & BCH_READ_MAY_PROMOTE))
|
|
+ if (!(flags & BCH_READ_may_promote))
|
|
return -BCH_ERR_nopromote_may_not;
|
|
|
|
if (bch2_bkey_has_target(c, k, opts.promote_target))
|
|
@@ -125,98 +151,95 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
|
|
return 0;
|
|
}
|
|
|
|
-static void promote_free(struct bch_fs *c, struct promote_op *op)
|
|
+static noinline void promote_free(struct bch_read_bio *rbio)
|
|
{
|
|
- int ret;
|
|
+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
|
|
+ struct bch_fs *c = rbio->c;
|
|
+
|
|
+ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
|
|
+ bch_promote_params);
|
|
+ BUG_ON(ret);
|
|
+
|
|
+ async_object_list_del(c, promote, op->list_idx);
|
|
|
|
bch2_data_update_exit(&op->write);
|
|
|
|
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
|
|
- bch_promote_params);
|
|
- BUG_ON(ret);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
|
|
kfree_rcu(op, rcu);
|
|
}
|
|
|
|
static void promote_done(struct bch_write_op *wop)
|
|
{
|
|
- struct promote_op *op =
|
|
- container_of(wop, struct promote_op, write.op);
|
|
- struct bch_fs *c = op->write.op.c;
|
|
+ struct promote_op *op = container_of(wop, struct promote_op, write.op);
|
|
+ struct bch_fs *c = op->write.rbio.c;
|
|
|
|
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
|
|
- op->start_time);
|
|
- promote_free(c, op);
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
|
|
+ promote_free(&op->write.rbio);
|
|
}
|
|
|
|
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
|
|
+static void promote_start_work(struct work_struct *work)
|
|
{
|
|
- struct bio *bio = &op->write.op.wbio.bio;
|
|
+ struct promote_op *op = container_of(work, struct promote_op, work);
|
|
|
|
- trace_and_count(op->write.op.c, read_promote, &rbio->bio);
|
|
+ bch2_data_update_read_done(&op->write);
|
|
+}
|
|
|
|
- /* we now own pages: */
|
|
- BUG_ON(!rbio->bounce);
|
|
- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
|
|
+static noinline void promote_start(struct bch_read_bio *rbio)
|
|
+{
|
|
+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
|
|
|
|
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
|
|
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
|
|
- swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
|
|
+ trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
|
|
|
|
- bch2_data_update_read_done(&op->write, rbio->pick.crc);
|
|
+ INIT_WORK(&op->work, promote_start_work);
|
|
+ queue_work(rbio->c->write_ref_wq, &op->work);
|
|
}
|
|
|
|
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
|
|
- enum btree_id btree_id,
|
|
- struct bkey_s_c k,
|
|
- struct bpos pos,
|
|
- struct extent_ptr_decoded *pick,
|
|
- struct bch_io_opts opts,
|
|
- unsigned sectors,
|
|
- struct bch_read_bio **rbio,
|
|
- struct bch_io_failures *failed)
|
|
+static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
|
|
+ enum btree_id btree_id,
|
|
+ struct bkey_s_c k,
|
|
+ struct bpos pos,
|
|
+ struct extent_ptr_decoded *pick,
|
|
+ unsigned sectors,
|
|
+ struct bch_read_bio *orig,
|
|
+ struct bch_io_failures *failed)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct promote_op *op = NULL;
|
|
- struct bio *bio;
|
|
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
|
int ret;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
|
|
- return ERR_PTR(-BCH_ERR_nopromote_no_writes);
|
|
+ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
|
|
|
|
- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
|
|
- if (!op) {
|
|
- ret = -BCH_ERR_nopromote_enomem;
|
|
- goto err;
|
|
- }
|
|
+ if (!have_io_error(failed)) {
|
|
+ update_opts.target = orig->opts.promote_target;
|
|
+ update_opts.extra_replicas = 1;
|
|
+ update_opts.write_flags |= BCH_WRITE_cached;
|
|
+ update_opts.write_flags |= BCH_WRITE_only_specified_devs;
|
|
+ } else {
|
|
+ update_opts.target = orig->opts.foreground_target;
|
|
|
|
- op->start_time = local_clock();
|
|
- op->pos = pos;
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ unsigned ptr_bit = 1;
|
|
+ bkey_for_each_ptr(ptrs, ptr) {
|
|
+ if (bch2_dev_io_failures(failed, ptr->dev) &&
|
|
+ !ptr_being_rewritten(orig, ptr->dev))
|
|
+ update_opts.rewrite_ptrs |= ptr_bit;
|
|
+ ptr_bit <<= 1;
|
|
+ }
|
|
|
|
- /*
|
|
- * We don't use the mempool here because extents that aren't
|
|
- * checksummed or compressed can be too big for the mempool:
|
|
- */
|
|
- *rbio = kzalloc(sizeof(struct bch_read_bio) +
|
|
- sizeof(struct bio_vec) * pages,
|
|
- GFP_KERNEL);
|
|
- if (!*rbio) {
|
|
- ret = -BCH_ERR_nopromote_enomem;
|
|
- goto err;
|
|
+ if (!update_opts.rewrite_ptrs)
|
|
+ return NULL;
|
|
}
|
|
|
|
- rbio_init(&(*rbio)->bio, opts);
|
|
- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
|
|
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
|
|
|
|
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
|
|
+ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
|
|
+ if (!op) {
|
|
ret = -BCH_ERR_nopromote_enomem;
|
|
- goto err;
|
|
+ goto err_put;
|
|
}
|
|
|
|
- (*rbio)->bounce = true;
|
|
- (*rbio)->split = true;
|
|
- (*rbio)->kmalloc = true;
|
|
+ op->start_time = local_clock();
|
|
+ op->pos = pos;
|
|
|
|
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
|
|
bch_promote_params)) {
|
|
@@ -224,68 +247,61 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
|
|
goto err;
|
|
}
|
|
|
|
- bio = &op->write.op.wbio.bio;
|
|
- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
|
|
-
|
|
- struct data_update_opts update_opts = {};
|
|
-
|
|
- if (!have_io_error(failed)) {
|
|
- update_opts.target = opts.promote_target;
|
|
- update_opts.extra_replicas = 1;
|
|
- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
|
|
- } else {
|
|
- update_opts.target = opts.foreground_target;
|
|
-
|
|
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
- unsigned ptr_bit = 1;
|
|
- bkey_for_each_ptr(ptrs, ptr) {
|
|
- if (bch2_dev_io_failures(failed, ptr->dev))
|
|
- update_opts.rewrite_ptrs |= ptr_bit;
|
|
- ptr_bit <<= 1;
|
|
- }
|
|
- }
|
|
+ ret = async_object_list_add(c, promote, op, &op->list_idx);
|
|
+ if (ret < 0)
|
|
+ goto err_remove_hash;
|
|
|
|
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
|
|
writepoint_hashed((unsigned long) current),
|
|
- opts,
|
|
+ &orig->opts,
|
|
update_opts,
|
|
btree_id, k);
|
|
+ op->write.type = BCH_DATA_UPDATE_promote;
|
|
/*
|
|
* possible errors: -BCH_ERR_nocow_lock_blocked,
|
|
* -BCH_ERR_ENOSPC_disk_reservation:
|
|
*/
|
|
- if (ret) {
|
|
- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
|
|
- bch_promote_params));
|
|
- goto err;
|
|
- }
|
|
+ if (ret)
|
|
+ goto err_remove_list;
|
|
|
|
+ rbio_init_fragment(&op->write.rbio.bio, orig);
|
|
+ op->write.rbio.bounce = true;
|
|
+ op->write.rbio.promote = true;
|
|
op->write.op.end_io = promote_done;
|
|
|
|
- return op;
|
|
+ return &op->write.rbio;
|
|
+err_remove_list:
|
|
+ async_object_list_del(c, promote, op->list_idx);
|
|
+err_remove_hash:
|
|
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
|
|
+ bch_promote_params));
|
|
err:
|
|
- if (*rbio)
|
|
- bio_free_pages(&(*rbio)->bio);
|
|
- kfree(*rbio);
|
|
- *rbio = NULL;
|
|
+ bio_free_pages(&op->write.op.wbio.bio);
|
|
/* We may have added to the rhashtable and thus need rcu freeing: */
|
|
kfree_rcu(op, rcu);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
|
|
+err_put:
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
|
|
return ERR_PTR(ret);
|
|
}
|
|
|
|
noinline
|
|
-static struct promote_op *promote_alloc(struct btree_trans *trans,
|
|
+static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
|
|
struct bvec_iter iter,
|
|
struct bkey_s_c k,
|
|
struct extent_ptr_decoded *pick,
|
|
- struct bch_io_opts opts,
|
|
unsigned flags,
|
|
- struct bch_read_bio **rbio,
|
|
+ struct bch_read_bio *orig,
|
|
bool *bounce,
|
|
bool *read_full,
|
|
struct bch_io_failures *failed)
|
|
{
|
|
+ /*
|
|
+ * We're in the retry path, but we don't know what to repair yet, and we
|
|
+ * don't want to do a promote here:
|
|
+ */
|
|
+ if (failed && !failed->nr)
|
|
+ return NULL;
|
|
+
|
|
struct bch_fs *c = trans->c;
|
|
/*
|
|
* if failed != NULL we're not actually doing a promote, we're
|
|
@@ -301,18 +317,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
|
|
struct bpos pos = promote_full
|
|
? bkey_start_pos(k.k)
|
|
: POS(k.k->p.inode, iter.bi_sector);
|
|
- struct promote_op *promote;
|
|
int ret;
|
|
|
|
- ret = should_promote(c, k, pos, opts, flags, failed);
|
|
+ ret = should_promote(c, k, pos, orig->opts, flags, failed);
|
|
if (ret)
|
|
goto nopromote;
|
|
|
|
- promote = __promote_alloc(trans,
|
|
- k.k->type == KEY_TYPE_reflink_v
|
|
- ? BTREE_ID_reflink
|
|
- : BTREE_ID_extents,
|
|
- k, pos, pick, opts, sectors, rbio, failed);
|
|
+ struct bch_read_bio *promote =
|
|
+ __promote_alloc(trans,
|
|
+ k.k->type == KEY_TYPE_reflink_v
|
|
+ ? BTREE_ID_reflink
|
|
+ : BTREE_ID_extents,
|
|
+ k, pos, pick, sectors, orig, failed);
|
|
+ if (!promote)
|
|
+ return NULL;
|
|
+
|
|
ret = PTR_ERR_OR_ZERO(promote);
|
|
if (ret)
|
|
goto nopromote;
|
|
@@ -321,18 +340,38 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
|
|
*read_full = promote_full;
|
|
return promote;
|
|
nopromote:
|
|
- trace_read_nopromote(c, ret);
|
|
+ trace_io_read_nopromote(c, ret);
|
|
return NULL;
|
|
}
|
|
|
|
+void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
|
|
+{
|
|
+ if (!op->write.read_done) {
|
|
+ prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
|
|
+ printbuf_indent_add(out, 2);
|
|
+ bch2_read_bio_to_text(out, op->write.rbio.parent);
|
|
+ printbuf_indent_sub(out, 2);
|
|
+ }
|
|
+
|
|
+ bch2_data_update_to_text(out, &op->write);
|
|
+}
|
|
+
|
|
/* Read */
|
|
|
|
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
|
|
struct bch_read_bio *rbio, struct bpos read_pos)
|
|
{
|
|
- return bch2_inum_offset_err_msg_trans(trans, out,
|
|
- (subvol_inum) { rbio->subvol, read_pos.inode },
|
|
- read_pos.offset << 9);
|
|
+ int ret = lockrestart_do(trans,
|
|
+ bch2_inum_offset_err_msg_trans(trans, out,
|
|
+ (subvol_inum) { rbio->subvol, read_pos.inode },
|
|
+ read_pos.offset << 9));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (rbio->data_update)
|
|
+ prt_str(out, "(internal move) ");
|
|
+
|
|
+ return 0;
|
|
}
|
|
|
|
static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
|
|
@@ -341,10 +380,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
|
|
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
|
|
}
|
|
|
|
-#define READ_RETRY_AVOID 1
|
|
-#define READ_RETRY 2
|
|
-#define READ_ERR 3
|
|
-
|
|
enum rbio_context {
|
|
RBIO_CONTEXT_NULL,
|
|
RBIO_CONTEXT_HIGHPRI,
|
|
@@ -375,20 +410,27 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
|
|
{
|
|
BUG_ON(rbio->bounce && !rbio->split);
|
|
|
|
- if (rbio->promote)
|
|
- promote_free(rbio->c, rbio->promote);
|
|
- rbio->promote = NULL;
|
|
-
|
|
- if (rbio->bounce)
|
|
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
|
|
+ if (rbio->have_ioref) {
|
|
+ struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
|
|
+ }
|
|
|
|
if (rbio->split) {
|
|
struct bch_read_bio *parent = rbio->parent;
|
|
|
|
- if (rbio->kmalloc)
|
|
- kfree(rbio);
|
|
- else
|
|
+ if (unlikely(rbio->promote)) {
|
|
+ if (!rbio->bio.bi_status)
|
|
+ promote_start(rbio);
|
|
+ else
|
|
+ promote_free(rbio);
|
|
+ } else {
|
|
+ async_object_list_del(rbio->c, rbio, rbio->list_idx);
|
|
+
|
|
+ if (rbio->bounce)
|
|
+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
|
|
+
|
|
bio_put(&rbio->bio);
|
|
+ }
|
|
|
|
rbio = parent;
|
|
}
|
|
@@ -408,61 +450,115 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
|
|
bio_endio(&rbio->bio);
|
|
}
|
|
|
|
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
- struct bvec_iter bvec_iter,
|
|
- struct bch_io_failures *failed,
|
|
- unsigned flags)
|
|
+static void get_rbio_extent(struct btree_trans *trans,
|
|
+ struct bch_read_bio *rbio,
|
|
+ struct bkey_buf *sk)
|
|
{
|
|
- struct btree_trans *trans = bch2_trans_get(c);
|
|
struct btree_iter iter;
|
|
- struct bkey_buf sk;
|
|
struct bkey_s_c k;
|
|
- int ret;
|
|
+ int ret = lockrestart_do(trans,
|
|
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
|
|
+ rbio->data_btree, rbio->data_pos, 0)));
|
|
+ if (ret)
|
|
+ return;
|
|
|
|
- flags &= ~BCH_READ_LAST_FRAGMENT;
|
|
- flags |= BCH_READ_MUST_CLONE;
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ bkey_for_each_ptr(ptrs, ptr)
|
|
+ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
|
|
+ bch2_bkey_buf_reassemble(sk, trans->c, k);
|
|
+ break;
|
|
+ }
|
|
|
|
- bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+}
|
|
+
|
|
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
|
|
+ enum btree_id btree, struct bkey_s_c read_k)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+
|
|
+ struct data_update *u = rbio_data_update(rbio);
|
|
+ if (u)
|
|
+ read_k = bkey_i_to_s_c(u->k.k);
|
|
+
|
|
+ u64 flags = bch2_bkey_extent_flags(read_k);
|
|
+ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
|
|
+ return 0;
|
|
+
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
|
|
+ BTREE_ITER_intent);
|
|
+ int ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (!bkey_and_val_eq(k, read_k))
|
|
+ goto out;
|
|
|
|
- bch2_trans_iter_init(trans, &iter, rbio->data_btree,
|
|
- rbio->read_pos, BTREE_ITER_slots);
|
|
+ struct bkey_i *new = bch2_trans_kmalloc(trans,
|
|
+ bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
|
|
+ ret = PTR_ERR_OR_ZERO(new) ?:
|
|
+ (bkey_reassemble(new, k), 0) ?:
|
|
+ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
|
|
+ bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
|
|
+ bch2_trans_commit(trans, NULL, NULL, 0);
|
|
+
|
|
+ /*
|
|
+ * Propagate key change back to data update path, in particular so it
|
|
+ * knows the extent has been poisoned and it's safe to change the
|
|
+ * checksum
|
|
+ */
|
|
+ if (u && !ret)
|
|
+ bch2_bkey_buf_copy(&u->k, c, new);
|
|
+out:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
|
|
+ struct bch_read_bio *rbio,
|
|
+ struct bvec_iter bvec_iter,
|
|
+ struct bch_io_failures *failed,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct data_update *u = container_of(rbio, struct data_update, rbio);
|
|
retry:
|
|
bch2_trans_begin(trans);
|
|
- rbio->bio.bi_status = 0;
|
|
|
|
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = lockrestart_do(trans,
|
|
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
|
|
+ u->btree_id, bkey_start_pos(&u->k.k->k),
|
|
+ 0)));
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_bkey_buf_reassemble(&sk, c, k);
|
|
- k = bkey_i_to_s_c(sk.k);
|
|
-
|
|
- if (!bch2_bkey_matches_ptr(c, k,
|
|
- rbio->pick.ptr,
|
|
- rbio->data_pos.offset -
|
|
- rbio->pick.crc.offset)) {
|
|
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
|
|
/* extent we wanted to read no longer exists: */
|
|
- rbio->hole = true;
|
|
- goto out;
|
|
+ rbio->ret = -BCH_ERR_data_read_key_overwritten;
|
|
+ goto err;
|
|
}
|
|
|
|
ret = __bch2_read_extent(trans, rbio, bvec_iter,
|
|
- rbio->read_pos,
|
|
- rbio->data_btree,
|
|
- k, 0, failed, flags);
|
|
- if (ret == READ_RETRY)
|
|
- goto retry;
|
|
- if (ret)
|
|
- goto err;
|
|
-out:
|
|
- bch2_rbio_done(rbio);
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- bch2_trans_put(trans);
|
|
- bch2_bkey_buf_exit(&sk, c);
|
|
- return;
|
|
+ bkey_start_pos(&u->k.k->k),
|
|
+ u->btree_id,
|
|
+ bkey_i_to_s_c(u->k.k),
|
|
+ 0, failed, flags, -1);
|
|
err:
|
|
- rbio->bio.bi_status = BLK_STS_IOERR;
|
|
- goto out;
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
|
|
+ bch2_err_matches(ret, BCH_ERR_data_read_retry))
|
|
+ goto retry;
|
|
+
|
|
+ if (ret) {
|
|
+ rbio->bio.bi_status = BLK_STS_IOERR;
|
|
+ rbio->ret = ret;
|
|
+ }
|
|
+
|
|
+ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
|
|
+ return ret;
|
|
}
|
|
|
|
static void bch2_rbio_retry(struct work_struct *work)
|
|
@@ -478,68 +574,105 @@ static void bch2_rbio_retry(struct work_struct *work)
|
|
};
|
|
struct bch_io_failures failed = { .nr = 0 };
|
|
|
|
- trace_and_count(c, read_retry, &rbio->bio);
|
|
+ struct btree_trans *trans = bch2_trans_get(c);
|
|
+
|
|
+ struct bkey_buf sk;
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bkey_init(&sk.k->k);
|
|
+
|
|
+ trace_io_read_retry(&rbio->bio);
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
|
|
+ bvec_iter_sectors(rbio->bvec_iter));
|
|
|
|
- if (rbio->retry == READ_RETRY_AVOID)
|
|
- bch2_mark_io_failure(&failed, &rbio->pick);
|
|
+ get_rbio_extent(trans, rbio, &sk);
|
|
|
|
- rbio->bio.bi_status = 0;
|
|
+ if (!bkey_deleted(&sk.k->k) &&
|
|
+ bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
|
|
+ bch2_mark_io_failure(&failed, &rbio->pick,
|
|
+ rbio->ret == -BCH_ERR_data_read_retry_csum_err);
|
|
+
|
|
+ if (!rbio->split) {
|
|
+ rbio->bio.bi_status = 0;
|
|
+ rbio->ret = 0;
|
|
+ }
|
|
+
|
|
+ unsigned subvol = rbio->subvol;
|
|
+ struct bpos read_pos = rbio->read_pos;
|
|
|
|
rbio = bch2_rbio_free(rbio);
|
|
|
|
- flags |= BCH_READ_IN_RETRY;
|
|
- flags &= ~BCH_READ_MAY_PROMOTE;
|
|
+ flags |= BCH_READ_in_retry;
|
|
+ flags &= ~BCH_READ_may_promote;
|
|
+ flags &= ~BCH_READ_last_fragment;
|
|
+ flags |= BCH_READ_must_clone;
|
|
|
|
- if (flags & BCH_READ_NODECODE) {
|
|
- bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
|
|
- } else {
|
|
- flags &= ~BCH_READ_LAST_FRAGMENT;
|
|
- flags |= BCH_READ_MUST_CLONE;
|
|
+ int ret = rbio->data_update
|
|
+ ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
|
|
+ : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
|
|
|
|
- __bch2_read(c, rbio, iter, inum, &failed, flags);
|
|
+ if (ret) {
|
|
+ rbio->ret = ret;
|
|
+ rbio->bio.bi_status = BLK_STS_IOERR;
|
|
}
|
|
-}
|
|
|
|
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
|
|
- blk_status_t error)
|
|
-{
|
|
- rbio->retry = retry;
|
|
+ if (failed.nr || ret) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
|
|
- if (rbio->flags & BCH_READ_IN_RETRY)
|
|
- return;
|
|
+ lockrestart_do(trans,
|
|
+ bch2_inum_offset_err_msg_trans(trans, &buf,
|
|
+ (subvol_inum) { subvol, read_pos.inode },
|
|
+ read_pos.offset << 9));
|
|
+ if (rbio->data_update)
|
|
+ prt_str(&buf, "(internal move) ");
|
|
|
|
- if (retry == READ_ERR) {
|
|
- rbio = bch2_rbio_free(rbio);
|
|
+ prt_str(&buf, "data read error, ");
|
|
+ if (!ret)
|
|
+ prt_str(&buf, "successful retry");
|
|
+ else
|
|
+ prt_str(&buf, bch2_err_str(ret));
|
|
+ prt_newline(&buf);
|
|
|
|
- rbio->bio.bi_status = error;
|
|
- bch2_rbio_done(rbio);
|
|
- } else {
|
|
- bch2_rbio_punt(rbio, bch2_rbio_retry,
|
|
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
|
|
+ if (!bkey_deleted(&sk.k->k)) {
|
|
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
|
|
+ prt_newline(&buf);
|
|
+ }
|
|
+
|
|
+ bch2_io_failures_to_text(&buf, c, &failed);
|
|
+
|
|
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
}
|
|
+
|
|
+ bch2_rbio_done(rbio);
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+ bch2_trans_put(trans);
|
|
}
|
|
|
|
-static void bch2_read_io_err(struct work_struct *work)
|
|
+static void bch2_rbio_error(struct bch_read_bio *rbio,
|
|
+ int ret, blk_status_t blk_error)
|
|
{
|
|
- struct bch_read_bio *rbio =
|
|
- container_of(work, struct bch_read_bio, work);
|
|
- struct bio *bio = &rbio->bio;
|
|
- struct bch_fs *c = rbio->c;
|
|
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
|
|
- struct printbuf buf = PRINTBUF;
|
|
+ BUG_ON(ret >= 0);
|
|
|
|
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
|
|
- prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
|
|
+ rbio->ret = ret;
|
|
+ rbio->bio.bi_status = blk_error;
|
|
|
|
- if (ca) {
|
|
- bch2_io_error(ca, BCH_MEMBER_ERROR_read);
|
|
- bch_err_ratelimited(ca, "%s", buf.buf);
|
|
+ bch2_rbio_parent(rbio)->saw_error = true;
|
|
+
|
|
+ if (rbio->flags & BCH_READ_in_retry)
|
|
+ return;
|
|
+
|
|
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
|
|
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
|
|
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
|
|
} else {
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- }
|
|
+ rbio = bch2_rbio_free(rbio);
|
|
|
|
- printbuf_exit(&buf);
|
|
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
|
|
+ rbio->ret = ret;
|
|
+ rbio->bio.bi_status = blk_error;
|
|
+
|
|
+ bch2_rbio_done(rbio);
|
|
+ }
|
|
}
|
|
|
|
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
|
|
@@ -605,33 +738,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
|
|
__bch2_rbio_narrow_crcs(trans, rbio));
|
|
}
|
|
|
|
-static void bch2_read_csum_err(struct work_struct *work)
|
|
-{
|
|
- struct bch_read_bio *rbio =
|
|
- container_of(work, struct bch_read_bio, work);
|
|
- struct bch_fs *c = rbio->c;
|
|
- struct bio *src = &rbio->bio;
|
|
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
|
|
- struct nonce nonce = extent_nonce(rbio->version, crc);
|
|
- struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
|
|
- struct printbuf buf = PRINTBUF;
|
|
-
|
|
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
|
|
- prt_str(&buf, "data ");
|
|
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
|
|
-
|
|
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
|
|
- if (ca) {
|
|
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
|
|
- bch_err_ratelimited(ca, "%s", buf.buf);
|
|
- } else {
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- }
|
|
-
|
|
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
- printbuf_exit(&buf);
|
|
-}
|
|
-
|
|
static void bch2_read_decompress_err(struct work_struct *work)
|
|
{
|
|
struct bch_read_bio *rbio =
|
|
@@ -648,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
|
|
else
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
|
|
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
@@ -668,7 +774,7 @@ static void bch2_read_decrypt_err(struct work_struct *work)
|
|
else
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
|
|
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
@@ -678,9 +784,11 @@ static void __bch2_read_endio(struct work_struct *work)
|
|
struct bch_read_bio *rbio =
|
|
container_of(work, struct bch_read_bio, work);
|
|
struct bch_fs *c = rbio->c;
|
|
- struct bio *src = &rbio->bio;
|
|
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
|
|
- struct bvec_iter dst_iter = rbio->bvec_iter;
|
|
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
|
|
+ struct bch_read_bio *parent = bch2_rbio_parent(rbio);
|
|
+ struct bio *src = &rbio->bio;
|
|
+ struct bio *dst = &parent->bio;
|
|
+ struct bvec_iter dst_iter = rbio->bvec_iter;
|
|
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
|
|
struct nonce nonce = extent_nonce(rbio->version, crc);
|
|
unsigned nofs_flags;
|
|
@@ -698,8 +806,26 @@ static void __bch2_read_endio(struct work_struct *work)
|
|
src->bi_iter = rbio->bvec_iter;
|
|
}
|
|
|
|
+ bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
|
|
+
|
|
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
|
|
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
|
|
+ bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
|
|
+
|
|
+ /*
|
|
+ * Checksum error: if the bio wasn't bounced, we may have been
|
|
+ * reading into buffers owned by userspace (that userspace can
|
|
+ * scribble over) - retry the read, bouncing it this time:
|
|
+ */
|
|
+ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
|
|
+ rbio->flags |= BCH_READ_must_bounce;
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
|
|
+ BLK_STS_IOERR);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
|
|
+
|
|
+ if (!csum_good)
|
|
goto csum_err;
|
|
|
|
/*
|
|
@@ -712,32 +838,40 @@ static void __bch2_read_endio(struct work_struct *work)
|
|
if (unlikely(rbio->narrow_crcs))
|
|
bch2_rbio_narrow_crcs(rbio);
|
|
|
|
- if (rbio->flags & BCH_READ_NODECODE)
|
|
- goto nodecode;
|
|
+ if (likely(!parent->data_update)) {
|
|
+ /* Adjust crc to point to subset of data we want: */
|
|
+ crc.offset += rbio->offset_into_extent;
|
|
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
|
|
|
|
- /* Adjust crc to point to subset of data we want: */
|
|
- crc.offset += rbio->offset_into_extent;
|
|
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
|
|
+ if (crc_is_compressed(crc)) {
|
|
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
+ if (ret)
|
|
+ goto decrypt_err;
|
|
|
|
- if (crc_is_compressed(crc)) {
|
|
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
- if (ret)
|
|
- goto decrypt_err;
|
|
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
|
|
+ !c->opts.no_data_io)
|
|
+ goto decompression_err;
|
|
+ } else {
|
|
+ /* don't need to decrypt the entire bio: */
|
|
+ nonce = nonce_add(nonce, crc.offset << 9);
|
|
+ bio_advance(src, crc.offset << 9);
|
|
|
|
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
|
|
- !c->opts.no_data_io)
|
|
- goto decompression_err;
|
|
- } else {
|
|
- /* don't need to decrypt the entire bio: */
|
|
- nonce = nonce_add(nonce, crc.offset << 9);
|
|
- bio_advance(src, crc.offset << 9);
|
|
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
|
|
+ src->bi_iter.bi_size = dst_iter.bi_size;
|
|
|
|
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
|
|
- src->bi_iter.bi_size = dst_iter.bi_size;
|
|
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
+ if (ret)
|
|
+ goto decrypt_err;
|
|
|
|
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
- if (ret)
|
|
- goto decrypt_err;
|
|
+ if (rbio->bounce) {
|
|
+ struct bvec_iter src_iter = src->bi_iter;
|
|
+
|
|
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ if (rbio->split)
|
|
+ rbio->parent->pick = rbio->pick;
|
|
|
|
if (rbio->bounce) {
|
|
struct bvec_iter src_iter = src->bi_iter;
|
|
@@ -754,12 +888,9 @@ static void __bch2_read_endio(struct work_struct *work)
|
|
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
if (ret)
|
|
goto decrypt_err;
|
|
-
|
|
- promote_start(rbio->promote, rbio);
|
|
- rbio->promote = NULL;
|
|
}
|
|
-nodecode:
|
|
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
|
|
+
|
|
+ if (likely(!(rbio->flags & BCH_READ_in_retry))) {
|
|
rbio = bch2_rbio_free(rbio);
|
|
bch2_rbio_done(rbio);
|
|
}
|
|
@@ -767,18 +898,7 @@ static void __bch2_read_endio(struct work_struct *work)
|
|
memalloc_nofs_restore(nofs_flags);
|
|
return;
|
|
csum_err:
|
|
- /*
|
|
- * Checksum error: if the bio wasn't bounced, we may have been
|
|
- * reading into buffers owned by userspace (that userspace can
|
|
- * scribble over) - retry the read, bouncing it this time:
|
|
- */
|
|
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
|
|
- rbio->flags |= BCH_READ_MUST_BOUNCE;
|
|
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
|
|
- goto out;
|
|
- }
|
|
-
|
|
- bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
|
|
goto out;
|
|
decompression_err:
|
|
bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
|
|
@@ -797,27 +917,25 @@ static void bch2_read_endio(struct bio *bio)
|
|
struct workqueue_struct *wq = NULL;
|
|
enum rbio_context context = RBIO_CONTEXT_NULL;
|
|
|
|
- if (rbio->have_ioref) {
|
|
- bch2_latency_acct(ca, rbio->submit_time, READ);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- }
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
|
+ rbio->submit_time, !bio->bi_status);
|
|
|
|
if (!rbio->split)
|
|
rbio->bio.bi_end_io = rbio->end_io;
|
|
|
|
if (unlikely(bio->bi_status)) {
|
|
- bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
|
|
return;
|
|
}
|
|
|
|
- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
|
|
+ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
|
|
(ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
|
|
- trace_and_count(c, read_reuse_race, &rbio->bio);
|
|
+ trace_and_count(c, io_read_reuse_race, &rbio->bio);
|
|
|
|
- if (rbio->flags & BCH_READ_RETRY_IF_STALE)
|
|
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
|
|
+ if (rbio->flags & BCH_READ_retry_if_stale)
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
|
|
else
|
|
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
|
|
return;
|
|
}
|
|
|
|
@@ -856,7 +974,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
|
|
|
|
prt_printf(&buf, "memory gen: %u", gen);
|
|
|
|
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
|
|
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
|
|
if (!ret) {
|
|
prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
@@ -883,15 +1001,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
struct bvec_iter iter, struct bpos read_pos,
|
|
enum btree_id data_btree, struct bkey_s_c k,
|
|
unsigned offset_into_extent,
|
|
- struct bch_io_failures *failed, unsigned flags)
|
|
+ struct bch_io_failures *failed, unsigned flags, int dev)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct extent_ptr_decoded pick;
|
|
struct bch_read_bio *rbio = NULL;
|
|
- struct promote_op *promote = NULL;
|
|
bool bounce = false, read_full = false, narrow_crcs = false;
|
|
struct bpos data_pos = bkey_start_pos(k.k);
|
|
- int pick_ret;
|
|
+ struct data_update *u = rbio_data_update(orig);
|
|
+ int ret = 0;
|
|
|
|
if (bkey_extent_is_inline_data(k.k)) {
|
|
unsigned bytes = min_t(unsigned, iter.bi_size,
|
|
@@ -902,19 +1020,35 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
swap(iter.bi_size, bytes);
|
|
bio_advance_iter(&orig->bio, &iter, bytes);
|
|
zero_fill_bio_iter(&orig->bio, iter);
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
|
|
+ bvec_iter_sectors(iter));
|
|
goto out_read_done;
|
|
}
|
|
+
|
|
+ if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
|
|
+ !orig->data_update)
|
|
+ return -BCH_ERR_extent_poisoned;
|
|
retry_pick:
|
|
- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
|
|
+ ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
|
|
|
|
/* hole or reservation - just zero fill: */
|
|
- if (!pick_ret)
|
|
+ if (!ret)
|
|
goto hole;
|
|
|
|
- if (unlikely(pick_ret < 0)) {
|
|
+ if (unlikely(ret < 0)) {
|
|
+ if (ret == -BCH_ERR_data_read_csum_err) {
|
|
+ int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
|
|
+ if (ret2) {
|
|
+ ret = ret2;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ trace_and_count(c, io_read_fail_and_poison, &orig->bio);
|
|
+ }
|
|
+
|
|
struct printbuf buf = PRINTBUF;
|
|
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
|
|
- prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
|
|
+ prt_printf(&buf, "%s\n ", bch2_err_str(ret));
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
@@ -922,7 +1056,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
goto err;
|
|
}
|
|
|
|
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
|
|
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
|
|
+ !c->chacha20_key_set) {
|
|
struct printbuf buf = PRINTBUF;
|
|
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
|
|
prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
|
|
@@ -930,10 +1065,12 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
+ ret = -BCH_ERR_data_read_no_encryption_key;
|
|
goto err;
|
|
}
|
|
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
|
|
+ BCH_DEV_READ_REF_io_read);
|
|
|
|
/*
|
|
* Stale dirty pointers are treated as IO errors, but @failed isn't
|
|
@@ -941,56 +1078,58 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
* retry path, don't check here, it'll be caught in bch2_read_endio()
|
|
* and we'll end up in the retry path:
|
|
*/
|
|
- if ((flags & BCH_READ_IN_RETRY) &&
|
|
+ if ((flags & BCH_READ_in_retry) &&
|
|
!pick.ptr.cached &&
|
|
ca &&
|
|
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
|
|
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
|
|
- bch2_mark_io_failure(failed, &pick);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ bch2_mark_io_failure(failed, &pick, false);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
|
|
goto retry_pick;
|
|
}
|
|
|
|
- if (flags & BCH_READ_NODECODE) {
|
|
+ if (likely(!u)) {
|
|
+ if (!(flags & BCH_READ_last_fragment) ||
|
|
+ bio_flagged(&orig->bio, BIO_CHAIN))
|
|
+ flags |= BCH_READ_must_clone;
|
|
+
|
|
+ narrow_crcs = !(flags & BCH_READ_in_retry) &&
|
|
+ bch2_can_narrow_extent_crcs(k, pick.crc);
|
|
+
|
|
+ if (narrow_crcs && (flags & BCH_READ_user_mapped))
|
|
+ flags |= BCH_READ_must_bounce;
|
|
+
|
|
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
|
|
+
|
|
+ if (crc_is_compressed(pick.crc) ||
|
|
+ (pick.crc.csum_type != BCH_CSUM_none &&
|
|
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
|
|
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
|
|
+ (flags & BCH_READ_user_mapped)) ||
|
|
+ (flags & BCH_READ_must_bounce)))) {
|
|
+ read_full = true;
|
|
+ bounce = true;
|
|
+ }
|
|
+ } else {
|
|
/*
|
|
* can happen if we retry, and the extent we were going to read
|
|
* has been merged in the meantime:
|
|
*/
|
|
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
|
|
+ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
|
|
if (ca)
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- goto hole;
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_io_read);
|
|
+ rbio->ret = -BCH_ERR_data_read_buffer_too_small;
|
|
+ goto out_read_done;
|
|
}
|
|
|
|
iter.bi_size = pick.crc.compressed_size << 9;
|
|
- goto get_bio;
|
|
- }
|
|
-
|
|
- if (!(flags & BCH_READ_LAST_FRAGMENT) ||
|
|
- bio_flagged(&orig->bio, BIO_CHAIN))
|
|
- flags |= BCH_READ_MUST_CLONE;
|
|
-
|
|
- narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
|
|
- bch2_can_narrow_extent_crcs(k, pick.crc);
|
|
-
|
|
- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
|
|
- flags |= BCH_READ_MUST_BOUNCE;
|
|
-
|
|
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
|
|
-
|
|
- if (crc_is_compressed(pick.crc) ||
|
|
- (pick.crc.csum_type != BCH_CSUM_none &&
|
|
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
|
|
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
|
|
- (flags & BCH_READ_USER_MAPPED)) ||
|
|
- (flags & BCH_READ_MUST_BOUNCE)))) {
|
|
read_full = true;
|
|
- bounce = true;
|
|
}
|
|
|
|
if (orig->opts.promote_target || have_io_error(failed))
|
|
- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
|
|
- &rbio, &bounce, &read_full, failed);
|
|
+ rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
|
|
+ &bounce, &read_full, failed);
|
|
|
|
if (!read_full) {
|
|
EBUG_ON(crc_is_compressed(pick.crc));
|
|
@@ -1009,7 +1148,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
pick.crc.offset = 0;
|
|
pick.crc.live_size = bvec_iter_sectors(iter);
|
|
}
|
|
-get_bio:
|
|
+
|
|
if (rbio) {
|
|
/*
|
|
* promote already allocated bounce rbio:
|
|
@@ -1024,17 +1163,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
} else if (bounce) {
|
|
unsigned sectors = pick.crc.compressed_size;
|
|
|
|
- rbio = rbio_init(bio_alloc_bioset(NULL,
|
|
+ rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
|
|
DIV_ROUND_UP(sectors, PAGE_SECTORS),
|
|
0,
|
|
GFP_NOFS,
|
|
&c->bio_read_split),
|
|
- orig->opts);
|
|
+ orig);
|
|
|
|
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
|
|
rbio->bounce = true;
|
|
- rbio->split = true;
|
|
- } else if (flags & BCH_READ_MUST_CLONE) {
|
|
+ } else if (flags & BCH_READ_must_clone) {
|
|
/*
|
|
* Have to clone if there were any splits, due to error
|
|
* reporting issues (if a split errored, and retrying didn't
|
|
@@ -1043,11 +1181,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
* from the whole bio, in which case we don't want to retry and
|
|
* lose the error)
|
|
*/
|
|
- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
|
|
+ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
|
|
&c->bio_read_split),
|
|
- orig->opts);
|
|
+ orig);
|
|
rbio->bio.bi_iter = iter;
|
|
- rbio->split = true;
|
|
} else {
|
|
rbio = orig;
|
|
rbio->bio.bi_iter = iter;
|
|
@@ -1056,77 +1193,70 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
|
|
EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
|
|
|
|
- rbio->c = c;
|
|
rbio->submit_time = local_clock();
|
|
- if (rbio->split)
|
|
- rbio->parent = orig;
|
|
- else
|
|
+ if (!rbio->split)
|
|
rbio->end_io = orig->bio.bi_end_io;
|
|
rbio->bvec_iter = iter;
|
|
rbio->offset_into_extent= offset_into_extent;
|
|
rbio->flags = flags;
|
|
rbio->have_ioref = ca != NULL;
|
|
rbio->narrow_crcs = narrow_crcs;
|
|
- rbio->hole = 0;
|
|
- rbio->retry = 0;
|
|
+ rbio->ret = 0;
|
|
rbio->context = 0;
|
|
- /* XXX: only initialize this if needed */
|
|
- rbio->devs_have = bch2_bkey_devs(k);
|
|
rbio->pick = pick;
|
|
rbio->subvol = orig->subvol;
|
|
rbio->read_pos = read_pos;
|
|
rbio->data_btree = data_btree;
|
|
rbio->data_pos = data_pos;
|
|
rbio->version = k.k->bversion;
|
|
- rbio->promote = promote;
|
|
INIT_WORK(&rbio->work, NULL);
|
|
|
|
- if (flags & BCH_READ_NODECODE)
|
|
- orig->pick = pick;
|
|
-
|
|
rbio->bio.bi_opf = orig->bio.bi_opf;
|
|
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
|
|
rbio->bio.bi_end_io = bch2_read_endio;
|
|
|
|
+ async_object_list_add(c, rbio, rbio, &rbio->list_idx);
|
|
+
|
|
+ /* XXX: also nvme read recovery level */
|
|
+ if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
|
|
+ rbio->bio.bi_opf |= REQ_FUA;
|
|
+
|
|
if (rbio->bounce)
|
|
- trace_and_count(c, read_bounce, &rbio->bio);
|
|
+ trace_and_count(c, io_read_bounce, &rbio->bio);
|
|
|
|
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
|
|
+ if (!u)
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
|
|
+ else
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
|
|
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
|
|
|
|
/*
|
|
* If it's being moved internally, we don't want to flag it as a cache
|
|
* hit:
|
|
*/
|
|
- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
|
|
+ if (ca && pick.ptr.cached && !u)
|
|
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
|
|
PTR_BUCKET_NR(ca, &pick.ptr), READ);
|
|
|
|
- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
|
|
+ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
|
|
bio_inc_remaining(&orig->bio);
|
|
- trace_and_count(c, read_split, &orig->bio);
|
|
+ trace_and_count(c, io_read_split, &orig->bio);
|
|
}
|
|
|
|
/*
|
|
* Unlock the iterator while the btree node's lock is still in
|
|
* cache, before doing the IO:
|
|
*/
|
|
- if (!(flags & BCH_READ_IN_RETRY))
|
|
+ if (!(flags & BCH_READ_in_retry))
|
|
bch2_trans_unlock(trans);
|
|
else
|
|
bch2_trans_unlock_long(trans);
|
|
|
|
- if (!rbio->pick.idx) {
|
|
+ if (likely(!rbio->pick.do_ec_reconstruct)) {
|
|
if (unlikely(!rbio->have_ioref)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
|
|
- prt_printf(&buf, "no device to read from:\n ");
|
|
- bch2_bkey_val_to_text(&buf, c, k);
|
|
-
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
-
|
|
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ bch2_rbio_error(rbio,
|
|
+ -BCH_ERR_data_read_retry_device_offline,
|
|
+ BLK_STS_IOERR);
|
|
goto out;
|
|
}
|
|
|
|
@@ -1135,10 +1265,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
|
|
|
|
if (unlikely(c->opts.no_data_io)) {
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ if (likely(!(flags & BCH_READ_in_retry)))
|
|
bio_endio(&rbio->bio);
|
|
} else {
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ if (likely(!(flags & BCH_READ_in_retry)))
|
|
submit_bio(&rbio->bio);
|
|
else
|
|
submit_bio_wait(&rbio->bio);
|
|
@@ -1152,15 +1282,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
} else {
|
|
/* Attempting reconstruct read: */
|
|
if (bch2_ec_read_extent(trans, rbio, k)) {
|
|
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
|
|
+ BLK_STS_IOERR);
|
|
goto out;
|
|
}
|
|
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ if (likely(!(flags & BCH_READ_in_retry)))
|
|
bio_endio(&rbio->bio);
|
|
}
|
|
out:
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
|
|
+ if (likely(!(flags & BCH_READ_in_retry))) {
|
|
return 0;
|
|
} else {
|
|
bch2_trans_unlock(trans);
|
|
@@ -1170,54 +1301,57 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
rbio->context = RBIO_CONTEXT_UNBOUND;
|
|
bch2_read_endio(&rbio->bio);
|
|
|
|
- ret = rbio->retry;
|
|
+ ret = rbio->ret;
|
|
rbio = bch2_rbio_free(rbio);
|
|
|
|
- if (ret == READ_RETRY_AVOID) {
|
|
- bch2_mark_io_failure(failed, &pick);
|
|
- ret = READ_RETRY;
|
|
- }
|
|
-
|
|
- if (!ret)
|
|
- goto out_read_done;
|
|
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
|
|
+ bch2_mark_io_failure(failed, &pick,
|
|
+ ret == -BCH_ERR_data_read_retry_csum_err);
|
|
|
|
return ret;
|
|
}
|
|
|
|
err:
|
|
- if (flags & BCH_READ_IN_RETRY)
|
|
- return READ_ERR;
|
|
+ if (flags & BCH_READ_in_retry)
|
|
+ return ret;
|
|
|
|
- orig->bio.bi_status = BLK_STS_IOERR;
|
|
+ orig->bio.bi_status = BLK_STS_IOERR;
|
|
+ orig->ret = ret;
|
|
goto out_read_done;
|
|
|
|
hole:
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
|
|
+ bvec_iter_sectors(iter));
|
|
/*
|
|
- * won't normally happen in the BCH_READ_NODECODE
|
|
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
|
|
- * to read no longer exists we have to signal that:
|
|
+ * won't normally happen in the data update (bch2_move_extent()) path,
|
|
+ * but if we retry and the extent we wanted to read no longer exists we
|
|
+ * have to signal that:
|
|
*/
|
|
- if (flags & BCH_READ_NODECODE)
|
|
- orig->hole = true;
|
|
+ if (u)
|
|
+ orig->ret = -BCH_ERR_data_read_key_overwritten;
|
|
|
|
zero_fill_bio_iter(&orig->bio, iter);
|
|
out_read_done:
|
|
- if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ if ((flags & BCH_READ_last_fragment) &&
|
|
+ !(flags & BCH_READ_in_retry))
|
|
bch2_rbio_done(orig);
|
|
return 0;
|
|
}
|
|
|
|
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
- struct bvec_iter bvec_iter, subvol_inum inum,
|
|
- struct bch_io_failures *failed, unsigned flags)
|
|
+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
|
|
+ struct bvec_iter bvec_iter, subvol_inum inum,
|
|
+ struct bch_io_failures *failed,
|
|
+ struct bkey_buf *prev_read,
|
|
+ unsigned flags)
|
|
{
|
|
- struct btree_trans *trans = bch2_trans_get(c);
|
|
+ struct bch_fs *c = trans->c;
|
|
struct btree_iter iter;
|
|
struct bkey_buf sk;
|
|
struct bkey_s_c k;
|
|
+ enum btree_id data_btree;
|
|
int ret;
|
|
|
|
- BUG_ON(flags & BCH_READ_NODECODE);
|
|
+ EBUG_ON(rbio->data_update);
|
|
|
|
bch2_bkey_buf_init(&sk);
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
|
|
@@ -1225,7 +1359,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
BTREE_ITER_slots);
|
|
|
|
while (1) {
|
|
- enum btree_id data_btree = BTREE_ID_extents;
|
|
+ data_btree = BTREE_ID_extents;
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
@@ -1234,12 +1368,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&iter, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
|
|
|
|
- bch2_btree_iter_set_pos(&iter,
|
|
+ bch2_btree_iter_set_pos(trans, &iter,
|
|
POS(inum.inum, bvec_iter.bi_sector));
|
|
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1257,6 +1391,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
|
|
k = bkey_i_to_s_c(sk.k);
|
|
|
|
+ if (unlikely(flags & BCH_READ_in_retry)) {
|
|
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
|
|
+ failed->nr = 0;
|
|
+ bch2_bkey_buf_copy(prev_read, c, sk.k);
|
|
+ }
|
|
+
|
|
/*
|
|
* With indirect extents, the amount of data to read is the min
|
|
* of the original extent and the indirect extent:
|
|
@@ -1267,42 +1407,86 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
swap(bvec_iter.bi_size, bytes);
|
|
|
|
if (bvec_iter.bi_size == bytes)
|
|
- flags |= BCH_READ_LAST_FRAGMENT;
|
|
+ flags |= BCH_READ_last_fragment;
|
|
|
|
ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
|
|
data_btree, k,
|
|
- offset_into_extent, failed, flags);
|
|
+ offset_into_extent, failed, flags, -1);
|
|
+ swap(bvec_iter.bi_size, bytes);
|
|
+
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ if (flags & BCH_READ_last_fragment)
|
|
break;
|
|
|
|
- swap(bvec_iter.bi_size, bytes);
|
|
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
|
|
err:
|
|
+ if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
|
|
+ flags |= BCH_READ_must_bounce;
|
|
+
|
|
if (ret &&
|
|
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
|
|
- ret != READ_RETRY &&
|
|
- ret != READ_RETRY_AVOID)
|
|
+ !bch2_err_matches(ret, BCH_ERR_data_read_retry))
|
|
break;
|
|
}
|
|
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
+ if (unlikely(ret)) {
|
|
+ if (ret != -BCH_ERR_extent_poisoned) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ lockrestart_do(trans,
|
|
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum,
|
|
+ bvec_iter.bi_sector << 9));
|
|
+ prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
|
|
+ bch_err_ratelimited(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
|
|
- if (ret) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
|
|
- prt_printf(&buf, "read error %i from btree lookup", ret);
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
+ rbio->bio.bi_status = BLK_STS_IOERR;
|
|
+ rbio->ret = ret;
|
|
|
|
- rbio->bio.bi_status = BLK_STS_IOERR;
|
|
- bch2_rbio_done(rbio);
|
|
+ if (!(flags & BCH_READ_in_retry))
|
|
+ bch2_rbio_done(rbio);
|
|
}
|
|
|
|
- bch2_trans_put(trans);
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
bch2_bkey_buf_exit(&sk, c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static const char * const bch2_read_bio_flags[] = {
|
|
+#define x(n) #n,
|
|
+ BCH_READ_FLAGS()
|
|
+#undef x
|
|
+ NULL
|
|
+};
|
|
+
|
|
+void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
|
|
+{
|
|
+ u64 now = local_clock();
|
|
+ prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
|
|
+ prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);
|
|
+
|
|
+ if (!rbio->split)
|
|
+ prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
|
|
+ else
|
|
+ prt_printf(out, "parent:\t%px\n", rbio->parent);
|
|
+
|
|
+ prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);
|
|
+
|
|
+ prt_printf(out, "promote:\t%u\n", rbio->promote);
|
|
+ prt_printf(out, "bounce:\t%u\n", rbio->bounce);
|
|
+ prt_printf(out, "split:\t%u\n", rbio->split);
|
|
+ prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
|
|
+ prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
|
|
+ prt_printf(out, "context:\t%u\n", rbio->context);
|
|
+ prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret));
|
|
+
|
|
+ prt_printf(out, "flags:\t");
|
|
+ bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
|
|
+ prt_newline(out);
|
|
+
|
|
+ bch2_bio_to_text(out, &rbio->bio);
|
|
}
|
|
|
|
void bch2_fs_io_read_exit(struct bch_fs *c)
|
|
@@ -1311,10 +1495,18 @@ void bch2_fs_io_read_exit(struct bch_fs *c)
|
|
rhashtable_destroy(&c->promote_table);
|
|
bioset_exit(&c->bio_read_split);
|
|
bioset_exit(&c->bio_read);
|
|
+ mempool_exit(&c->bio_bounce_pages);
|
|
}
|
|
|
|
int bch2_fs_io_read_init(struct bch_fs *c)
|
|
{
|
|
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
|
|
+ max_t(unsigned,
|
|
+ c->opts.btree_node_size,
|
|
+ c->opts.encoded_extent_max) /
|
|
+ PAGE_SIZE, 0))
|
|
+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
|
|
+
|
|
if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
|
|
BIOSET_NEED_BVECS))
|
|
return -BCH_ERR_ENOMEM_bio_read_init;
|
|
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
|
|
index a82e8a94ccb6..c08b9c047b3e 100644
|
|
--- a/fs/bcachefs/io_read.h
|
|
+++ b/fs/bcachefs/io_read.h
|
|
@@ -3,6 +3,8 @@
|
|
#define _BCACHEFS_IO_READ_H
|
|
|
|
#include "bkey_buf.h"
|
|
+#include "btree_iter.h"
|
|
+#include "extents_types.h"
|
|
#include "reflink.h"
|
|
|
|
struct bch_read_bio {
|
|
@@ -35,19 +37,21 @@ struct bch_read_bio {
|
|
u16 flags;
|
|
union {
|
|
struct {
|
|
- u16 bounce:1,
|
|
+ u16 data_update:1,
|
|
+ promote:1,
|
|
+ bounce:1,
|
|
split:1,
|
|
- kmalloc:1,
|
|
have_ioref:1,
|
|
narrow_crcs:1,
|
|
- hole:1,
|
|
- retry:2,
|
|
+ saw_error:1,
|
|
context:2;
|
|
};
|
|
u16 _state;
|
|
};
|
|
-
|
|
- struct bch_devs_list devs_have;
|
|
+ s16 ret;
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ unsigned list_idx;
|
|
+#endif
|
|
|
|
struct extent_ptr_decoded pick;
|
|
|
|
@@ -65,8 +69,6 @@ struct bch_read_bio {
|
|
struct bpos data_pos;
|
|
struct bversion version;
|
|
|
|
- struct promote_op *promote;
|
|
-
|
|
struct bch_io_opts opts;
|
|
|
|
struct work_struct work;
|
|
@@ -108,64 +110,103 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
+#define BCH_READ_FLAGS() \
|
|
+ x(retry_if_stale) \
|
|
+ x(may_promote) \
|
|
+ x(user_mapped) \
|
|
+ x(last_fragment) \
|
|
+ x(must_bounce) \
|
|
+ x(must_clone) \
|
|
+ x(in_retry)
|
|
+
|
|
+enum __bch_read_flags {
|
|
+#define x(n) __BCH_READ_##n,
|
|
+ BCH_READ_FLAGS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
enum bch_read_flags {
|
|
- BCH_READ_RETRY_IF_STALE = 1 << 0,
|
|
- BCH_READ_MAY_PROMOTE = 1 << 1,
|
|
- BCH_READ_USER_MAPPED = 1 << 2,
|
|
- BCH_READ_NODECODE = 1 << 3,
|
|
- BCH_READ_LAST_FRAGMENT = 1 << 4,
|
|
-
|
|
- /* internal: */
|
|
- BCH_READ_MUST_BOUNCE = 1 << 5,
|
|
- BCH_READ_MUST_CLONE = 1 << 6,
|
|
- BCH_READ_IN_RETRY = 1 << 7,
|
|
+#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
|
|
+ BCH_READ_FLAGS()
|
|
+#undef x
|
|
};
|
|
|
|
int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
|
|
struct bvec_iter, struct bpos, enum btree_id,
|
|
struct bkey_s_c, unsigned,
|
|
- struct bch_io_failures *, unsigned);
|
|
+ struct bch_io_failures *, unsigned, int);
|
|
|
|
static inline void bch2_read_extent(struct btree_trans *trans,
|
|
struct bch_read_bio *rbio, struct bpos read_pos,
|
|
enum btree_id data_btree, struct bkey_s_c k,
|
|
unsigned offset_into_extent, unsigned flags)
|
|
{
|
|
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
|
|
- data_btree, k, offset_into_extent, NULL, flags);
|
|
+ int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
|
|
+ data_btree, k, offset_into_extent, NULL, flags, -1);
|
|
+ /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
|
|
+ WARN(ret, "unhandled error from __bch2_read_extent()");
|
|
}
|
|
|
|
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
|
- subvol_inum, struct bch_io_failures *, unsigned flags);
|
|
+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
|
|
+ subvol_inum,
|
|
+ struct bch_io_failures *, struct bkey_buf *, unsigned flags);
|
|
|
|
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
subvol_inum inum)
|
|
{
|
|
- struct bch_io_failures failed = { .nr = 0 };
|
|
-
|
|
BUG_ON(rbio->_state);
|
|
|
|
- rbio->c = c;
|
|
- rbio->start_time = local_clock();
|
|
rbio->subvol = inum.subvol;
|
|
|
|
- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
|
|
- BCH_READ_RETRY_IF_STALE|
|
|
- BCH_READ_MAY_PROMOTE|
|
|
- BCH_READ_USER_MAPPED);
|
|
+ bch2_trans_run(c,
|
|
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
|
|
+ BCH_READ_retry_if_stale|
|
|
+ BCH_READ_may_promote|
|
|
+ BCH_READ_user_mapped));
|
|
+}
|
|
+
|
|
+static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
|
|
+ struct bch_read_bio *orig)
|
|
+{
|
|
+ struct bch_read_bio *rbio = to_rbio(bio);
|
|
+
|
|
+ rbio->c = orig->c;
|
|
+ rbio->_state = 0;
|
|
+ rbio->flags = 0;
|
|
+ rbio->ret = 0;
|
|
+ rbio->split = true;
|
|
+ rbio->parent = orig;
|
|
+ rbio->opts = orig->opts;
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ rbio->list_idx = 0;
|
|
+#endif
|
|
+ return rbio;
|
|
}
|
|
|
|
static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
|
- struct bch_io_opts opts)
|
|
+ struct bch_fs *c,
|
|
+ struct bch_io_opts opts,
|
|
+ bio_end_io_t end_io)
|
|
{
|
|
struct bch_read_bio *rbio = to_rbio(bio);
|
|
|
|
- rbio->_state = 0;
|
|
- rbio->promote = NULL;
|
|
- rbio->opts = opts;
|
|
+ rbio->start_time = local_clock();
|
|
+ rbio->c = c;
|
|
+ rbio->_state = 0;
|
|
+ rbio->flags = 0;
|
|
+ rbio->ret = 0;
|
|
+ rbio->opts = opts;
|
|
+ rbio->bio.bi_end_io = end_io;
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ rbio->list_idx = 0;
|
|
+#endif
|
|
return rbio;
|
|
}
|
|
|
|
+struct promote_op;
|
|
+void bch2_promote_op_to_text(struct printbuf *, struct promote_op *);
|
|
+void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *);
|
|
+
|
|
void bch2_fs_io_read_exit(struct bch_fs *);
|
|
int bch2_fs_io_read_init(struct bch_fs *);
|
|
|
|
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
|
|
index 03892388832b..52a60982a66b 100644
|
|
--- a/fs/bcachefs/io_write.c
|
|
+++ b/fs/bcachefs/io_write.c
|
|
@@ -6,6 +6,7 @@
|
|
|
|
#include "bcachefs.h"
|
|
#include "alloc_foreground.h"
|
|
+#include "async_objs.h"
|
|
#include "bkey_buf.h"
|
|
#include "bset.h"
|
|
#include "btree_update.h"
|
|
@@ -15,6 +16,7 @@
|
|
#include "compress.h"
|
|
#include "debug.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extent_update.h"
|
|
#include "inode.h"
|
|
@@ -34,6 +36,12 @@
|
|
#include <linux/random.h>
|
|
#include <linux/sched/mm.h>
|
|
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+static unsigned bch2_write_corrupt_ratio;
|
|
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
|
|
+MODULE_PARM_DESC(write_corrupt_ratio, "");
|
|
+#endif
|
|
+
|
|
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
|
|
|
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
|
|
@@ -162,9 +170,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
|
|
*i_sectors_delta = 0;
|
|
*disk_sectors_delta = 0;
|
|
|
|
- bch2_trans_copy_iter(&iter, extent_iter);
|
|
+ bch2_trans_copy_iter(trans, &iter, extent_iter);
|
|
|
|
- for_each_btree_key_max_continue_norestart(iter,
|
|
+ for_each_btree_key_max_continue_norestart(trans, iter,
|
|
new->k.p, BTREE_ITER_slots, old, ret) {
|
|
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
|
|
max(bkey_start_offset(&new->k),
|
|
@@ -249,10 +257,35 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
|
|
}
|
|
|
|
if (i_sectors_delta) {
|
|
+ s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
|
|
+ if (unlikely(bi_sectors + i_sectors_delta < 0)) {
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+ prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
|
|
+ extent_iter->pos.inode, bi_sectors, i_sectors_delta);
|
|
+
|
|
+ bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
|
|
+ if (print)
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+
|
|
+ if (i_sectors_delta < 0)
|
|
+ i_sectors_delta = -bi_sectors;
|
|
+ else
|
|
+ i_sectors_delta = 0;
|
|
+ }
|
|
+
|
|
le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
|
|
inode_update_flags = 0;
|
|
}
|
|
|
|
+ /*
|
|
+ * extents, dirents and xattrs updates require that an inode update also
|
|
+ * happens - to ensure that if a key exists in one of those btrees with
|
|
+ * a given snapshot ID an inode is also present - so we may have to skip
|
|
+ * the nojournal optimization:
|
|
+ */
|
|
if (inode->k.p.snapshot != iter.snapshot) {
|
|
inode->k.p.snapshot = iter.snapshot;
|
|
inode_update_flags = 0;
|
|
@@ -286,7 +319,7 @@ int bch2_extent_update(struct btree_trans *trans,
|
|
* path already traversed at iter->pos because
|
|
* bch2_trans_extent_update() will use it to attempt extent merging
|
|
*/
|
|
- ret = __bch2_btree_iter_traverse(iter);
|
|
+ ret = __bch2_btree_iter_traverse(trans, iter);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -331,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans,
|
|
|
|
if (i_sectors_delta_total)
|
|
*i_sectors_delta_total += i_sectors_delta;
|
|
- bch2_btree_iter_set_pos(iter, next_pos);
|
|
+ bch2_btree_iter_set_pos(trans, iter, next_pos);
|
|
return 0;
|
|
}
|
|
|
|
@@ -370,11 +403,10 @@ static int bch2_write_index_default(struct bch_write_op *op)
|
|
bkey_start_pos(&sk.k->k),
|
|
BTREE_ITER_slots|BTREE_ITER_intent);
|
|
|
|
- ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
|
|
- bch2_extent_update(trans, inum, &iter, sk.k,
|
|
+ ret = bch2_extent_update(trans, inum, &iter, sk.k,
|
|
&op->res,
|
|
op->new_i_size, &op->i_sectors_delta,
|
|
- op->flags & BCH_WRITE_CHECK_ENOSPC);
|
|
+ op->flags & BCH_WRITE_check_enospc);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
@@ -396,29 +428,36 @@ static int bch2_write_index_default(struct bch_write_op *op)
|
|
|
|
/* Writes */
|
|
|
|
-static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
|
|
- u64 offset)
|
|
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
|
|
{
|
|
- bch2_inum_offset_err_msg(op->c, out,
|
|
- (subvol_inum) { op->subvol, op->pos.inode, },
|
|
- offset << 9);
|
|
- prt_printf(out, "write error%s: ",
|
|
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
|
|
-}
|
|
+ struct printbuf buf = PRINTBUF;
|
|
|
|
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
|
|
-{
|
|
- __bch2_write_op_error(out, op, op->pos.offset);
|
|
-}
|
|
+ if (op->subvol) {
|
|
+ bch2_inum_offset_err_msg(op->c, &buf,
|
|
+ (subvol_inum) { op->subvol, op->pos.inode, },
|
|
+ offset << 9);
|
|
+ } else {
|
|
+ struct bpos pos = op->pos;
|
|
+ pos.offset = offset;
|
|
+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
|
|
+ }
|
|
|
|
-static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
|
|
- struct bch_write_op *op, u64 offset)
|
|
-{
|
|
- bch2_inum_offset_err_msg_trans(trans, out,
|
|
- (subvol_inum) { op->subvol, op->pos.inode, },
|
|
- offset << 9);
|
|
- prt_printf(out, "write error%s: ",
|
|
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
|
|
+ prt_str(&buf, "write error: ");
|
|
+
|
|
+ va_list args;
|
|
+ va_start(args, fmt);
|
|
+ prt_vprintf(&buf, fmt, args);
|
|
+ va_end(args);
|
|
+
|
|
+ if (op->flags & BCH_WRITE_move) {
|
|
+ struct data_update *u = container_of(op, struct data_update, op);
|
|
+
|
|
+ prt_printf(&buf, "\n from internal move ");
|
|
+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
|
|
+ }
|
|
+
|
|
+ bch_err_ratelimited(op->c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
}
|
|
|
|
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
|
|
@@ -428,15 +467,28 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
|
|
{
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
|
|
struct bch_write_bio *n;
|
|
+ unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE;
|
|
+ unsigned ref_idx = type == BCH_DATA_btree
|
|
+ ? BCH_DEV_READ_REF_btree_node_write
|
|
+ : BCH_DEV_WRITE_REF_io_write;
|
|
|
|
BUG_ON(c->opts.nochanges);
|
|
|
|
+ const struct bch_extent_ptr *last = NULL;
|
|
+ bkey_for_each_ptr(ptrs, ptr)
|
|
+ last = ptr;
|
|
+
|
|
bkey_for_each_ptr(ptrs, ptr) {
|
|
+ /*
|
|
+ * XXX: btree writes should be using io_ref[WRITE], but we
|
|
+ * aren't retrying failed btree writes yet (due to device
|
|
+ * removal/ro):
|
|
+ */
|
|
struct bch_dev *ca = nocow
|
|
? bch2_dev_have_ref(c, ptr->dev)
|
|
- : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
|
|
+ : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
|
|
|
|
- if (to_entry(ptr + 1) < ptrs.end) {
|
|
+ if (ptr != last) {
|
|
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
|
|
|
|
n->bio.bi_end_io = wbio->bio.bi_end_io;
|
|
@@ -493,12 +545,13 @@ static void bch2_write_done(struct closure *cl)
|
|
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
|
|
bch2_disk_reservation_put(c, &op->res);
|
|
|
|
- if (!(op->flags & BCH_WRITE_MOVE))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
|
|
+ if (!(op->flags & BCH_WRITE_move))
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
|
|
bch2_keylist_free(&op->insert_keys, op->inline_keys);
|
|
|
|
EBUG_ON(cl->parent);
|
|
closure_debug_destroy(cl);
|
|
+ async_object_list_del(c, write_op, op->list_idx);
|
|
if (op->end_io)
|
|
op->end_io(op);
|
|
}
|
|
@@ -516,7 +569,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
|
|
test_bit(ptr->dev, op->failed.d));
|
|
|
|
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
|
|
- return -EIO;
|
|
+ return -BCH_ERR_data_write_io;
|
|
}
|
|
|
|
if (dst != src)
|
|
@@ -539,7 +592,7 @@ static void __bch2_write_index(struct bch_write_op *op)
|
|
unsigned dev;
|
|
int ret = 0;
|
|
|
|
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
|
|
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
|
|
ret = bch2_write_drop_io_error_ptrs(op);
|
|
if (ret)
|
|
goto err;
|
|
@@ -548,7 +601,7 @@ static void __bch2_write_index(struct bch_write_op *op)
|
|
if (!bch2_keylist_empty(keys)) {
|
|
u64 sectors_start = keylist_sectors(keys);
|
|
|
|
- ret = !(op->flags & BCH_WRITE_MOVE)
|
|
+ ret = !(op->flags & BCH_WRITE_move)
|
|
? bch2_write_index_default(op)
|
|
: bch2_data_update_index_update(op);
|
|
|
|
@@ -560,11 +613,8 @@ static void __bch2_write_index(struct bch_write_op *op)
|
|
if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
|
|
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
|
|
|
|
- struct printbuf buf = PRINTBUF;
|
|
- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
|
|
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
|
|
+ "btree update error: %s", bch2_err_str(ret));
|
|
}
|
|
|
|
if (ret)
|
|
@@ -573,21 +623,29 @@ static void __bch2_write_index(struct bch_write_op *op)
|
|
out:
|
|
/* If some a bucket wasn't written, we can't erasure code it: */
|
|
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
|
|
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
|
|
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
|
|
|
|
bch2_open_buckets_put(c, &op->open_buckets);
|
|
return;
|
|
err:
|
|
keys->top = keys->keys;
|
|
op->error = ret;
|
|
- op->flags |= BCH_WRITE_SUBMITTED;
|
|
+ op->flags |= BCH_WRITE_submitted;
|
|
goto out;
|
|
}
|
|
|
|
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
|
|
{
|
|
if (state != wp->state) {
|
|
+ struct task_struct *p = current;
|
|
u64 now = ktime_get_ns();
|
|
+ u64 runtime = p->se.sum_exec_runtime +
|
|
+ (now - p->se.exec_start);
|
|
+
|
|
+ if (state == WRITE_POINT_runnable)
|
|
+ wp->last_runtime = runtime;
|
|
+ else if (wp->state == WRITE_POINT_runnable)
|
|
+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
|
|
|
|
if (wp->last_state_change &&
|
|
time_after64(now, wp->last_state_change))
|
|
@@ -601,7 +659,7 @@ static inline void wp_update_state(struct write_point *wp, bool running)
|
|
{
|
|
enum write_point_state state;
|
|
|
|
- state = running ? WRITE_POINT_running :
|
|
+ state = running ? WRITE_POINT_runnable:
|
|
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
|
|
: WRITE_POINT_stopped;
|
|
|
|
@@ -615,8 +673,8 @@ static CLOSURE_CALLBACK(bch2_write_index)
|
|
struct workqueue_struct *wq = index_update_wq(op);
|
|
unsigned long flags;
|
|
|
|
- if ((op->flags & BCH_WRITE_SUBMITTED) &&
|
|
- (op->flags & BCH_WRITE_MOVE))
|
|
+ if ((op->flags & BCH_WRITE_submitted) &&
|
|
+ (op->flags & BCH_WRITE_move))
|
|
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
|
|
|
|
spin_lock_irqsave(&wp->writes_lock, flags);
|
|
@@ -654,11 +712,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
|
|
if (!op)
|
|
break;
|
|
|
|
- op->flags |= BCH_WRITE_IN_WORKER;
|
|
+ op->flags |= BCH_WRITE_in_worker;
|
|
|
|
__bch2_write_index(op);
|
|
|
|
- if (!(op->flags & BCH_WRITE_SUBMITTED))
|
|
+ if (!(op->flags & BCH_WRITE_submitted))
|
|
__bch2_write(op);
|
|
else
|
|
bch2_write_done(&op->cl);
|
|
@@ -676,13 +734,24 @@ static void bch2_write_endio(struct bio *bio)
|
|
? bch2_dev_have_ref(c, wbio->dev)
|
|
: NULL;
|
|
|
|
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
|
|
- op->pos.inode,
|
|
- wbio->inode_offset << 9,
|
|
- "data write error: %s",
|
|
- bch2_blk_status_to_str(bio->bi_status))) {
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
|
|
+ wbio->submit_time, !bio->bi_status);
|
|
+
|
|
+ if (unlikely(bio->bi_status)) {
|
|
+ if (ca)
|
|
+ bch_err_inum_offset_ratelimited(ca,
|
|
+ op->pos.inode,
|
|
+ wbio->inode_offset << 9,
|
|
+ "data write error: %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status));
|
|
+ else
|
|
+ bch_err_inum_offset_ratelimited(c,
|
|
+ op->pos.inode,
|
|
+ wbio->inode_offset << 9,
|
|
+ "data write error: %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status));
|
|
set_bit(wbio->dev, op->failed.d);
|
|
- op->flags |= BCH_WRITE_IO_ERROR;
|
|
+ op->flags |= BCH_WRITE_io_error;
|
|
}
|
|
|
|
if (wbio->nocow) {
|
|
@@ -692,10 +761,9 @@ static void bch2_write_endio(struct bio *bio)
|
|
set_bit(wbio->dev, op->devs_need_flush->d);
|
|
}
|
|
|
|
- if (wbio->have_ioref) {
|
|
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
- }
|
|
+ if (wbio->have_ioref)
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE],
|
|
+ BCH_DEV_WRITE_REF_io_write);
|
|
|
|
if (wbio->bounce)
|
|
bch2_bio_free_pages_pool(c, bio);
|
|
@@ -729,7 +797,10 @@ static void init_append_extent(struct bch_write_op *op,
|
|
bch2_extent_crc_append(&e->k_i, crc);
|
|
|
|
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
|
|
- op->flags & BCH_WRITE_CACHED);
|
|
+ op->flags & BCH_WRITE_cached);
|
|
+
|
|
+ if (!(op->flags & BCH_WRITE_move))
|
|
+ bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
|
|
|
|
bch2_keylist_push(&op->insert_keys);
|
|
}
|
|
@@ -789,7 +860,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
|
|
{
|
|
struct bio *bio = &op->wbio.bio;
|
|
struct bch_extent_crc_unpacked new_crc;
|
|
- int ret;
|
|
|
|
/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
|
|
|
|
@@ -797,10 +867,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
|
|
bch2_csum_type_is_encryption(new_csum_type))
|
|
new_csum_type = op->crc.csum_type;
|
|
|
|
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
|
|
- NULL, &new_crc,
|
|
- op->crc.offset, op->crc.live_size,
|
|
- new_csum_type);
|
|
+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
|
|
+ NULL, &new_crc,
|
|
+ op->crc.offset, op->crc.live_size,
|
|
+ new_csum_type);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -810,44 +880,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
|
|
return 0;
|
|
}
|
|
|
|
-static int bch2_write_decrypt(struct bch_write_op *op)
|
|
-{
|
|
- struct bch_fs *c = op->c;
|
|
- struct nonce nonce = extent_nonce(op->version, op->crc);
|
|
- struct bch_csum csum;
|
|
- int ret;
|
|
-
|
|
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
|
|
- return 0;
|
|
-
|
|
- /*
|
|
- * If we need to decrypt data in the write path, we'll no longer be able
|
|
- * to verify the existing checksum (poly1305 mac, in this case) after
|
|
- * it's decrypted - this is the last point we'll be able to reverify the
|
|
- * checksum:
|
|
- */
|
|
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
|
|
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
|
|
- return -EIO;
|
|
-
|
|
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
|
|
- op->crc.csum_type = 0;
|
|
- op->crc.csum = (struct bch_csum) { 0, 0 };
|
|
- return ret;
|
|
-}
|
|
-
|
|
-static enum prep_encoded_ret {
|
|
- PREP_ENCODED_OK,
|
|
- PREP_ENCODED_ERR,
|
|
- PREP_ENCODED_CHECKSUM_ERR,
|
|
- PREP_ENCODED_DO_WRITE,
|
|
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
|
|
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
|
|
{
|
|
struct bch_fs *c = op->c;
|
|
struct bio *bio = &op->wbio.bio;
|
|
-
|
|
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
|
|
- return PREP_ENCODED_OK;
|
|
+ struct bch_csum csum;
|
|
+ int ret = 0;
|
|
|
|
BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
|
|
|
|
@@ -858,12 +896,13 @@ static enum prep_encoded_ret {
|
|
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
|
|
op->incompressible)) {
|
|
if (!crc_is_compressed(op->crc) &&
|
|
- op->csum_type != op->crc.csum_type &&
|
|
- bch2_write_rechecksum(c, op, op->csum_type) &&
|
|
- !c->opts.no_data_io)
|
|
- return PREP_ENCODED_CHECKSUM_ERR;
|
|
+ op->csum_type != op->crc.csum_type) {
|
|
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
|
|
- return PREP_ENCODED_DO_WRITE;
|
|
+ return 1;
|
|
}
|
|
|
|
/*
|
|
@@ -871,20 +910,24 @@ static enum prep_encoded_ret {
|
|
* is, we have to decompress it:
|
|
*/
|
|
if (crc_is_compressed(op->crc)) {
|
|
- struct bch_csum csum;
|
|
-
|
|
- if (bch2_write_decrypt(op))
|
|
- return PREP_ENCODED_CHECKSUM_ERR;
|
|
-
|
|
/* Last point we can still verify checksum: */
|
|
- csum = bch2_checksum_bio(c, op->crc.csum_type,
|
|
- extent_nonce(op->version, op->crc),
|
|
- bio);
|
|
+ struct nonce nonce = extent_nonce(op->version, op->crc);
|
|
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
|
|
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
|
|
- return PREP_ENCODED_CHECKSUM_ERR;
|
|
+ goto csum_err;
|
|
+
|
|
+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
|
|
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ op->crc.csum_type = 0;
|
|
+ op->crc.csum = (struct bch_csum) { 0, 0 };
|
|
+ }
|
|
|
|
- if (bch2_bio_uncompress_inplace(op, bio))
|
|
- return PREP_ENCODED_ERR;
|
|
+ ret = bch2_bio_uncompress_inplace(op, bio);
|
|
+ if (ret)
|
|
+ return ret;
|
|
}
|
|
|
|
/*
|
|
@@ -896,22 +939,44 @@ static enum prep_encoded_ret {
|
|
* If the data is checksummed and we're only writing a subset,
|
|
* rechecksum and adjust bio to point to currently live data:
|
|
*/
|
|
- if ((op->crc.live_size != op->crc.uncompressed_size ||
|
|
- op->crc.csum_type != op->csum_type) &&
|
|
- bch2_write_rechecksum(c, op, op->csum_type) &&
|
|
- !c->opts.no_data_io)
|
|
- return PREP_ENCODED_CHECKSUM_ERR;
|
|
+ if (op->crc.live_size != op->crc.uncompressed_size ||
|
|
+ op->crc.csum_type != op->csum_type) {
|
|
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
|
|
/*
|
|
* If we want to compress the data, it has to be decrypted:
|
|
*/
|
|
- if ((op->compression_opt ||
|
|
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
|
|
- bch2_csum_type_is_encryption(op->csum_type)) &&
|
|
- bch2_write_decrypt(op))
|
|
- return PREP_ENCODED_CHECKSUM_ERR;
|
|
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
|
|
+ (op->compression_opt || op->crc.csum_type != op->csum_type)) {
|
|
+ struct nonce nonce = extent_nonce(op->version, op->crc);
|
|
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
|
|
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
|
|
+ goto csum_err;
|
|
|
|
- return PREP_ENCODED_OK;
|
|
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ op->crc.csum_type = 0;
|
|
+ op->crc.csum = (struct bch_csum) { 0, 0 };
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+csum_err:
|
|
+ bch2_write_op_error(op, op->pos.offset,
|
|
+ "error verifying existing checksum while moving existing data (memory corruption?)\n"
|
|
+ " expected %0llx:%0llx got %0llx:%0llx type %s",
|
|
+ op->crc.csum.hi,
|
|
+ op->crc.csum.lo,
|
|
+ csum.hi,
|
|
+ csum.lo,
|
|
+ op->crc.csum_type < BCH_CSUM_NR
|
|
+ ? __bch2_csum_types[op->crc.csum_type]
|
|
+ : "(unknown)");
|
|
+ return -BCH_ERR_data_write_csum;
|
|
}
|
|
|
|
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
@@ -926,43 +991,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
bool page_alloc_failed = false;
|
|
int ret, more = 0;
|
|
|
|
+ if (op->incompressible)
|
|
+ op->compression_opt = 0;
|
|
+
|
|
BUG_ON(!bio_sectors(src));
|
|
|
|
ec_buf = bch2_writepoint_ec_buf(c, wp);
|
|
|
|
- switch (bch2_write_prep_encoded_data(op, wp)) {
|
|
- case PREP_ENCODED_OK:
|
|
- break;
|
|
- case PREP_ENCODED_ERR:
|
|
- ret = -EIO;
|
|
- goto err;
|
|
- case PREP_ENCODED_CHECKSUM_ERR:
|
|
- goto csum_err;
|
|
- case PREP_ENCODED_DO_WRITE:
|
|
- /* XXX look for bug here */
|
|
- if (ec_buf) {
|
|
- dst = bch2_write_bio_alloc(c, wp, src,
|
|
- &page_alloc_failed,
|
|
- ec_buf);
|
|
- bio_copy_data(dst, src);
|
|
- bounce = true;
|
|
+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
|
|
+ ret = bch2_write_prep_encoded_data(op, wp);
|
|
+ if (ret < 0)
|
|
+ goto err;
|
|
+ if (ret) {
|
|
+ if (ec_buf) {
|
|
+ dst = bch2_write_bio_alloc(c, wp, src,
|
|
+ &page_alloc_failed,
|
|
+ ec_buf);
|
|
+ bio_copy_data(dst, src);
|
|
+ bounce = true;
|
|
+ }
|
|
+ init_append_extent(op, wp, op->version, op->crc);
|
|
+ goto do_write;
|
|
}
|
|
- init_append_extent(op, wp, op->version, op->crc);
|
|
- goto do_write;
|
|
}
|
|
|
|
if (ec_buf ||
|
|
op->compression_opt ||
|
|
(op->csum_type &&
|
|
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
|
|
+ !(op->flags & BCH_WRITE_pages_stable)) ||
|
|
(bch2_csum_type_is_encryption(op->csum_type) &&
|
|
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
|
|
+ !(op->flags & BCH_WRITE_pages_owned))) {
|
|
dst = bch2_write_bio_alloc(c, wp, src,
|
|
&page_alloc_failed,
|
|
ec_buf);
|
|
bounce = true;
|
|
}
|
|
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
|
|
+ if (!bounce && write_corrupt_ratio) {
|
|
+ dst = bch2_write_bio_alloc(c, wp, src,
|
|
+ &page_alloc_failed,
|
|
+ ec_buf);
|
|
+ bounce = true;
|
|
+ }
|
|
+#endif
|
|
saved_iter = dst->bi_iter;
|
|
|
|
do {
|
|
@@ -976,7 +1049,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
break;
|
|
|
|
BUG_ON(op->compression_opt &&
|
|
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
|
|
+ (op->flags & BCH_WRITE_data_encoded) &&
|
|
bch2_csum_type_is_encryption(op->crc.csum_type));
|
|
BUG_ON(op->compression_opt && !bounce);
|
|
|
|
@@ -1014,7 +1087,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
}
|
|
}
|
|
|
|
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
|
|
+ if ((op->flags & BCH_WRITE_data_encoded) &&
|
|
!crc_is_compressed(crc) &&
|
|
bch2_csum_type_is_encryption(op->crc.csum_type) ==
|
|
bch2_csum_type_is_encryption(op->csum_type)) {
|
|
@@ -1032,12 +1105,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
* data can't be modified (by userspace) while it's in
|
|
* flight.
|
|
*/
|
|
- if (bch2_rechecksum_bio(c, src, version, op->crc,
|
|
+ ret = bch2_rechecksum_bio(c, src, version, op->crc,
|
|
&crc, &op->crc,
|
|
src_len >> 9,
|
|
bio_sectors(src) - (src_len >> 9),
|
|
- op->csum_type))
|
|
- goto csum_err;
|
|
+ op->csum_type);
|
|
+ if (ret)
|
|
+ goto err;
|
|
/*
|
|
* rchecksum_bio sets compression_type on crc from op->crc,
|
|
* this isn't always correct as sometimes we're changing
|
|
@@ -1046,13 +1120,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
crc.compression_type = compression_type;
|
|
crc.nonce = nonce;
|
|
} else {
|
|
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
|
|
- bch2_rechecksum_bio(c, src, version, op->crc,
|
|
+ if ((op->flags & BCH_WRITE_data_encoded) &&
|
|
+ (ret = bch2_rechecksum_bio(c, src, version, op->crc,
|
|
NULL, &op->crc,
|
|
src_len >> 9,
|
|
bio_sectors(src) - (src_len >> 9),
|
|
- op->crc.csum_type))
|
|
- goto csum_err;
|
|
+ op->crc.csum_type)))
|
|
+ goto err;
|
|
|
|
crc.compressed_size = dst_len >> 9;
|
|
crc.uncompressed_size = src_len >> 9;
|
|
@@ -1072,6 +1146,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
|
|
init_append_extent(op, wp, version, crc);
|
|
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ if (write_corrupt_ratio) {
|
|
+ swap(dst->bi_iter.bi_size, dst_len);
|
|
+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
|
|
+ swap(dst->bi_iter.bi_size, dst_len);
|
|
+ }
|
|
+#endif
|
|
+
|
|
if (dst != src)
|
|
bio_advance(dst, dst_len);
|
|
bio_advance(src, src_len);
|
|
@@ -1103,16 +1185,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
do_write:
|
|
*_dst = dst;
|
|
return more;
|
|
-csum_err:
|
|
- {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error(&buf, op);
|
|
- prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
-
|
|
- ret = -EIO;
|
|
err:
|
|
if (to_wbio(dst)->bounce)
|
|
bch2_bio_free_pages_pool(c, dst);
|
|
@@ -1190,39 +1262,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
|
|
{
|
|
struct bch_fs *c = op->c;
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
+ int ret = 0;
|
|
|
|
for_each_keylist_key(&op->insert_keys, orig) {
|
|
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
|
|
+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
|
|
bkey_start_pos(&orig->k), orig->k.p,
|
|
BTREE_ITER_intent, k,
|
|
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
|
|
}));
|
|
-
|
|
- if (ret && !bch2_err_matches(ret, EROFS)) {
|
|
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
|
|
-
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
|
|
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
-
|
|
- if (ret) {
|
|
- op->error = ret;
|
|
+ if (ret)
|
|
break;
|
|
- }
|
|
}
|
|
|
|
bch2_trans_put(trans);
|
|
+
|
|
+ if (ret && !bch2_err_matches(ret, EROFS)) {
|
|
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
|
|
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
|
|
+ "btree update error: %s", bch2_err_str(ret));
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ op->error = ret;
|
|
}
|
|
|
|
static void __bch2_nocow_write_done(struct bch_write_op *op)
|
|
{
|
|
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
|
|
- op->error = -EIO;
|
|
- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
|
|
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
|
|
+ op->error = -BCH_ERR_data_write_io;
|
|
+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
|
|
bch2_nocow_write_convert_unwritten(op);
|
|
}
|
|
|
|
@@ -1251,7 +1320,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
struct bucket_to_lock *stale_at;
|
|
int stale, ret;
|
|
|
|
- if (op->flags & BCH_WRITE_MOVE)
|
|
+ if (op->flags & BCH_WRITE_move)
|
|
return;
|
|
|
|
darray_init(&buckets);
|
|
@@ -1275,7 +1344,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
if (ret)
|
|
break;
|
|
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
break;
|
|
@@ -1294,7 +1363,8 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
/* Get iorefs before dropping btree locks: */
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
bkey_for_each_ptr(ptrs, ptr) {
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
|
|
+ BCH_DEV_WRITE_REF_io_write);
|
|
if (unlikely(!ca))
|
|
goto err_get_ioref;
|
|
|
|
@@ -1309,7 +1379,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
}), GFP_KERNEL|__GFP_NOFAIL);
|
|
|
|
if (ptr->unwritten)
|
|
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
|
|
+ op->flags |= BCH_WRITE_convert_unwritten;
|
|
}
|
|
|
|
/* Unlock before taking nocow locks, doing IO: */
|
|
@@ -1317,7 +1387,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
bch2_trans_unlock(trans);
|
|
|
|
bch2_cut_front(op->pos, op->insert_keys.top);
|
|
- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
|
|
+ if (op->flags & BCH_WRITE_convert_unwritten)
|
|
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
|
|
|
|
darray_for_each(buckets, i) {
|
|
@@ -1342,7 +1412,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
wbio_init(bio)->put_bio = true;
|
|
bio->bi_opf = op->wbio.bio.bi_opf;
|
|
} else {
|
|
- op->flags |= BCH_WRITE_SUBMITTED;
|
|
+ op->flags |= BCH_WRITE_submitted;
|
|
}
|
|
|
|
op->pos.offset += bio_sectors(bio);
|
|
@@ -1352,13 +1422,14 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
bio->bi_private = &op->cl;
|
|
bio->bi_opf |= REQ_OP_WRITE;
|
|
closure_get(&op->cl);
|
|
+
|
|
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
|
|
op->insert_keys.top, true);
|
|
|
|
bch2_keylist_push(&op->insert_keys);
|
|
- if (op->flags & BCH_WRITE_SUBMITTED)
|
|
+ if (op->flags & BCH_WRITE_submitted)
|
|
break;
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
}
|
|
out:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
@@ -1370,21 +1441,18 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
darray_exit(&buckets);
|
|
|
|
if (ret) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error(&buf, op);
|
|
- prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
+ bch2_write_op_error(op, op->pos.offset,
|
|
+ "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
|
|
op->error = ret;
|
|
- op->flags |= BCH_WRITE_SUBMITTED;
|
|
+ op->flags |= BCH_WRITE_submitted;
|
|
}
|
|
|
|
/* fallback to cow write path? */
|
|
- if (!(op->flags & BCH_WRITE_SUBMITTED)) {
|
|
+ if (!(op->flags & BCH_WRITE_submitted)) {
|
|
closure_sync(&op->cl);
|
|
__bch2_nocow_write_done(op);
|
|
op->insert_keys.top = op->insert_keys.keys;
|
|
- } else if (op->flags & BCH_WRITE_SYNC) {
|
|
+ } else if (op->flags & BCH_WRITE_sync) {
|
|
closure_sync(&op->cl);
|
|
bch2_nocow_write_done(&op->cl.work);
|
|
} else {
|
|
@@ -1398,7 +1466,8 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
return;
|
|
err_get_ioref:
|
|
darray_for_each(buckets, i)
|
|
- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
|
|
+ enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
|
|
+ BCH_DEV_WRITE_REF_io_write);
|
|
|
|
/* Fall back to COW path: */
|
|
goto out;
|
|
@@ -1414,7 +1483,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
|
|
"pointer to invalid bucket in nocow path on device %llu\n %s",
|
|
stale_at->b.inode,
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
- ret = -EIO;
|
|
+ ret = -BCH_ERR_data_write_invalid_ptr;
|
|
} else {
|
|
/* We can retry this: */
|
|
ret = -BCH_ERR_transaction_restart;
|
|
@@ -1436,7 +1505,7 @@ static void __bch2_write(struct bch_write_op *op)
|
|
|
|
if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
|
|
bch2_nocow_write(op);
|
|
- if (op->flags & BCH_WRITE_SUBMITTED)
|
|
+ if (op->flags & BCH_WRITE_submitted)
|
|
goto out_nofs_restore;
|
|
}
|
|
again:
|
|
@@ -1466,7 +1535,7 @@ static void __bch2_write(struct bch_write_op *op)
|
|
ret = bch2_trans_run(c, lockrestart_do(trans,
|
|
bch2_alloc_sectors_start_trans(trans,
|
|
op->target,
|
|
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
|
|
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
|
|
op->write_point,
|
|
&op->devs_have,
|
|
op->nr_replicas,
|
|
@@ -1489,16 +1558,12 @@ static void __bch2_write(struct bch_write_op *op)
|
|
bch2_alloc_sectors_done_inlined(c, wp);
|
|
err:
|
|
if (ret <= 0) {
|
|
- op->flags |= BCH_WRITE_SUBMITTED;
|
|
+ op->flags |= BCH_WRITE_submitted;
|
|
|
|
if (unlikely(ret < 0)) {
|
|
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error(&buf, op);
|
|
- prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
|
|
- bch_err_ratelimited(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
+ if (!(op->flags & BCH_WRITE_alloc_nowait))
|
|
+ bch2_write_op_error(op, op->pos.offset,
|
|
+ "%s(): %s", __func__, bch2_err_str(ret));
|
|
op->error = ret;
|
|
break;
|
|
}
|
|
@@ -1524,14 +1589,14 @@ static void __bch2_write(struct bch_write_op *op)
|
|
* synchronously here if we weren't able to submit all of the IO at
|
|
* once, as that signals backpressure to the caller.
|
|
*/
|
|
- if ((op->flags & BCH_WRITE_SYNC) ||
|
|
- (!(op->flags & BCH_WRITE_SUBMITTED) &&
|
|
- !(op->flags & BCH_WRITE_IN_WORKER))) {
|
|
+ if ((op->flags & BCH_WRITE_sync) ||
|
|
+ (!(op->flags & BCH_WRITE_submitted) &&
|
|
+ !(op->flags & BCH_WRITE_in_worker))) {
|
|
bch2_wait_on_allocator(c, &op->cl);
|
|
|
|
__bch2_write_index(op);
|
|
|
|
- if (!(op->flags & BCH_WRITE_SUBMITTED))
|
|
+ if (!(op->flags & BCH_WRITE_submitted))
|
|
goto again;
|
|
bch2_write_done(&op->cl);
|
|
} else {
|
|
@@ -1552,8 +1617,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
|
|
|
|
memset(&op->failed, 0, sizeof(op->failed));
|
|
|
|
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
|
|
- op->flags |= BCH_WRITE_SUBMITTED;
|
|
+ op->flags |= BCH_WRITE_wrote_data_inline;
|
|
+ op->flags |= BCH_WRITE_submitted;
|
|
|
|
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
|
|
|
|
@@ -1616,8 +1681,10 @@ CLOSURE_CALLBACK(bch2_write)
|
|
BUG_ON(!op->write_point.v);
|
|
BUG_ON(bkey_eq(op->pos, POS_MAX));
|
|
|
|
- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
|
|
- op->flags |= BCH_WRITE_ALLOC_NOWAIT;
|
|
+ async_object_list_add(c, write_op, op, &op->list_idx);
|
|
+
|
|
+ if (op->flags & BCH_WRITE_only_specified_devs)
|
|
+ op->flags |= BCH_WRITE_alloc_nowait;
|
|
|
|
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
|
|
op->start_time = local_clock();
|
|
@@ -1625,11 +1692,8 @@ CLOSURE_CALLBACK(bch2_write)
|
|
wbio_init(bio)->put_bio = false;
|
|
|
|
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_write_op_error(&buf, op);
|
|
- prt_printf(&buf, "misaligned write");
|
|
- printbuf_exit(&buf);
|
|
- op->error = -EIO;
|
|
+ bch2_write_op_error(op, op->pos.offset, "misaligned write");
|
|
+ op->error = -BCH_ERR_data_write_misaligned;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1638,13 +1702,14 @@ CLOSURE_CALLBACK(bch2_write)
|
|
goto err;
|
|
}
|
|
|
|
- if (!(op->flags & BCH_WRITE_MOVE) &&
|
|
- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
|
|
+ if (!(op->flags & BCH_WRITE_move) &&
|
|
+ !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
|
|
op->error = -BCH_ERR_erofs_no_writes;
|
|
goto err;
|
|
}
|
|
|
|
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
|
|
+ if (!(op->flags & BCH_WRITE_move))
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
|
|
bch2_increment_clock(c, bio_sectors(bio), WRITE);
|
|
|
|
data_len = min_t(u64, bio->bi_iter.bi_size,
|
|
@@ -1662,6 +1727,7 @@ CLOSURE_CALLBACK(bch2_write)
|
|
bch2_disk_reservation_put(c, &op->res);
|
|
|
|
closure_debug_destroy(&op->cl);
|
|
+ async_object_list_del(c, write_op, op->list_idx);
|
|
if (op->end_io)
|
|
op->end_io(op);
|
|
}
|
|
@@ -1675,27 +1741,33 @@ static const char * const bch2_write_flags[] = {
|
|
|
|
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
|
|
{
|
|
- prt_str(out, "pos: ");
|
|
+ if (!out->nr_tabstops)
|
|
+ printbuf_tabstop_push(out, 32);
|
|
+
|
|
+ prt_printf(out, "pos:\t");
|
|
bch2_bpos_to_text(out, op->pos);
|
|
prt_newline(out);
|
|
printbuf_indent_add(out, 2);
|
|
|
|
- prt_str(out, "started: ");
|
|
+ prt_printf(out, "started:\t");
|
|
bch2_pr_time_units(out, local_clock() - op->start_time);
|
|
prt_newline(out);
|
|
|
|
- prt_str(out, "flags: ");
|
|
+ prt_printf(out, "flags:\t");
|
|
prt_bitflags(out, bch2_write_flags, op->flags);
|
|
prt_newline(out);
|
|
|
|
- prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
|
|
+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
|
|
+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
|
|
+
|
|
+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
|
|
+ prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
|
|
|
|
printbuf_indent_sub(out, 2);
|
|
}
|
|
|
|
void bch2_fs_io_write_exit(struct bch_fs *c)
|
|
{
|
|
- mempool_exit(&c->bio_bounce_pages);
|
|
bioset_exit(&c->replica_set);
|
|
bioset_exit(&c->bio_write);
|
|
}
|
|
@@ -1706,12 +1778,5 @@ int bch2_fs_io_write_init(struct bch_fs *c)
|
|
bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
|
|
return -BCH_ERR_ENOMEM_bio_write_init;
|
|
|
|
- if (mempool_init_page_pool(&c->bio_bounce_pages,
|
|
- max_t(unsigned,
|
|
- c->opts.btree_node_size,
|
|
- c->opts.encoded_extent_max) /
|
|
- PAGE_SIZE, 0))
|
|
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
|
|
-
|
|
return 0;
|
|
}
|
|
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
|
|
index b4626013abc8..2c0a8f35ee1f 100644
|
|
--- a/fs/bcachefs/io_write.h
|
|
+++ b/fs/bcachefs/io_write.h
|
|
@@ -11,45 +11,11 @@
|
|
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
|
|
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
|
|
|
|
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
|
-void bch2_latency_acct(struct bch_dev *, u64, int);
|
|
-#else
|
|
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
|
|
-#endif
|
|
-
|
|
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
|
|
enum bch_data_type, const struct bkey_i *, bool);
|
|
|
|
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
|
|
-
|
|
-#define BCH_WRITE_FLAGS() \
|
|
- x(ALLOC_NOWAIT) \
|
|
- x(CACHED) \
|
|
- x(DATA_ENCODED) \
|
|
- x(PAGES_STABLE) \
|
|
- x(PAGES_OWNED) \
|
|
- x(ONLY_SPECIFIED_DEVS) \
|
|
- x(WROTE_DATA_INLINE) \
|
|
- x(FROM_INTERNAL) \
|
|
- x(CHECK_ENOSPC) \
|
|
- x(SYNC) \
|
|
- x(MOVE) \
|
|
- x(IN_WORKER) \
|
|
- x(SUBMITTED) \
|
|
- x(IO_ERROR) \
|
|
- x(CONVERT_UNWRITTEN)
|
|
-
|
|
-enum __bch_write_flags {
|
|
-#define x(f) __BCH_WRITE_##f,
|
|
- BCH_WRITE_FLAGS()
|
|
-#undef x
|
|
-};
|
|
-
|
|
-enum bch_write_flags {
|
|
-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
|
|
- BCH_WRITE_FLAGS()
|
|
-#undef x
|
|
-};
|
|
+__printf(3, 4)
|
|
+void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
|
|
|
|
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
|
|
{
|
|
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
|
|
index 6e878a6f2f0b..5da4eb8bb6f6 100644
|
|
--- a/fs/bcachefs/io_write_types.h
|
|
+++ b/fs/bcachefs/io_write_types.h
|
|
@@ -13,6 +13,34 @@
|
|
#include <linux/llist.h>
|
|
#include <linux/workqueue.h>
|
|
|
|
+#define BCH_WRITE_FLAGS() \
|
|
+ x(alloc_nowait) \
|
|
+ x(cached) \
|
|
+ x(data_encoded) \
|
|
+ x(pages_stable) \
|
|
+ x(pages_owned) \
|
|
+ x(only_specified_devs) \
|
|
+ x(wrote_data_inline) \
|
|
+ x(check_enospc) \
|
|
+ x(sync) \
|
|
+ x(move) \
|
|
+ x(in_worker) \
|
|
+ x(submitted) \
|
|
+ x(io_error) \
|
|
+ x(convert_unwritten)
|
|
+
|
|
+enum __bch_write_flags {
|
|
+#define x(f) __BCH_WRITE_##f,
|
|
+ BCH_WRITE_FLAGS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+enum bch_write_flags {
|
|
+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
|
|
+ BCH_WRITE_FLAGS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
struct bch_write_bio {
|
|
struct_group(wbio,
|
|
struct bch_fs *c;
|
|
@@ -43,6 +71,10 @@ struct bch_write_op {
|
|
void (*end_io)(struct bch_write_op *);
|
|
u64 start_time;
|
|
|
|
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
|
|
+ unsigned list_idx;
|
|
+#endif
|
|
+
|
|
unsigned written; /* sectors */
|
|
u16 flags;
|
|
s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
|
|
@@ -64,7 +96,7 @@ struct bch_write_op {
|
|
struct bpos pos;
|
|
struct bversion version;
|
|
|
|
- /* For BCH_WRITE_DATA_ENCODED: */
|
|
+ /* For BCH_WRITE_data_encoded: */
|
|
struct bch_extent_crc_unpacked crc;
|
|
|
|
struct write_point_specifier write_point;
|
|
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
|
|
index 05b1250619ec..f2963a6cca88 100644
|
|
--- a/fs/bcachefs/journal.c
|
|
+++ b/fs/bcachefs/journal.c
|
|
@@ -12,6 +12,7 @@
|
|
#include "btree_update.h"
|
|
#include "btree_write_buffer.h"
|
|
#include "buckets.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "journal.h"
|
|
#include "journal_io.h"
|
|
@@ -20,13 +21,6 @@
|
|
#include "journal_seq_blacklist.h"
|
|
#include "trace.h"
|
|
|
|
-static const char * const bch2_journal_errors[] = {
|
|
-#define x(n) #n,
|
|
- JOURNAL_ERRORS()
|
|
-#undef x
|
|
- NULL
|
|
-};
|
|
-
|
|
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
|
|
{
|
|
return seq > j->seq_ondisk;
|
|
@@ -56,14 +50,20 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
|
|
prt_printf(out, "seq:\t%llu\n", seq);
|
|
printbuf_indent_add(out, 2);
|
|
|
|
- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
|
|
+ if (!buf->write_started)
|
|
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
|
|
|
|
- prt_printf(out, "size:\t");
|
|
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
|
|
- prt_newline(out);
|
|
+ struct closure *cl = &buf->io;
|
|
+ int r = atomic_read(&cl->remaining);
|
|
+ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
|
|
+
|
|
+ if (buf->data) {
|
|
+ prt_printf(out, "size:\t");
|
|
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
|
|
+ prt_newline(out);
|
|
+ }
|
|
|
|
- prt_printf(out, "expires:\t");
|
|
- prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
|
|
+ prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);
|
|
|
|
prt_printf(out, "flags:\t");
|
|
if (buf->noflush)
|
|
@@ -87,6 +87,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
|
|
|
|
static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
|
|
{
|
|
+ lockdep_assert_held(&j->lock);
|
|
+ out->atomic++;
|
|
+
|
|
if (!out->nr_tabstops)
|
|
printbuf_tabstop_push(out, 24);
|
|
|
|
@@ -95,6 +98,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
|
|
seq++)
|
|
bch2_journal_buf_to_text(out, j, seq);
|
|
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
|
|
+
|
|
+ --out->atomic;
|
|
}
|
|
|
|
static inline struct journal_buf *
|
|
@@ -104,10 +109,8 @@ journal_seq_to_buf(struct journal *j, u64 seq)
|
|
|
|
EBUG_ON(seq > journal_cur_seq(j));
|
|
|
|
- if (journal_seq_unwritten(j, seq)) {
|
|
+ if (journal_seq_unwritten(j, seq))
|
|
buf = j->buf + (seq & JOURNAL_BUF_MASK);
|
|
- EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
|
|
- }
|
|
return buf;
|
|
}
|
|
|
|
@@ -139,8 +142,10 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
|
|
bool stuck = false;
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
- if (!(error == JOURNAL_ERR_journal_full ||
|
|
- error == JOURNAL_ERR_journal_pin_full) ||
|
|
+ buf.atomic++;
|
|
+
|
|
+ if (!(error == -BCH_ERR_journal_full ||
|
|
+ error == -BCH_ERR_journal_pin_full) ||
|
|
nr_unwritten_journal_entries(j) ||
|
|
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
|
|
return stuck;
|
|
@@ -164,12 +169,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
|
|
return stuck;
|
|
}
|
|
j->err_seq = journal_cur_seq(j);
|
|
- spin_unlock(&j->lock);
|
|
|
|
- bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
|
|
- bch2_journal_errors[error]);
|
|
- bch2_journal_debug_to_text(&buf, j);
|
|
- bch_err(c, "%s", buf.buf);
|
|
+ __bch2_journal_debug_to_text(&buf, j);
|
|
+ spin_unlock(&j->lock);
|
|
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"),
|
|
+ bch2_err_str(error));
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
|
|
printbuf_reset(&buf);
|
|
bch2_journal_pins_to_text(&buf, j);
|
|
@@ -195,7 +200,8 @@ void bch2_journal_do_writes(struct journal *j)
|
|
if (w->write_started)
|
|
continue;
|
|
|
|
- if (!journal_state_count(j->reservations, idx)) {
|
|
+ if (!journal_state_seq_count(j, j->reservations, seq)) {
|
|
+ j->seq_write_started = seq;
|
|
w->write_started = true;
|
|
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
|
|
}
|
|
@@ -276,7 +282,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
|
|
|
|
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
|
|
buf->u64s_reserved) << c->block_bits;
|
|
- BUG_ON(sectors > buf->sectors);
|
|
+ if (unlikely(sectors > buf->sectors)) {
|
|
+ struct printbuf err = PRINTBUF;
|
|
+ err.atomic++;
|
|
+
|
|
+ prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
|
|
+ sectors, buf->sectors);
|
|
+ prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
|
|
+ le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
|
|
+ j->cur_entry_u64s,
|
|
+ c->block_bits);
|
|
+ prt_printf(&err, "fatal error - emergency read only");
|
|
+ bch2_journal_halt_locked(j);
|
|
+
|
|
+ bch_err(c, "%s", err.buf);
|
|
+ printbuf_exit(&err);
|
|
+ return;
|
|
+ }
|
|
+
|
|
buf->sectors = sectors;
|
|
|
|
/*
|
|
@@ -306,17 +329,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
|
|
|
|
bch2_journal_space_available(j);
|
|
|
|
- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
|
|
-}
|
|
-
|
|
-void bch2_journal_halt(struct journal *j)
|
|
-{
|
|
- spin_lock(&j->lock);
|
|
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
|
|
- if (!j->err_seq)
|
|
- j->err_seq = journal_cur_seq(j);
|
|
- journal_wake(j);
|
|
- spin_unlock(&j->lock);
|
|
+ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
|
|
}
|
|
|
|
void bch2_journal_halt_locked(struct journal *j)
|
|
@@ -329,6 +342,13 @@ void bch2_journal_halt_locked(struct journal *j)
|
|
journal_wake(j);
|
|
}
|
|
|
|
+void bch2_journal_halt(struct journal *j)
|
|
+{
|
|
+ spin_lock(&j->lock);
|
|
+ bch2_journal_halt_locked(j);
|
|
+ spin_unlock(&j->lock);
|
|
+}
|
|
+
|
|
static bool journal_entry_want_write(struct journal *j)
|
|
{
|
|
bool ret = !journal_entry_is_open(j) ||
|
|
@@ -377,29 +397,41 @@ static int journal_entry_open(struct journal *j)
|
|
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
|
|
|
|
if (j->blocked)
|
|
- return JOURNAL_ERR_blocked;
|
|
+ return -BCH_ERR_journal_blocked;
|
|
|
|
if (j->cur_entry_error)
|
|
return j->cur_entry_error;
|
|
|
|
- if (bch2_journal_error(j))
|
|
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
|
|
+ int ret = bch2_journal_error(j);
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
|
|
if (!fifo_free(&j->pin))
|
|
- return JOURNAL_ERR_journal_pin_full;
|
|
+ return -BCH_ERR_journal_pin_full;
|
|
|
|
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
|
|
- return JOURNAL_ERR_max_in_flight;
|
|
+ return -BCH_ERR_journal_max_in_flight;
|
|
+
|
|
+ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
|
|
+ return -BCH_ERR_journal_max_open;
|
|
|
|
if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
|
|
bch_err(c, "cannot start: journal seq overflow");
|
|
if (bch2_fs_emergency_read_only_locked(c))
|
|
bch_err(c, "fatal error - emergency read only");
|
|
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
|
|
+ return -BCH_ERR_journal_shutdown;
|
|
}
|
|
|
|
+ if (!j->free_buf && !buf->data)
|
|
+ return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
|
|
+
|
|
BUG_ON(!j->cur_entry_sectors);
|
|
|
|
+ if (!buf->data) {
|
|
+ swap(buf->data, j->free_buf);
|
|
+ swap(buf->buf_size, j->free_buf_size);
|
|
+ }
|
|
+
|
|
buf->expires =
|
|
(journal_cur_seq(j) == j->flushed_seq_ondisk
|
|
? jiffies
|
|
@@ -415,7 +447,7 @@ static int journal_entry_open(struct journal *j)
|
|
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
|
|
|
|
if (u64s <= (ssize_t) j->early_journal_entries.nr)
|
|
- return JOURNAL_ERR_journal_full;
|
|
+ return -BCH_ERR_journal_full;
|
|
|
|
if (fifo_empty(&j->pin) && j->reclaim_thread)
|
|
wake_up_process(j->reclaim_thread);
|
|
@@ -464,7 +496,7 @@ static int journal_entry_open(struct journal *j)
|
|
|
|
new.idx++;
|
|
BUG_ON(journal_state_count(new, new.idx));
|
|
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
|
|
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
|
|
|
|
journal_state_inc(&new);
|
|
|
|
@@ -514,6 +546,33 @@ static void journal_write_work(struct work_struct *work)
|
|
spin_unlock(&j->lock);
|
|
}
|
|
|
|
+static void journal_buf_prealloc(struct journal *j)
|
|
+{
|
|
+ if (j->free_buf &&
|
|
+ j->free_buf_size >= j->buf_size_want)
|
|
+ return;
|
|
+
|
|
+ unsigned buf_size = j->buf_size_want;
|
|
+
|
|
+ spin_unlock(&j->lock);
|
|
+ void *buf = kvmalloc(buf_size, GFP_NOFS);
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ if (buf &&
|
|
+ (!j->free_buf ||
|
|
+ buf_size > j->free_buf_size)) {
|
|
+ swap(buf, j->free_buf);
|
|
+ swap(buf_size, j->free_buf_size);
|
|
+ }
|
|
+
|
|
+ if (unlikely(buf)) {
|
|
+ spin_unlock(&j->lock);
|
|
+ /* kvfree can sleep */
|
|
+ kvfree(buf);
|
|
+ spin_lock(&j->lock);
|
|
+ }
|
|
+}
|
|
+
|
|
static int __journal_res_get(struct journal *j, struct journal_res *res,
|
|
unsigned flags)
|
|
{
|
|
@@ -525,25 +584,28 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
|
|
if (journal_res_get_fast(j, res, flags))
|
|
return 0;
|
|
|
|
- if (bch2_journal_error(j))
|
|
- return -BCH_ERR_erofs_journal_err;
|
|
+ ret = bch2_journal_error(j);
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
|
|
if (j->blocked)
|
|
- return -BCH_ERR_journal_res_get_blocked;
|
|
+ return -BCH_ERR_journal_blocked;
|
|
|
|
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
|
|
- ret = JOURNAL_ERR_journal_full;
|
|
+ ret = -BCH_ERR_journal_full;
|
|
can_discard = j->can_discard;
|
|
goto out;
|
|
}
|
|
|
|
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
|
|
- ret = JOURNAL_ERR_max_in_flight;
|
|
+ ret = -BCH_ERR_journal_max_in_flight;
|
|
goto out;
|
|
}
|
|
|
|
spin_lock(&j->lock);
|
|
|
|
+ journal_buf_prealloc(j);
|
|
+
|
|
/*
|
|
* Recheck after taking the lock, so we don't race with another thread
|
|
* that just did journal_entry_open() and call bch2_journal_entry_close()
|
|
@@ -566,25 +628,48 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
|
|
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
|
|
|
|
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
|
|
- ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
|
|
+ ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
|
|
unlock:
|
|
can_discard = j->can_discard;
|
|
spin_unlock(&j->lock);
|
|
out:
|
|
- if (ret == JOURNAL_ERR_retry)
|
|
- goto retry;
|
|
- if (!ret)
|
|
+ if (likely(!ret))
|
|
return 0;
|
|
+ if (ret == -BCH_ERR_journal_retry_open)
|
|
+ goto retry;
|
|
|
|
if (journal_error_check_stuck(j, ret, flags))
|
|
- ret = -BCH_ERR_journal_res_get_blocked;
|
|
+ ret = -BCH_ERR_journal_stuck;
|
|
+
|
|
+ if (ret == -BCH_ERR_journal_max_in_flight &&
|
|
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
|
|
+ trace_journal_entry_full_enabled()) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_printbuf_make_room(&buf, 4096);
|
|
|
|
- if (ret == JOURNAL_ERR_max_in_flight &&
|
|
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
|
|
+ spin_lock(&j->lock);
|
|
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
|
|
+ bch2_journal_bufs_to_text(&buf, j);
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ trace_journal_entry_full(c, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ count_event(c, journal_entry_full);
|
|
+ }
|
|
|
|
+ if (ret == -BCH_ERR_journal_max_open &&
|
|
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
|
|
+ trace_journal_entry_full_enabled()) {
|
|
struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_printbuf_make_room(&buf, 4096);
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
|
|
bch2_journal_bufs_to_text(&buf, j);
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
trace_journal_entry_full(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
count_event(c, journal_entry_full);
|
|
@@ -594,8 +679,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
|
|
* Journal is full - can't rely on reclaim from work item due to
|
|
* freezing:
|
|
*/
|
|
- if ((ret == JOURNAL_ERR_journal_full ||
|
|
- ret == JOURNAL_ERR_journal_pin_full) &&
|
|
+ if ((ret == -BCH_ERR_journal_full ||
|
|
+ ret == -BCH_ERR_journal_pin_full) &&
|
|
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
|
|
if (can_discard) {
|
|
bch2_journal_do_discards(j);
|
|
@@ -608,17 +693,17 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
|
|
}
|
|
}
|
|
|
|
- return ret == JOURNAL_ERR_insufficient_devices
|
|
- ? -BCH_ERR_erofs_journal_err
|
|
- : -BCH_ERR_journal_res_get_blocked;
|
|
+ return ret;
|
|
}
|
|
|
|
static unsigned max_dev_latency(struct bch_fs *c)
|
|
{
|
|
u64 nsecs = 0;
|
|
|
|
- for_each_rw_member(c, ca)
|
|
+ rcu_read_lock();
|
|
+ for_each_rw_member_rcu(c, ca)
|
|
nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
|
|
+ rcu_read_unlock();
|
|
|
|
return nsecs_to_jiffies(nsecs);
|
|
}
|
|
@@ -640,7 +725,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
|
|
int ret;
|
|
|
|
if (closure_wait_event_timeout(&j->async_wait,
|
|
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
|
|
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
|
|
(flags & JOURNAL_RES_GET_NONBLOCK),
|
|
HZ))
|
|
return ret;
|
|
@@ -654,19 +739,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
|
|
remaining_wait = max(0, remaining_wait - HZ);
|
|
|
|
if (closure_wait_event_timeout(&j->async_wait,
|
|
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
|
|
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
|
|
(flags & JOURNAL_RES_GET_NONBLOCK),
|
|
remaining_wait))
|
|
return ret;
|
|
|
|
struct printbuf buf = PRINTBUF;
|
|
bch2_journal_debug_to_text(&buf, j);
|
|
- bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
|
|
- buf.buf);
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
|
|
printbuf_exit(&buf);
|
|
|
|
closure_wait_event(&j->async_wait,
|
|
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
|
|
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
|
|
(flags & JOURNAL_RES_GET_NONBLOCK));
|
|
return ret;
|
|
}
|
|
@@ -687,7 +772,6 @@ void bch2_journal_entry_res_resize(struct journal *j,
|
|
goto out;
|
|
|
|
j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
|
|
- smp_mb();
|
|
state = READ_ONCE(j->reservations);
|
|
|
|
if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
|
|
@@ -906,11 +990,11 @@ int bch2_journal_meta(struct journal *j)
|
|
{
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
|
|
- return -EROFS;
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
|
|
+ return -BCH_ERR_erofs_no_writes;
|
|
|
|
int ret = __bch2_journal_meta(j);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_journal);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
|
|
return ret;
|
|
}
|
|
|
|
@@ -951,7 +1035,8 @@ static void __bch2_journal_block(struct journal *j)
|
|
new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
|
|
} while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
|
|
|
|
- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
|
|
+ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
|
|
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
|
|
}
|
|
}
|
|
|
|
@@ -992,7 +1077,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
|
|
*blocked = true;
|
|
}
|
|
|
|
- ret = journal_state_count(s, idx) > open
|
|
+ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
|
|
? ERR_PTR(-EAGAIN)
|
|
: buf;
|
|
break;
|
|
@@ -1213,6 +1298,16 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
|
|
|
int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
|
|
{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+
|
|
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal)))
|
|
+ return 0;
|
|
+
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
|
|
+ bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
|
|
+ return -BCH_ERR_erofs_filesystem_full;
|
|
+ }
|
|
+
|
|
unsigned nr;
|
|
int ret;
|
|
|
|
@@ -1233,7 +1328,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
|
|
min(1 << 13,
|
|
(1 << 24) / ca->mi.bucket_size));
|
|
|
|
- ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
|
|
+ ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
|
|
err:
|
|
bch_err_fn(ca, ret);
|
|
return ret;
|
|
@@ -1241,13 +1336,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
|
|
|
|
int bch2_fs_journal_alloc(struct bch_fs *c)
|
|
{
|
|
- for_each_online_member(c, ca) {
|
|
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) {
|
|
if (ca->journal.nr)
|
|
continue;
|
|
|
|
int ret = bch2_dev_journal_alloc(ca, true);
|
|
if (ret) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_fs_journal_alloc);
|
|
return ret;
|
|
}
|
|
}
|
|
@@ -1336,19 +1432,26 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
|
|
|
|
nr = cur_seq - last_seq;
|
|
|
|
- if (nr + 1 > j->pin.size) {
|
|
- free_fifo(&j->pin);
|
|
- init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
|
|
- if (!j->pin.data) {
|
|
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
|
|
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
|
|
- }
|
|
+ /*
|
|
+ * Extra fudge factor, in case we crashed when the journal pin fifo was
|
|
+ * nearly or completely full. We'll need to be able to open additional
|
|
+ * journal entries (at least a few) in order for journal replay to get
|
|
+ * going:
|
|
+ */
|
|
+ nr += nr / 4;
|
|
+
|
|
+ nr = max(nr, JOURNAL_PIN);
|
|
+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
|
|
+ if (!j->pin.data) {
|
|
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
|
|
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
|
|
}
|
|
|
|
j->replay_journal_seq = last_seq;
|
|
j->replay_journal_seq_end = cur_seq;
|
|
j->last_seq_ondisk = last_seq;
|
|
j->flushed_seq_ondisk = cur_seq - 1;
|
|
+ j->seq_write_started = cur_seq - 1;
|
|
j->seq_ondisk = cur_seq - 1;
|
|
j->pin.front = last_seq;
|
|
j->pin.back = cur_seq;
|
|
@@ -1385,19 +1488,29 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
|
|
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
|
|
|
|
spin_lock(&j->lock);
|
|
-
|
|
- set_bit(JOURNAL_running, &j->flags);
|
|
j->last_flush_write = jiffies;
|
|
|
|
- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
|
|
- j->reservations.unwritten_idx++;
|
|
+ j->reservations.idx = journal_cur_seq(j);
|
|
|
|
c->last_bucket_seq_cleanup = journal_cur_seq(j);
|
|
+ spin_unlock(&j->lock);
|
|
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_journal_set_replay_done(struct journal *j)
|
|
+{
|
|
+ /*
|
|
+ * journal_space_available must happen before setting JOURNAL_running
|
|
+ * JOURNAL_running must happen before JOURNAL_replay_done
|
|
+ */
|
|
+ spin_lock(&j->lock);
|
|
bch2_journal_space_available(j);
|
|
- spin_unlock(&j->lock);
|
|
|
|
- return bch2_journal_reclaim_start(j);
|
|
+ set_bit(JOURNAL_need_flush_write, &j->flags);
|
|
+ set_bit(JOURNAL_running, &j->flags);
|
|
+ set_bit(JOURNAL_replay_done, &j->flags);
|
|
+ spin_unlock(&j->lock);
|
|
}
|
|
|
|
/* init/exit: */
|
|
@@ -1443,7 +1556,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
|
|
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
|
|
- ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
|
|
+ ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
|
|
nr_bvecs), GFP_KERNEL);
|
|
if (!ja->bio[i])
|
|
return -BCH_ERR_ENOMEM_dev_journal_init;
|
|
@@ -1482,10 +1595,11 @@ void bch2_fs_journal_exit(struct journal *j)
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
|
|
kvfree(j->buf[i].data);
|
|
+ kvfree(j->free_buf);
|
|
free_fifo(&j->pin);
|
|
}
|
|
|
|
-int bch2_fs_journal_init(struct journal *j)
|
|
+void bch2_fs_journal_init_early(struct journal *j)
|
|
{
|
|
static struct lock_class_key res_key;
|
|
|
|
@@ -1504,19 +1618,17 @@ int bch2_fs_journal_init(struct journal *j)
|
|
atomic64_set(&j->reservations.counter,
|
|
((union journal_res_state)
|
|
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
|
|
+}
|
|
|
|
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
|
|
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
|
|
+int bch2_fs_journal_init(struct journal *j)
|
|
+{
|
|
+ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
|
|
+ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
|
|
+ if (!j->free_buf)
|
|
+ return -BCH_ERR_ENOMEM_journal_buf;
|
|
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
|
|
- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
|
|
- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
|
|
- if (!j->buf[i].data)
|
|
- return -BCH_ERR_ENOMEM_journal_buf;
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
|
|
j->buf[i].idx = i;
|
|
- }
|
|
-
|
|
- j->pin.front = j->pin.back = 1;
|
|
|
|
j->wq = alloc_workqueue("bcachefs_journal",
|
|
WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
|
|
@@ -1564,6 +1676,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
|
|
prt_printf(out, "average write size:\t");
|
|
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
|
|
prt_newline(out);
|
|
+ prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
|
|
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
|
|
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
|
|
prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
|
|
@@ -1571,7 +1684,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
|
|
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
|
|
prt_printf(out, "blocked:\t%u\n", j->blocked);
|
|
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
|
|
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
|
|
+ prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
|
|
prt_printf(out, "current entry:\t");
|
|
|
|
switch (s.cur_entry_offset) {
|
|
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
|
|
index 107f7f901cd9..8ff00a0ec778 100644
|
|
--- a/fs/bcachefs/journal.h
|
|
+++ b/fs/bcachefs/journal.h
|
|
@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j)
|
|
closure_wake_up(&j->async_wait);
|
|
}
|
|
|
|
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
|
|
-{
|
|
- return j->buf + j->reservations.idx;
|
|
-}
|
|
-
|
|
/* Sequence number of oldest dirty journal entry */
|
|
|
|
static inline u64 journal_last_seq(struct journal *j)
|
|
@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j)
|
|
return j->seq_ondisk + 1;
|
|
}
|
|
|
|
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
|
|
+{
|
|
+ unsigned idx = (journal_cur_seq(j) &
|
|
+ JOURNAL_BUF_MASK &
|
|
+ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
|
|
+
|
|
+ return j->buf + idx;
|
|
+}
|
|
+
|
|
static inline int journal_state_count(union journal_res_state s, int idx)
|
|
{
|
|
switch (idx) {
|
|
@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx)
|
|
BUG();
|
|
}
|
|
|
|
+static inline int journal_state_seq_count(struct journal *j,
|
|
+ union journal_res_state s, u64 seq)
|
|
+{
|
|
+ if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
|
|
+ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
|
|
+ else
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static inline void journal_state_inc(union journal_res_state *s)
|
|
{
|
|
s->buf0_count += s->idx == 0;
|
|
@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
|
|
static inline struct jset_entry *
|
|
journal_res_entry(struct journal *j, struct journal_res *res)
|
|
{
|
|
- return vstruct_idx(j->buf[res->idx].data, res->offset);
|
|
+ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
|
|
}
|
|
|
|
static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
|
|
@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *);
|
|
void bch2_journal_do_writes(struct journal *);
|
|
void bch2_journal_buf_put_final(struct journal *, u64);
|
|
|
|
-static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
|
|
+static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
|
|
{
|
|
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
|
|
union journal_res_state s;
|
|
|
|
s = journal_state_buf_put(j, idx);
|
|
@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
|
|
bch2_journal_buf_put_final(j, seq);
|
|
}
|
|
|
|
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
|
|
+static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
|
|
{
|
|
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
|
|
union journal_res_state s;
|
|
|
|
s = journal_state_buf_put(j, idx);
|
|
@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j,
|
|
BCH_JSET_ENTRY_btree_keys,
|
|
0, 0, 0);
|
|
|
|
- bch2_journal_buf_put(j, res->idx, res->seq);
|
|
+ bch2_journal_buf_put(j, res->seq);
|
|
|
|
res->ref = 0;
|
|
}
|
|
@@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j,
|
|
|
|
/*
|
|
* Check if there is still room in the current journal
|
|
- * entry:
|
|
+ * entry, smp_rmb() guarantees that reads from reservations.counter
|
|
+ * occur before accessing cur_entry_u64s:
|
|
*/
|
|
+ smp_rmb();
|
|
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
|
|
return 0;
|
|
|
|
@@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j,
|
|
&old.v, new.v));
|
|
|
|
res->ref = true;
|
|
- res->idx = old.idx;
|
|
res->offset = old.cur_entry_offset;
|
|
- res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
|
|
+ res->seq = journal_cur_seq(j);
|
|
+ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
|
|
return 1;
|
|
}
|
|
|
|
@@ -390,6 +407,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
|
|
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,
|
|
NULL, _THIS_IP_);
|
|
EBUG_ON(!res->ref);
|
|
+ BUG_ON(!res->seq);
|
|
}
|
|
return 0;
|
|
}
|
|
@@ -408,8 +426,8 @@ int bch2_journal_flush(struct journal *);
|
|
bool bch2_journal_noflush_seq(struct journal *, u64, u64);
|
|
int bch2_journal_meta(struct journal *);
|
|
|
|
-void bch2_journal_halt(struct journal *);
|
|
void bch2_journal_halt_locked(struct journal *);
|
|
+void bch2_journal_halt(struct journal *);
|
|
|
|
static inline int bch2_journal_error(struct journal *j)
|
|
{
|
|
@@ -419,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j)
|
|
|
|
struct bch_dev;
|
|
|
|
-static inline void bch2_journal_set_replay_done(struct journal *j)
|
|
-{
|
|
- BUG_ON(!test_bit(JOURNAL_running, &j->flags));
|
|
- set_bit(JOURNAL_replay_done, &j->flags);
|
|
-}
|
|
-
|
|
void bch2_journal_unblock(struct journal *);
|
|
void bch2_journal_block(struct journal *);
|
|
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
|
|
@@ -441,10 +453,12 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
|
|
|
|
void bch2_fs_journal_stop(struct journal *);
|
|
int bch2_fs_journal_start(struct journal *, u64);
|
|
+void bch2_journal_set_replay_done(struct journal *);
|
|
|
|
void bch2_dev_journal_exit(struct bch_dev *);
|
|
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
|
|
void bch2_fs_journal_exit(struct journal *);
|
|
+void bch2_fs_journal_init_early(struct journal *);
|
|
int bch2_fs_journal_init(struct journal *);
|
|
|
|
#endif /* _BCACHEFS_JOURNAL_H */
|
|
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
|
|
index 11c39e0c34f4..06f7b018492c 100644
|
|
--- a/fs/bcachefs/journal_io.c
|
|
+++ b/fs/bcachefs/journal_io.c
|
|
@@ -19,6 +19,7 @@
|
|
|
|
#include <linux/ioprio.h>
|
|
#include <linux/string_choices.h>
|
|
+#include <linux/sched/sysctl.h>
|
|
|
|
void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
|
|
{
|
|
@@ -214,12 +215,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
|
|
|
|
fsck_err_on(same_device,
|
|
c, journal_entry_dup_same_device,
|
|
- "duplicate journal entry on same device\n %s",
|
|
+ "duplicate journal entry on same device\n%s",
|
|
buf.buf);
|
|
|
|
fsck_err_on(not_identical,
|
|
c, journal_entry_replicas_data_mismatch,
|
|
- "found duplicate but non identical journal entries\n %s",
|
|
+ "found duplicate but non identical journal entries\n%s",
|
|
buf.buf);
|
|
|
|
if (entry_ptr.csum_good && !identical)
|
|
@@ -308,8 +309,8 @@ static void journal_entry_err_msg(struct printbuf *out,
|
|
break; \
|
|
case WRITE: \
|
|
bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
|
|
- bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
|
|
- if (bch2_fs_inconsistent(c)) { \
|
|
+ if (bch2_fs_inconsistent(c, \
|
|
+ "corrupt metadata before write: %s\n", _buf.buf)) {\
|
|
ret = -BCH_ERR_fsck_errors_not_fixed; \
|
|
goto fsck_err; \
|
|
} \
|
|
@@ -764,6 +765,23 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
|
|
journal_entry_btree_keys_to_text(out, c, entry);
|
|
}
|
|
|
|
+static int journal_entry_log_bkey_validate(struct bch_fs *c,
|
|
+ struct jset *jset,
|
|
+ struct jset_entry *entry,
|
|
+ unsigned version, int big_endian,
|
|
+ struct bkey_validate_context from)
|
|
+{
|
|
+ from.flags = 0;
|
|
+ return journal_entry_btree_keys_validate(c, jset, entry,
|
|
+ version, big_endian, from);
|
|
+}
|
|
+
|
|
+static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct jset_entry *entry)
|
|
+{
|
|
+ journal_entry_btree_keys_to_text(out, c, entry);
|
|
+}
|
|
+
|
|
static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
|
|
struct jset *jset,
|
|
struct jset_entry *entry,
|
|
@@ -1041,13 +1059,19 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|
bio->bi_iter.bi_sector = offset;
|
|
bch2_bio_map(bio, buf->data, sectors_read << 9);
|
|
|
|
+ u64 submit_time = local_clock();
|
|
ret = submit_bio_wait(bio);
|
|
kfree(bio);
|
|
|
|
- if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
|
|
- "journal read error: sector %llu",
|
|
- offset) ||
|
|
- bch2_meta_read_fault("journal")) {
|
|
+ if (!ret && bch2_meta_read_fault("journal"))
|
|
+ ret = -BCH_ERR_EIO_fault_injected;
|
|
+
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
|
+ submit_time, !ret);
|
|
+
|
|
+ if (ret) {
|
|
+ bch_err_dev_ratelimited(ca,
|
|
+ "journal read error: sector %llu", offset);
|
|
/*
|
|
* We don't error out of the recovery process
|
|
* here, since the relevant journal entry may be
|
|
@@ -1110,13 +1134,16 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|
struct bch_csum csum;
|
|
csum_good = jset_csum_good(c, j, &csum);
|
|
|
|
- if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
|
|
- "%s",
|
|
- (printbuf_reset(&err),
|
|
- prt_str(&err, "journal "),
|
|
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
|
|
- err.buf)))
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
|
|
+
|
|
+ if (!csum_good) {
|
|
+ bch_err_dev_ratelimited(ca, "%s",
|
|
+ (printbuf_reset(&err),
|
|
+ prt_str(&err, "journal "),
|
|
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
|
|
+ err.buf));
|
|
saw_bad = true;
|
|
+ }
|
|
|
|
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
|
|
j->encrypted_start,
|
|
@@ -1192,7 +1219,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
|
|
out:
|
|
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
|
|
kvfree(buf.data);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
|
|
closure_return(cl);
|
|
return;
|
|
err:
|
|
@@ -1227,7 +1254,8 @@ int bch2_journal_read(struct bch_fs *c,
|
|
|
|
if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
ca->mi.state == BCH_MEMBER_STATE_ro) &&
|
|
- percpu_ref_tryget(&ca->io_ref))
|
|
+ enumerated_ref_tryget(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_journal_read))
|
|
closure_call(&ca->journal.read,
|
|
bch2_journal_read_device,
|
|
system_unbound_wq,
|
|
@@ -1236,7 +1264,8 @@ int bch2_journal_read(struct bch_fs *c,
|
|
degraded = true;
|
|
}
|
|
|
|
- closure_sync(&jlist.cl);
|
|
+ while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
|
|
+ ;
|
|
|
|
if (jlist.ret)
|
|
return jlist.ret;
|
|
@@ -1362,8 +1391,8 @@ int bch2_journal_read(struct bch_fs *c,
|
|
missing_end = seq - 1;
|
|
fsck_err(c, journal_entries_missing,
|
|
"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
|
|
- " prev at %s\n"
|
|
- " next at %s, continue?",
|
|
+ "prev at %s\n"
|
|
+ "next at %s, continue?",
|
|
missing_start, missing_end,
|
|
*last_seq, *blacklist_seq - 1,
|
|
buf1.buf, buf2.buf);
|
|
@@ -1377,7 +1406,7 @@ int bch2_journal_read(struct bch_fs *c,
|
|
}
|
|
|
|
genradix_for_each(&c->journal_entries, radix_iter, _i) {
|
|
- struct bch_replicas_padded replicas = {
|
|
+ union bch_replicas_padded replicas = {
|
|
.e.data_type = BCH_DATA_journal,
|
|
.e.nr_devs = 0,
|
|
.e.nr_required = 1,
|
|
@@ -1417,7 +1446,7 @@ int bch2_journal_read(struct bch_fs *c,
|
|
!bch2_replicas_marked(c, &replicas.e) &&
|
|
(le64_to_cpu(i->j.seq) == *last_seq ||
|
|
fsck_err(c, journal_entry_replicas_not_marked,
|
|
- "superblock not marked as containing replicas for journal entry %llu\n %s",
|
|
+ "superblock not marked as containing replicas for journal entry %llu\n%s",
|
|
le64_to_cpu(i->j.seq), buf.buf))) {
|
|
ret = bch2_mark_replicas(c, &replicas.e);
|
|
if (ret)
|
|
@@ -1434,10 +1463,11 @@ int bch2_journal_read(struct bch_fs *c,
|
|
|
|
static void journal_advance_devs_to_next_bucket(struct journal *j,
|
|
struct dev_alloc_list *devs,
|
|
- unsigned sectors, u64 seq)
|
|
+ unsigned sectors, __le64 seq)
|
|
{
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
+ rcu_read_lock();
|
|
darray_for_each(*devs, i) {
|
|
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
|
|
if (!ca)
|
|
@@ -1459,6 +1489,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
|
|
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
|
|
}
|
|
}
|
|
+ rcu_read_unlock();
|
|
}
|
|
|
|
static void __journal_write_alloc(struct journal *j,
|
|
@@ -1471,7 +1502,8 @@ static void __journal_write_alloc(struct journal *j,
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
darray_for_each(*devs, i) {
|
|
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
|
|
+ struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
|
|
+ BCH_DEV_WRITE_REF_journal_write);
|
|
if (!ca)
|
|
continue;
|
|
|
|
@@ -1485,8 +1517,10 @@ static void __journal_write_alloc(struct journal *j,
|
|
ca->mi.state != BCH_MEMBER_STATE_rw ||
|
|
!ja->nr ||
|
|
bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
|
|
- sectors > ja->sectors_free)
|
|
+ sectors > ja->sectors_free) {
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
|
|
continue;
|
|
+ }
|
|
|
|
bch2_dev_stripe_increment(ca, &j->wp.stripe);
|
|
|
|
@@ -1509,15 +1543,8 @@ static void __journal_write_alloc(struct journal *j,
|
|
}
|
|
}
|
|
|
|
-/**
|
|
- * journal_write_alloc - decide where to write next journal entry
|
|
- *
|
|
- * @j: journal object
|
|
- * @w: journal buf (entry to be written)
|
|
- *
|
|
- * Returns: 0 on success, or -EROFS on failure
|
|
- */
|
|
-static int journal_write_alloc(struct journal *j, struct journal_buf *w)
|
|
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
|
|
+ unsigned *replicas)
|
|
{
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
struct bch_devs_mask devs;
|
|
@@ -1525,29 +1552,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
|
|
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
|
|
unsigned target = c->opts.metadata_target ?:
|
|
c->opts.foreground_target;
|
|
- unsigned replicas = 0, replicas_want =
|
|
- READ_ONCE(c->opts.metadata_replicas);
|
|
+ unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
|
|
unsigned replicas_need = min_t(unsigned, replicas_want,
|
|
READ_ONCE(c->opts.metadata_replicas_required));
|
|
bool advance_done = false;
|
|
|
|
- rcu_read_lock();
|
|
-
|
|
- /* We might run more than once if we have to stop and do discards: */
|
|
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
|
|
- bkey_for_each_ptr(ptrs, p) {
|
|
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
|
|
- if (ca)
|
|
- replicas += ca->mi.durability;
|
|
- }
|
|
-
|
|
retry_target:
|
|
devs = target_rw_devs(c, BCH_DATA_journal, target);
|
|
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
|
|
retry_alloc:
|
|
- __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
|
|
+ __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
|
|
|
|
- if (likely(replicas >= replicas_want))
|
|
+ if (likely(*replicas >= replicas_want))
|
|
goto done;
|
|
|
|
if (!advance_done) {
|
|
@@ -1556,18 +1572,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
|
|
goto retry_alloc;
|
|
}
|
|
|
|
- if (replicas < replicas_want && target) {
|
|
+ if (*replicas < replicas_want && target) {
|
|
/* Retry from all devices: */
|
|
target = 0;
|
|
advance_done = false;
|
|
goto retry_target;
|
|
}
|
|
done:
|
|
- rcu_read_unlock();
|
|
-
|
|
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
|
|
|
|
- return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
|
|
+ return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
|
|
}
|
|
|
|
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
|
|
@@ -1600,18 +1614,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
|
|
kvfree(new_buf);
|
|
}
|
|
|
|
-static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
|
|
-{
|
|
- return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
|
|
-}
|
|
-
|
|
static CLOSURE_CALLBACK(journal_write_done)
|
|
{
|
|
closure_type(w, struct journal_buf, io);
|
|
struct journal *j = container_of(w, struct journal, buf[w->idx]);
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
- struct bch_replicas_padded replicas;
|
|
- union journal_res_state old, new;
|
|
+ union bch_replicas_padded replicas;
|
|
u64 seq = le64_to_cpu(w->data->seq);
|
|
int err = 0;
|
|
|
|
@@ -1620,17 +1628,27 @@ static CLOSURE_CALLBACK(journal_write_done)
|
|
: j->noflush_write_time, j->write_start_time);
|
|
|
|
if (!w->devs_written.nr) {
|
|
- bch_err(c, "unable to write journal to sufficient devices");
|
|
- err = -EIO;
|
|
+ err = -BCH_ERR_journal_write_err;
|
|
} else {
|
|
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
|
|
w->devs_written);
|
|
- if (bch2_mark_replicas(c, &replicas.e))
|
|
- err = -EIO;
|
|
+ err = bch2_mark_replicas(c, &replicas.e);
|
|
}
|
|
|
|
- if (err)
|
|
- bch2_fatal_error(c);
|
|
+ if (err && !bch2_journal_error(j)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ if (err == -BCH_ERR_journal_write_err)
|
|
+ prt_printf(&buf, "unable to write journal to sufficient devices");
|
|
+ else
|
|
+ prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err));
|
|
+
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
|
|
closure_debug_destroy(cl);
|
|
|
|
@@ -1641,7 +1659,23 @@ static CLOSURE_CALLBACK(journal_write_done)
|
|
j->err_seq = seq;
|
|
w->write_done = true;
|
|
|
|
+ if (!j->free_buf || j->free_buf_size < w->buf_size) {
|
|
+ swap(j->free_buf, w->data);
|
|
+ swap(j->free_buf_size, w->buf_size);
|
|
+ }
|
|
+
|
|
+ if (w->data) {
|
|
+ void *buf = w->data;
|
|
+ w->data = NULL;
|
|
+ w->buf_size = 0;
|
|
+
|
|
+ spin_unlock(&j->lock);
|
|
+ kvfree(buf);
|
|
+ spin_lock(&j->lock);
|
|
+ }
|
|
+
|
|
bool completed = false;
|
|
+ bool do_discards = false;
|
|
|
|
for (seq = journal_last_unwritten_seq(j);
|
|
seq <= journal_cur_seq(j);
|
|
@@ -1650,11 +1684,10 @@ static CLOSURE_CALLBACK(journal_write_done)
|
|
if (!w->write_done)
|
|
break;
|
|
|
|
- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
|
|
+ if (!j->err_seq && !w->noflush) {
|
|
j->flushed_seq_ondisk = seq;
|
|
j->last_seq_ondisk = w->last_seq;
|
|
|
|
- bch2_do_discards(c);
|
|
closure_wake_up(&c->freelist_wait);
|
|
bch2_reset_alloc_cursors(c);
|
|
}
|
|
@@ -1671,16 +1704,6 @@ static CLOSURE_CALLBACK(journal_write_done)
|
|
if (j->watermark != BCH_WATERMARK_stripe)
|
|
journal_reclaim_kick(&c->journal);
|
|
|
|
- old.v = atomic64_read(&j->reservations.counter);
|
|
- do {
|
|
- new.v = old.v;
|
|
- BUG_ON(journal_state_count(new, new.unwritten_idx));
|
|
- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
|
|
-
|
|
- new.unwritten_idx++;
|
|
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
|
|
- &old.v, new.v));
|
|
-
|
|
closure_wake_up(&w->wait);
|
|
completed = true;
|
|
}
|
|
@@ -1695,7 +1718,7 @@ static CLOSURE_CALLBACK(journal_write_done)
|
|
}
|
|
|
|
if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
|
|
- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
|
|
+ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
|
|
struct journal_buf *buf = journal_cur_buf(j);
|
|
long delta = buf->expires - jiffies;
|
|
|
|
@@ -1715,6 +1738,9 @@ static CLOSURE_CALLBACK(journal_write_done)
|
|
*/
|
|
bch2_journal_do_writes(j);
|
|
spin_unlock(&j->lock);
|
|
+
|
|
+ if (do_discards)
|
|
+ bch2_do_discards(c);
|
|
}
|
|
|
|
static void journal_write_endio(struct bio *bio)
|
|
@@ -1724,20 +1750,23 @@ static void journal_write_endio(struct bio *bio)
|
|
struct journal *j = &ca->fs->journal;
|
|
struct journal_buf *w = j->buf + jbio->buf_idx;
|
|
|
|
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
|
|
+ jbio->submit_time, !bio->bi_status);
|
|
+
|
|
+ if (bio->bi_status) {
|
|
+ bch_err_dev_ratelimited(ca,
|
|
"error writing journal entry %llu: %s",
|
|
le64_to_cpu(w->data->seq),
|
|
- bch2_blk_status_to_str(bio->bi_status)) ||
|
|
- bch2_meta_write_fault("journal")) {
|
|
- unsigned long flags;
|
|
+ bch2_blk_status_to_str(bio->bi_status));
|
|
|
|
+ unsigned long flags;
|
|
spin_lock_irqsave(&j->err_lock, flags);
|
|
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
|
|
spin_unlock_irqrestore(&j->err_lock, flags);
|
|
}
|
|
|
|
closure_put(&w->io);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
|
|
}
|
|
|
|
static CLOSURE_CALLBACK(journal_write_submit)
|
|
@@ -1748,18 +1777,17 @@ static CLOSURE_CALLBACK(journal_write_submit)
|
|
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
|
|
|
|
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
|
|
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
|
|
- if (!ca) {
|
|
- /* XXX: fix this */
|
|
- bch_err(c, "missing device for journal write\n");
|
|
- continue;
|
|
- }
|
|
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
|
|
|
|
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
|
|
sectors);
|
|
|
|
struct journal_device *ja = &ca->journal;
|
|
- struct bio *bio = &ja->bio[w->idx]->bio;
|
|
+ struct journal_bio *jbio = ja->bio[w->idx];
|
|
+ struct bio *bio = &jbio->bio;
|
|
+
|
|
+ jbio->submit_time = local_clock();
|
|
+
|
|
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
|
|
bio->bi_iter.bi_sector = ptr->offset;
|
|
bio->bi_end_io = journal_write_endio;
|
|
@@ -1791,6 +1819,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
|
|
struct journal *j = container_of(w, struct journal, buf[w->idx]);
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
+ /*
|
|
+ * Wait for previous journal writes to complete; they won't necessarily
|
|
+ * be flushed if they're still in flight
|
|
+ */
|
|
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
|
|
spin_lock(&j->lock);
|
|
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
|
|
@@ -1803,8 +1835,9 @@ static CLOSURE_CALLBACK(journal_write_preflush)
|
|
}
|
|
|
|
if (w->separate_flush) {
|
|
- for_each_rw_member(c, ca) {
|
|
- percpu_ref_get(&ca->io_ref);
|
|
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
|
|
+ enumerated_ref_get(&ca->io_ref[WRITE],
|
|
+ BCH_DEV_WRITE_REF_journal_write);
|
|
|
|
struct journal_device *ja = &ca->journal;
|
|
struct bio *bio = &ja->bio[w->idx]->bio;
|
|
@@ -1984,7 +2017,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
|
|
* write anything at all.
|
|
*/
|
|
if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
|
|
- return -EIO;
|
|
+ return error;
|
|
|
|
if (error ||
|
|
w->noflush ||
|
|
@@ -2013,13 +2046,10 @@ CLOSURE_CALLBACK(bch2_journal_write)
|
|
closure_type(w, struct journal_buf, io);
|
|
struct journal *j = container_of(w, struct journal, buf[w->idx]);
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
- struct bch_replicas_padded replicas;
|
|
- unsigned nr_rw_members = 0;
|
|
+ union bch_replicas_padded replicas;
|
|
+ unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]);
|
|
int ret;
|
|
|
|
- for_each_rw_member(c, ca)
|
|
- nr_rw_members++;
|
|
-
|
|
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
|
|
BUG_ON(!w->write_started);
|
|
BUG_ON(w->write_allocated);
|
|
@@ -2033,7 +2063,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
|
|
|
|
ret = bch2_journal_write_pick_flush(j, w);
|
|
spin_unlock(&j->lock);
|
|
- if (ret)
|
|
+
|
|
+ if (unlikely(ret))
|
|
goto err;
|
|
|
|
mutex_lock(&j->buf_lock);
|
|
@@ -2041,43 +2072,30 @@ CLOSURE_CALLBACK(bch2_journal_write)
|
|
|
|
ret = bch2_journal_write_prep(j, w);
|
|
mutex_unlock(&j->buf_lock);
|
|
- if (ret)
|
|
- goto err;
|
|
|
|
- j->entry_bytes_written += vstruct_bytes(w->data);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
|
|
+ unsigned replicas_allocated = 0;
|
|
while (1) {
|
|
- spin_lock(&j->lock);
|
|
- ret = journal_write_alloc(j, w);
|
|
+ ret = journal_write_alloc(j, w, &replicas_allocated);
|
|
if (!ret || !j->can_discard)
|
|
break;
|
|
|
|
- spin_unlock(&j->lock);
|
|
bch2_journal_do_discards(j);
|
|
}
|
|
|
|
- if (ret && !bch2_journal_error(j)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- buf.atomic++;
|
|
-
|
|
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
|
|
- le64_to_cpu(w->data->seq),
|
|
- vstruct_sectors(w->data, c->block_bits),
|
|
- bch2_err_str(ret));
|
|
- __bch2_journal_debug_to_text(&buf, j);
|
|
- spin_unlock(&j->lock);
|
|
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- }
|
|
- if (ret)
|
|
- goto err;
|
|
+ if (unlikely(ret))
|
|
+ goto err_allocate_write;
|
|
|
|
+ spin_lock(&j->lock);
|
|
/*
|
|
* write is allocated, no longer need to account for it in
|
|
* bch2_journal_space_available():
|
|
*/
|
|
w->sectors = 0;
|
|
w->write_allocated = true;
|
|
+ j->entry_bytes_written += vstruct_bytes(w->data);
|
|
|
|
/*
|
|
* journal entry has been compacted and allocated, recalculate space
|
|
@@ -2089,9 +2107,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
|
|
|
|
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
|
|
|
|
- if (c->opts.nochanges)
|
|
- goto no_io;
|
|
-
|
|
/*
|
|
* Mark journal replicas before we submit the write to guarantee
|
|
* recovery will find the journal entries after a crash.
|
|
@@ -2102,15 +2117,33 @@ CLOSURE_CALLBACK(bch2_journal_write)
|
|
if (ret)
|
|
goto err;
|
|
|
|
+ if (c->opts.nochanges)
|
|
+ goto no_io;
|
|
+
|
|
if (!JSET_NO_FLUSH(w->data))
|
|
continue_at(cl, journal_write_preflush, j->wq);
|
|
else
|
|
continue_at(cl, journal_write_submit, j->wq);
|
|
return;
|
|
-no_io:
|
|
- continue_at(cl, journal_write_done, j->wq);
|
|
- return;
|
|
+err_allocate_write:
|
|
+ if (!bch2_journal_error(j)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_journal_debug_to_text(&buf, j);
|
|
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
|
|
+ le64_to_cpu(w->data->seq),
|
|
+ vstruct_sectors(w->data, c->block_bits),
|
|
+ bch2_err_str(ret));
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
err:
|
|
bch2_fatal_error(c);
|
|
+no_io:
|
|
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
|
|
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
|
|
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
|
|
+ }
|
|
+
|
|
continue_at(cl, journal_write_done, j->wq);
|
|
}
|
|
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
|
|
index 12b39fcb4424..ffa543424e9e 100644
|
|
--- a/fs/bcachefs/journal_io.h
|
|
+++ b/fs/bcachefs/journal_io.h
|
|
@@ -2,7 +2,7 @@
|
|
#ifndef _BCACHEFS_JOURNAL_IO_H
|
|
#define _BCACHEFS_JOURNAL_IO_H
|
|
|
|
-#include "darray.h"
|
|
+#include <linux/darray_types.h>
|
|
|
|
void bch2_journal_pos_from_member_info_set(struct bch_fs *);
|
|
void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
|
|
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
|
|
index d373cd181a7f..70f36f6bc482 100644
|
|
--- a/fs/bcachefs/journal_reclaim.c
|
|
+++ b/fs/bcachefs/journal_reclaim.c
|
|
@@ -17,6 +17,8 @@
|
|
#include <linux/kthread.h>
|
|
#include <linux/sched/mm.h>
|
|
|
|
+static bool __should_discard_bucket(struct journal *, struct journal_device *);
|
|
+
|
|
/* Free space calculations: */
|
|
|
|
static unsigned journal_space_from(struct journal_device *ja,
|
|
@@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j)
|
|
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
|
|
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
|
|
|
|
- if (ja->discard_idx != ja->dirty_idx_ondisk)
|
|
- can_discard = true;
|
|
+ can_discard |= __should_discard_bucket(j, ja);
|
|
|
|
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
|
|
nr_online++;
|
|
@@ -214,19 +215,21 @@ void bch2_journal_space_available(struct journal *j)
|
|
j->can_discard = can_discard;
|
|
|
|
if (nr_online < metadata_replicas_required(c)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- buf.atomic++;
|
|
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
|
|
- "rw journal devs:", nr_online, metadata_replicas_required(c));
|
|
-
|
|
- rcu_read_lock();
|
|
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
|
|
- prt_printf(&buf, " %s", ca->name);
|
|
- rcu_read_unlock();
|
|
-
|
|
- bch_err(c, "%s", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
- ret = JOURNAL_ERR_insufficient_devices;
|
|
+ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ buf.atomic++;
|
|
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
|
|
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
|
|
+ prt_printf(&buf, " %s", ca->name);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ bch_err(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+ ret = -BCH_ERR_insufficient_journal_devices;
|
|
goto out;
|
|
}
|
|
|
|
@@ -240,7 +243,7 @@ void bch2_journal_space_available(struct journal *j)
|
|
total = j->space[journal_space_total].total;
|
|
|
|
if (!j->space[journal_space_discarded].next_entry)
|
|
- ret = JOURNAL_ERR_journal_full;
|
|
+ ret = -BCH_ERR_journal_full;
|
|
|
|
if ((j->space[journal_space_clean_ondisk].next_entry <
|
|
j->space[journal_space_clean_ondisk].total) &&
|
|
@@ -252,7 +255,10 @@ void bch2_journal_space_available(struct journal *j)
|
|
|
|
bch2_journal_set_watermark(j);
|
|
out:
|
|
- j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
|
|
+ j->cur_entry_sectors = !ret
|
|
+ ? round_down(j->space[journal_space_discarded].next_entry,
|
|
+ block_sectors(c))
|
|
+ : 0;
|
|
j->cur_entry_error = ret;
|
|
|
|
if (!ret)
|
|
@@ -261,12 +267,19 @@ void bch2_journal_space_available(struct journal *j)
|
|
|
|
/* Discards - last part of journal reclaim: */
|
|
|
|
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
|
+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
|
|
{
|
|
- bool ret;
|
|
+ unsigned min_free = max(4, ja->nr / 8);
|
|
|
|
+ return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
|
|
+ min_free &&
|
|
+ ja->discard_idx != ja->dirty_idx_ondisk;
|
|
+}
|
|
+
|
|
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
|
+{
|
|
spin_lock(&j->lock);
|
|
- ret = ja->discard_idx != ja->dirty_idx_ondisk;
|
|
+ bool ret = __should_discard_bucket(j, ja);
|
|
spin_unlock(&j->lock);
|
|
|
|
return ret;
|
|
@@ -282,12 +295,12 @@ void bch2_journal_do_discards(struct journal *j)
|
|
|
|
mutex_lock(&j->discard_lock);
|
|
|
|
- for_each_rw_member(c, ca) {
|
|
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
|
|
struct journal_device *ja = &ca->journal;
|
|
|
|
while (should_discard_bucket(j, ja)) {
|
|
if (!c->opts.nochanges &&
|
|
- ca->mi.discard &&
|
|
+ bch2_discard_opt_enabled(c, ca) &&
|
|
bdev_max_discard_sectors(ca->disk_sb.bdev))
|
|
blkdev_issue_discard(ca->disk_sb.bdev,
|
|
bucket_to_sector(ca,
|
|
@@ -614,7 +627,8 @@ static u64 journal_seq_to_flush(struct journal *j)
|
|
|
|
spin_lock(&j->lock);
|
|
|
|
- for_each_rw_member(c, ca) {
|
|
+ rcu_read_lock();
|
|
+ for_each_rw_member_rcu(c, ca) {
|
|
struct journal_device *ja = &ca->journal;
|
|
unsigned nr_buckets, bucket_to_flush;
|
|
|
|
@@ -624,12 +638,11 @@ static u64 journal_seq_to_flush(struct journal *j)
|
|
/* Try to keep the journal at most half full: */
|
|
nr_buckets = ja->nr / 2;
|
|
|
|
- nr_buckets = min(nr_buckets, ja->nr);
|
|
-
|
|
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
|
|
seq_to_flush = max(seq_to_flush,
|
|
ja->bucket_seq[bucket_to_flush]);
|
|
}
|
|
+ rcu_read_unlock();
|
|
|
|
/* Also flush if the pin fifo is more than half full */
|
|
seq_to_flush = max_t(s64, seq_to_flush,
|
|
@@ -645,7 +658,6 @@ static u64 journal_seq_to_flush(struct journal *j)
|
|
* @j: journal object
|
|
* @direct: direct or background reclaim?
|
|
* @kicked: requested to run since we last ran?
|
|
- * Returns: 0 on success, or -EIO if the journal has been shutdown
|
|
*
|
|
* Background journal reclaim writes out btree nodes. It should be run
|
|
* early enough so that we never completely run out of journal buckets.
|
|
@@ -685,11 +697,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
|
|
if (kthread && kthread_should_stop())
|
|
break;
|
|
|
|
- if (bch2_journal_error(j)) {
|
|
- ret = -EIO;
|
|
+ ret = bch2_journal_error(j);
|
|
+ if (ret)
|
|
break;
|
|
- }
|
|
|
|
+ /* XXX shove journal discards off to another thread */
|
|
bch2_journal_do_discards(j);
|
|
|
|
seq_to_flush = journal_seq_to_flush(j);
|
|
@@ -952,7 +964,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
|
seq = 0;
|
|
spin_lock(&j->lock);
|
|
while (!ret) {
|
|
- struct bch_replicas_padded replicas;
|
|
+ union bch_replicas_padded replicas;
|
|
|
|
seq = max(seq, journal_last_seq(j));
|
|
if (seq >= j->pin.back)
|
|
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
|
|
index 62b910f2fb27..68b960e08f12 100644
|
|
--- a/fs/bcachefs/journal_sb.c
|
|
+++ b/fs/bcachefs/journal_sb.c
|
|
@@ -2,8 +2,8 @@
|
|
|
|
#include "bcachefs.h"
|
|
#include "journal_sb.h"
|
|
-#include "darray.h"
|
|
|
|
+#include <linux/darray.h>
|
|
#include <linux/sort.h>
|
|
|
|
/* BCH_SB_FIELD_journal: */
|
|
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
|
|
index 1f25c111c54c..e463d2d95359 100644
|
|
--- a/fs/bcachefs/journal_seq_blacklist.c
|
|
+++ b/fs/bcachefs/journal_seq_blacklist.c
|
|
@@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
|
|
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
|
|
BUG_ON(nr != t->nr);
|
|
|
|
- unsigned i;
|
|
- for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
|
|
- src < bl->start + nr;
|
|
- src++, i = eytzinger0_next(i, nr)) {
|
|
+ src = bl->start;
|
|
+ eytzinger0_for_each(i, nr) {
|
|
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
|
|
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
|
|
|
|
if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
|
|
*dst++ = *src;
|
|
+ src++;
|
|
}
|
|
|
|
unsigned new_nr = dst - bl->start;
|
|
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
|
|
index 1ef3a28ed6ab..51104bbb99da 100644
|
|
--- a/fs/bcachefs/journal_types.h
|
|
+++ b/fs/bcachefs/journal_types.h
|
|
@@ -12,7 +12,11 @@
|
|
/* btree write buffer steals 8 bits for its own purposes: */
|
|
#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
|
|
|
|
-#define JOURNAL_BUF_BITS 2
|
|
+#define JOURNAL_STATE_BUF_BITS 2
|
|
+#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
|
|
+#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
|
|
+
|
|
+#define JOURNAL_BUF_BITS 4
|
|
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
|
|
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
|
|
|
|
@@ -82,7 +86,6 @@ struct journal_entry_pin {
|
|
|
|
struct journal_res {
|
|
bool ref;
|
|
- u8 idx;
|
|
u16 u64s;
|
|
u32 offset;
|
|
u64 seq;
|
|
@@ -98,9 +101,8 @@ union journal_res_state {
|
|
};
|
|
|
|
struct {
|
|
- u64 cur_entry_offset:20,
|
|
+ u64 cur_entry_offset:22,
|
|
idx:2,
|
|
- unwritten_idx:2,
|
|
buf0_count:10,
|
|
buf1_count:10,
|
|
buf2_count:10,
|
|
@@ -110,13 +112,13 @@ union journal_res_state {
|
|
|
|
/* bytes: */
|
|
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
|
|
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
|
|
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
|
|
|
|
/*
|
|
* We stash some journal state as sentinal values in cur_entry_offset:
|
|
* note - cur_entry_offset is in units of u64s
|
|
*/
|
|
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
|
|
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)
|
|
|
|
#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
|
|
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
|
|
@@ -149,28 +151,10 @@ enum journal_flags {
|
|
#undef x
|
|
};
|
|
|
|
-/* Reasons we may fail to get a journal reservation: */
|
|
-#define JOURNAL_ERRORS() \
|
|
- x(ok) \
|
|
- x(retry) \
|
|
- x(blocked) \
|
|
- x(max_in_flight) \
|
|
- x(journal_full) \
|
|
- x(journal_pin_full) \
|
|
- x(journal_stuck) \
|
|
- x(insufficient_devices)
|
|
-
|
|
-enum journal_errors {
|
|
-#define x(n) JOURNAL_ERR_##n,
|
|
- JOURNAL_ERRORS()
|
|
-#undef x
|
|
-};
|
|
-
|
|
-typedef DARRAY(u64) darray_u64;
|
|
-
|
|
struct journal_bio {
|
|
struct bch_dev *ca;
|
|
unsigned buf_idx;
|
|
+ u64 submit_time;
|
|
|
|
struct bio bio;
|
|
};
|
|
@@ -199,7 +183,7 @@ struct journal {
|
|
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
|
|
* insufficient devices:
|
|
*/
|
|
- enum journal_errors cur_entry_error;
|
|
+ int cur_entry_error;
|
|
unsigned cur_entry_offset_if_blocked;
|
|
|
|
unsigned buf_size_want;
|
|
@@ -220,6 +204,8 @@ struct journal {
|
|
* other is possibly being written out.
|
|
*/
|
|
struct journal_buf buf[JOURNAL_BUF_NR];
|
|
+ void *free_buf;
|
|
+ unsigned free_buf_size;
|
|
|
|
spinlock_t lock;
|
|
|
|
@@ -237,6 +223,7 @@ struct journal {
|
|
/* Sequence number of most recent journal entry (last entry in @pin) */
|
|
atomic64_t seq;
|
|
|
|
+ u64 seq_write_started;
|
|
/* seq, last_seq from the most recent journal entry successfully written */
|
|
u64 seq_ondisk;
|
|
u64 flushed_seq_ondisk;
|
|
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
|
|
index ce794d55818f..2f63fc6d456f 100644
|
|
--- a/fs/bcachefs/lru.c
|
|
+++ b/fs/bcachefs/lru.c
|
|
@@ -6,6 +6,7 @@
|
|
#include "btree_iter.h"
|
|
#include "btree_update.h"
|
|
#include "btree_write_buffer.h"
|
|
+#include "ec.h"
|
|
#include "error.h"
|
|
#include "lru.h"
|
|
#include "recovery.h"
|
|
@@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time
|
|
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
|
|
}
|
|
|
|
-int bch2_lru_change(struct btree_trans *trans,
|
|
- u16 lru_id, u64 dev_bucket,
|
|
- u64 old_time, u64 new_time)
|
|
+int __bch2_lru_change(struct btree_trans *trans,
|
|
+ u16 lru_id, u64 dev_bucket,
|
|
+ u64 old_time, u64 new_time)
|
|
{
|
|
if (old_time == new_time)
|
|
return 0;
|
|
@@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = {
|
|
};
|
|
|
|
int bch2_lru_check_set(struct btree_trans *trans,
|
|
- u16 lru_id, u64 time,
|
|
+ u16 lru_id,
|
|
+ u64 dev_bucket,
|
|
+ u64 time,
|
|
struct bkey_s_c referring_k,
|
|
struct bkey_buf *last_flushed)
|
|
{
|
|
@@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
|
|
struct btree_iter lru_iter;
|
|
struct bkey_s_c lru_k =
|
|
bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
|
|
- lru_pos(lru_id,
|
|
- bucket_to_u64(referring_k.k->p),
|
|
- time), 0);
|
|
+ lru_pos(lru_id, dev_bucket, time), 0);
|
|
int ret = bkey_err(lru_k);
|
|
if (ret)
|
|
return ret;
|
|
@@ -100,11 +101,10 @@ int bch2_lru_check_set(struct btree_trans *trans,
|
|
goto err;
|
|
|
|
if (fsck_err(trans, alloc_key_to_missing_lru_entry,
|
|
- "missing %s lru entry\n"
|
|
- " %s",
|
|
+ "missing %s lru entry\n%s",
|
|
bch2_lru_types[lru_type(lru_k)],
|
|
(bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
|
|
- ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
|
|
+ ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
|
|
if (ret)
|
|
goto err;
|
|
}
|
|
@@ -116,57 +116,81 @@ int bch2_lru_check_set(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
+static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
|
|
+{
|
|
+ enum bch_lru_type type = lru_type(lru_k);
|
|
+
|
|
+ switch (type) {
|
|
+ case BCH_LRU_read:
|
|
+ case BCH_LRU_fragmentation:
|
|
+ return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
|
|
+ case BCH_LRU_stripes:
|
|
+ return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static u64 bkey_lru_type_idx(struct bch_fs *c,
|
|
+ enum bch_lru_type type,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bch_alloc_v4 a_convert;
|
|
+ const struct bch_alloc_v4 *a;
|
|
+
|
|
+ switch (type) {
|
|
+ case BCH_LRU_read:
|
|
+ a = bch2_alloc_to_v4(k, &a_convert);
|
|
+ return alloc_lru_idx_read(*a);
|
|
+ case BCH_LRU_fragmentation: {
|
|
+ a = bch2_alloc_to_v4(k, &a_convert);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
|
|
+ u64 idx = ca
|
|
+ ? alloc_lru_idx_fragmentation(*a, ca)
|
|
+ : 0;
|
|
+ rcu_read_unlock();
|
|
+ return idx;
|
|
+ }
|
|
+ case BCH_LRU_stripes:
|
|
+ return k.k->type == KEY_TYPE_stripe
|
|
+ ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
|
|
+ : 0;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
static int bch2_check_lru_key(struct btree_trans *trans,
|
|
struct btree_iter *lru_iter,
|
|
struct bkey_s_c lru_k,
|
|
struct bkey_buf *last_flushed)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- struct bch_alloc_v4 a_convert;
|
|
- const struct bch_alloc_v4 *a;
|
|
struct printbuf buf1 = PRINTBUF;
|
|
struct printbuf buf2 = PRINTBUF;
|
|
- enum bch_lru_type type = lru_type(lru_k);
|
|
- struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
|
|
- u64 idx;
|
|
- int ret;
|
|
-
|
|
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
|
|
|
|
- if (fsck_err_on(!ca,
|
|
- trans, lru_entry_to_invalid_bucket,
|
|
- "lru key points to nonexistent device:bucket %llu:%llu",
|
|
- alloc_pos.inode, alloc_pos.offset))
|
|
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
|
|
+ struct bbpos bp = lru_pos_to_bp(lru_k);
|
|
|
|
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
|
|
- ret = bkey_err(k);
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
|
|
+ int ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- a = bch2_alloc_to_v4(k, &a_convert);
|
|
-
|
|
- switch (type) {
|
|
- case BCH_LRU_read:
|
|
- idx = alloc_lru_idx_read(*a);
|
|
- break;
|
|
- case BCH_LRU_fragmentation:
|
|
- idx = alloc_lru_idx_fragmentation(*a, ca);
|
|
- break;
|
|
- }
|
|
+ enum bch_lru_type type = lru_type(lru_k);
|
|
+ u64 idx = bkey_lru_type_idx(c, type, k);
|
|
|
|
- if (lru_k.k->type != KEY_TYPE_set ||
|
|
- lru_pos_time(lru_k.k->p) != idx) {
|
|
+ if (lru_pos_time(lru_k.k->p) != idx) {
|
|
ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
|
|
if (ret)
|
|
goto err;
|
|
|
|
if (fsck_err(trans, lru_entry_bad,
|
|
"incorrect lru entry: lru %s time %llu\n"
|
|
- " %s\n"
|
|
- " for %s",
|
|
+ "%s\n"
|
|
+ "for %s",
|
|
bch2_lru_types[type],
|
|
lru_pos_time(lru_k.k->p),
|
|
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
|
|
@@ -176,7 +200,6 @@ static int bch2_check_lru_key(struct btree_trans *trans,
|
|
err:
|
|
fsck_err:
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
- bch2_dev_put(ca);
|
|
printbuf_exit(&buf2);
|
|
printbuf_exit(&buf1);
|
|
return ret;
|
|
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
|
|
index f31a6cf1514c..8abd0aa2083a 100644
|
|
--- a/fs/bcachefs/lru.h
|
|
+++ b/fs/bcachefs/lru.h
|
|
@@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
|
|
{
|
|
u16 lru_id = l.k->p.inode >> 48;
|
|
|
|
- if (lru_id == BCH_LRU_FRAGMENTATION_START)
|
|
+ switch (lru_id) {
|
|
+ case BCH_LRU_BUCKET_FRAGMENTATION:
|
|
return BCH_LRU_fragmentation;
|
|
- return BCH_LRU_read;
|
|
+ case BCH_LRU_STRIPE_FRAGMENTATION:
|
|
+ return BCH_LRU_stripes;
|
|
+ default:
|
|
+ return BCH_LRU_read;
|
|
+ }
|
|
}
|
|
|
|
int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
|
|
@@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
|
|
|
|
int bch2_lru_del(struct btree_trans *, u16, u64, u64);
|
|
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
|
|
-int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
|
|
+int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
|
|
+
|
|
+static inline int bch2_lru_change(struct btree_trans *trans,
|
|
+ u16 lru_id, u64 dev_bucket,
|
|
+ u64 old_time, u64 new_time)
|
|
+{
|
|
+ return old_time != new_time
|
|
+ ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
|
|
+ : 0;
|
|
+}
|
|
|
|
struct bkey_buf;
|
|
-int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
|
|
+int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
|
|
|
|
int bch2_check_lrus(struct bch_fs *);
|
|
|
|
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
|
|
index f372cb3b8cda..b7392ad8e41f 100644
|
|
--- a/fs/bcachefs/lru_format.h
|
|
+++ b/fs/bcachefs/lru_format.h
|
|
@@ -9,7 +9,8 @@ struct bch_lru {
|
|
|
|
#define BCH_LRU_TYPES() \
|
|
x(read) \
|
|
- x(fragmentation)
|
|
+ x(fragmentation) \
|
|
+ x(stripes)
|
|
|
|
enum bch_lru_type {
|
|
#define x(n) BCH_LRU_##n,
|
|
@@ -17,7 +18,8 @@ enum bch_lru_type {
|
|
#undef x
|
|
};
|
|
|
|
-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
|
|
+#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
|
|
+#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
|
|
|
|
#define LRU_TIME_BITS 48
|
|
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
|
|
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
|
|
index ddc187fb693d..bb7a92270c09 100644
|
|
--- a/fs/bcachefs/migrate.c
|
|
+++ b/fs/bcachefs/migrate.c
|
|
@@ -4,10 +4,13 @@
|
|
*/
|
|
|
|
#include "bcachefs.h"
|
|
+#include "backpointers.h"
|
|
#include "bkey_buf.h"
|
|
#include "btree_update.h"
|
|
#include "btree_update_interior.h"
|
|
+#include "btree_write_buffer.h"
|
|
#include "buckets.h"
|
|
+#include "ec.h"
|
|
#include "errcode.h"
|
|
#include "extents.h"
|
|
#include "io_write.h"
|
|
@@ -15,11 +18,12 @@
|
|
#include "keylist.h"
|
|
#include "migrate.h"
|
|
#include "move.h"
|
|
+#include "progress.h"
|
|
#include "replicas.h"
|
|
#include "super-io.h"
|
|
|
|
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
|
|
- unsigned dev_idx, int flags, bool metadata)
|
|
+ unsigned dev_idx, unsigned flags, bool metadata)
|
|
{
|
|
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
|
|
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
|
|
@@ -36,11 +40,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
|
|
return 0;
|
|
}
|
|
|
|
+static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct btree *b, unsigned dev_idx, unsigned flags)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct bkey_buf k;
|
|
+
|
|
+ bch2_bkey_buf_init(&k);
|
|
+ bch2_bkey_buf_copy(&k, c, &b->key);
|
|
+
|
|
+ int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
|
|
+ bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
|
|
+
|
|
+ bch_err_fn(c, ret);
|
|
+ bch2_bkey_buf_exit(&k, c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct bkey_s_c k,
|
|
unsigned dev_idx,
|
|
- int flags)
|
|
+ unsigned flags)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct bkey_i *n;
|
|
@@ -76,7 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
+static int bch2_dev_btree_drop_key(struct btree_trans *trans,
|
|
+ struct bkey_s_c_backpointer bp,
|
|
+ unsigned dev_idx,
|
|
+ struct bkey_buf *last_flushed,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
|
|
+ int ret = PTR_ERR_OR_ZERO(b);
|
|
+ if (ret)
|
|
+ return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
|
|
+
|
|
+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
|
|
+
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_dev_usrdata_drop(struct bch_fs *c,
|
|
+ struct progress_indicator_state *progress,
|
|
+ unsigned dev_idx, unsigned flags)
|
|
{
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
enum btree_id id;
|
|
@@ -88,8 +129,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
|
|
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
|
|
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
|
|
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
|
|
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
|
|
+ }));
|
|
if (ret)
|
|
break;
|
|
}
|
|
@@ -99,7 +142,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
return ret;
|
|
}
|
|
|
|
-static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
+static int bch2_dev_metadata_drop(struct bch_fs *c,
|
|
+ struct progress_indicator_state *progress,
|
|
+ unsigned dev_idx, unsigned flags)
|
|
{
|
|
struct btree_trans *trans;
|
|
struct btree_iter iter;
|
|
@@ -123,29 +168,23 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
retry:
|
|
ret = 0;
|
|
while (bch2_trans_begin(trans),
|
|
- (b = bch2_btree_iter_peek_node(&iter)) &&
|
|
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
|
|
!(ret = PTR_ERR_OR_ZERO(b))) {
|
|
+ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
|
|
+
|
|
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
|
|
goto next;
|
|
|
|
- bch2_bkey_buf_copy(&k, c, &b->key);
|
|
-
|
|
- ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
|
|
- dev_idx, flags, true);
|
|
- if (ret)
|
|
- break;
|
|
-
|
|
- ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
|
|
+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
|
|
ret = 0;
|
|
continue;
|
|
}
|
|
|
|
- bch_err_msg(c, ret, "updating btree node key");
|
|
if (ret)
|
|
break;
|
|
next:
|
|
- bch2_btree_iter_next_node(&iter);
|
|
+ bch2_btree_iter_next_node(trans, &iter);
|
|
}
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
goto retry;
|
|
@@ -167,8 +206,72 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
+static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
|
|
+ struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct btree_iter iter;
|
|
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
|
|
+ last_flushed);
|
|
+ int ret = bkey_err(k);
|
|
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
|
|
+ return 0;
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
|
|
+ goto out;
|
|
+
|
|
+ /*
|
|
+ * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
|
|
+ * properly
|
|
+ */
|
|
+
|
|
+ if (bkey_is_btree_ptr(k.k))
|
|
+ ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
|
|
+ else if (k.k->type == KEY_TYPE_stripe)
|
|
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
|
|
+ else
|
|
+ ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
|
|
+out:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
|
|
{
|
|
- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
|
|
- bch2_dev_metadata_drop(c, dev_idx, flags);
|
|
+ struct btree_trans *trans = bch2_trans_get(c);
|
|
+
|
|
+ struct bkey_buf last_flushed;
|
|
+ bch2_bkey_buf_init(&last_flushed);
|
|
+ bkey_init(&last_flushed.k->k);
|
|
+
|
|
+ int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
|
|
+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
|
|
+ POS(dev_idx, 0),
|
|
+ POS(dev_idx, U64_MAX), 0, k,
|
|
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ if (k.k->type != KEY_TYPE_backpointer)
|
|
+ continue;
|
|
+
|
|
+ data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
|
|
+ &last_flushed, flags);
|
|
+
|
|
+ }));
|
|
+
|
|
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
|
|
+ bch2_trans_put(trans);
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
|
|
+{
|
|
+ struct progress_indicator_state progress;
|
|
+ bch2_progress_init(&progress, c,
|
|
+ BIT_ULL(BTREE_ID_extents)|
|
|
+ BIT_ULL(BTREE_ID_reflink));
|
|
+
|
|
+ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
|
|
+ bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
|
|
}
|
|
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
|
|
index 027efaa0d575..30018140711b 100644
|
|
--- a/fs/bcachefs/migrate.h
|
|
+++ b/fs/bcachefs/migrate.h
|
|
@@ -2,6 +2,7 @@
|
|
#ifndef _BCACHEFS_MIGRATE_H
|
|
#define _BCACHEFS_MIGRATE_H
|
|
|
|
-int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
|
|
+int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
|
|
+int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
|
|
|
|
#endif /* _BCACHEFS_MIGRATE_H */
|
|
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
|
|
index 160b4374160a..79f4722621d5 100644
|
|
--- a/fs/bcachefs/move.c
|
|
+++ b/fs/bcachefs/move.c
|
|
@@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = {
|
|
NULL
|
|
};
|
|
|
|
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
|
|
+static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
- if (trace_move_extent_enabled()) {
|
|
+ if (trace_io_move_enabled()) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
prt_newline(&buf);
|
|
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
|
|
- trace_move_extent(c, buf.buf);
|
|
+ trace_io_move(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
}
|
|
|
|
-static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
|
|
+static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
|
|
{
|
|
- if (trace_move_extent_read_enabled()) {
|
|
+ if (trace_io_move_read_enabled()) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
- trace_move_extent_read(c, buf.buf);
|
|
+ trace_io_move_read(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
}
|
|
@@ -67,18 +67,14 @@ static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
|
|
struct moving_io {
|
|
struct list_head read_list;
|
|
struct list_head io_list;
|
|
- struct move_bucket_in_flight *b;
|
|
+ struct move_bucket *b;
|
|
struct closure cl;
|
|
bool read_completed;
|
|
|
|
unsigned read_sectors;
|
|
unsigned write_sectors;
|
|
|
|
- struct bch_read_bio rbio;
|
|
-
|
|
struct data_update write;
|
|
- /* Must be last since it is variable size */
|
|
- struct bio_vec bi_inline_vecs[];
|
|
};
|
|
|
|
static void move_free(struct moving_io *io)
|
|
@@ -88,43 +84,85 @@ static void move_free(struct moving_io *io)
|
|
if (io->b)
|
|
atomic_dec(&io->b->count);
|
|
|
|
- bch2_data_update_exit(&io->write);
|
|
-
|
|
mutex_lock(&ctxt->lock);
|
|
list_del(&io->io_list);
|
|
wake_up(&ctxt->wait);
|
|
mutex_unlock(&ctxt->lock);
|
|
|
|
+ if (!io->write.data_opts.scrub) {
|
|
+ bch2_data_update_exit(&io->write);
|
|
+ } else {
|
|
+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
|
|
+ kfree(io->write.bvecs);
|
|
+ }
|
|
kfree(io);
|
|
}
|
|
|
|
static void move_write_done(struct bch_write_op *op)
|
|
{
|
|
struct moving_io *io = container_of(op, struct moving_io, write.op);
|
|
+ struct bch_fs *c = op->c;
|
|
struct moving_context *ctxt = io->write.ctxt;
|
|
|
|
- if (io->write.op.error)
|
|
+ if (op->error) {
|
|
+ if (trace_io_move_write_fail_enabled()) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ bch2_write_op_to_text(&buf, op);
|
|
+ trace_io_move_write_fail(c, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
|
|
+
|
|
ctxt->write_error = true;
|
|
+ }
|
|
|
|
- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
|
|
- atomic_dec(&io->write.ctxt->write_ios);
|
|
+ atomic_sub(io->write_sectors, &ctxt->write_sectors);
|
|
+ atomic_dec(&ctxt->write_ios);
|
|
move_free(io);
|
|
closure_put(&ctxt->cl);
|
|
}
|
|
|
|
static void move_write(struct moving_io *io)
|
|
{
|
|
- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
|
|
+ struct bch_fs *c = io->write.op.c;
|
|
+ struct moving_context *ctxt = io->write.ctxt;
|
|
+ struct bch_read_bio *rbio = &io->write.rbio;
|
|
+
|
|
+ if (ctxt->stats) {
|
|
+ if (rbio->bio.bi_status)
|
|
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
|
|
+ &ctxt->stats->sectors_error_uncorrected);
|
|
+ else if (rbio->saw_error)
|
|
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
|
|
+ &ctxt->stats->sectors_error_corrected);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the extent has been bitrotted, we're going to have to give it a
|
|
+ * new checksum in order to move it - but the poison bit will ensure
|
|
+ * that userspace still gets the appropriate error.
|
|
+ */
|
|
+ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
|
|
+ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
|
|
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
|
|
+ struct nonce nonce = extent_nonce(rbio->version, crc);
|
|
+
|
|
+ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
|
|
+ nonce, &rbio->bio);
|
|
+ rbio->ret = 0;
|
|
+ }
|
|
+
|
|
+ if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
|
|
move_free(io);
|
|
return;
|
|
}
|
|
|
|
- if (trace_move_extent_write_enabled()) {
|
|
- struct bch_fs *c = io->write.op.c;
|
|
+ if (trace_io_move_write_enabled()) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
|
|
- trace_move_extent_write(c, buf.buf);
|
|
+ trace_io_move_write(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
|
|
@@ -132,7 +170,7 @@ static void move_write(struct moving_io *io)
|
|
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
|
|
atomic_inc(&io->write.ctxt->write_ios);
|
|
|
|
- bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
|
|
+ bch2_data_update_read_done(&io->write);
|
|
}
|
|
|
|
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
|
|
@@ -145,7 +183,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx
|
|
|
|
static void move_read_endio(struct bio *bio)
|
|
{
|
|
- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
|
|
+ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
|
|
struct moving_context *ctxt = io->write.ctxt;
|
|
|
|
atomic_sub(io->read_sectors, &ctxt->read_sectors);
|
|
@@ -250,7 +288,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
|
|
}
|
|
|
|
int bch2_move_extent(struct moving_context *ctxt,
|
|
- struct move_bucket_in_flight *bucket_in_flight,
|
|
+ struct move_bucket *bucket_in_flight,
|
|
struct btree_iter *iter,
|
|
struct bkey_s_c k,
|
|
struct bch_io_opts io_opts,
|
|
@@ -258,14 +296,10 @@ int bch2_move_extent(struct moving_context *ctxt,
|
|
{
|
|
struct btree_trans *trans = ctxt->trans;
|
|
struct bch_fs *c = trans->c;
|
|
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
- struct moving_io *io;
|
|
- const union bch_extent_entry *entry;
|
|
- struct extent_ptr_decoded p;
|
|
- unsigned sectors = k.k->size, pages;
|
|
int ret = -ENOMEM;
|
|
|
|
- trace_move_extent2(c, k, &io_opts, &data_opts);
|
|
+ trace_io_move2(c, k, &io_opts, &data_opts);
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
|
|
|
|
if (ctxt->stats)
|
|
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
|
|
@@ -273,7 +307,8 @@ int bch2_move_extent(struct moving_context *ctxt,
|
|
bch2_data_update_opts_normalize(k, &data_opts);
|
|
|
|
if (!data_opts.rewrite_ptrs &&
|
|
- !data_opts.extra_replicas) {
|
|
+ !data_opts.extra_replicas &&
|
|
+ !data_opts.scrub) {
|
|
if (data_opts.kill_ptrs)
|
|
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
|
|
return 0;
|
|
@@ -285,13 +320,7 @@ int bch2_move_extent(struct moving_context *ctxt,
|
|
*/
|
|
bch2_trans_unlock(trans);
|
|
|
|
- /* write path might have to decompress data: */
|
|
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
|
|
-
|
|
- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
|
- io = kzalloc(sizeof(struct moving_io) +
|
|
- sizeof(struct bio_vec) * pages, GFP_KERNEL);
|
|
+ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
|
|
if (!io)
|
|
goto err;
|
|
|
|
@@ -300,31 +329,27 @@ int bch2_move_extent(struct moving_context *ctxt,
|
|
io->read_sectors = k.k->size;
|
|
io->write_sectors = k.k->size;
|
|
|
|
- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
|
|
- io->write.op.wbio.bio.bi_ioprio =
|
|
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
|
-
|
|
- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
|
|
- GFP_KERNEL))
|
|
- goto err_free;
|
|
+ if (!data_opts.scrub) {
|
|
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
|
|
+ &io_opts, data_opts, iter->btree_id, k);
|
|
+ if (ret)
|
|
+ goto err_free;
|
|
|
|
- io->rbio.c = c;
|
|
- io->rbio.opts = io_opts;
|
|
- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
|
|
- io->rbio.bio.bi_vcnt = pages;
|
|
- io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
|
- io->rbio.bio.bi_iter.bi_size = sectors << 9;
|
|
+ io->write.op.end_io = move_write_done;
|
|
+ } else {
|
|
+ bch2_bkey_buf_init(&io->write.k);
|
|
+ bch2_bkey_buf_reassemble(&io->write.k, c, k);
|
|
|
|
- io->rbio.bio.bi_opf = REQ_OP_READ;
|
|
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
|
|
- io->rbio.bio.bi_end_io = move_read_endio;
|
|
+ io->write.op.c = c;
|
|
+ io->write.data_opts = data_opts;
|
|
|
|
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
|
|
- io_opts, data_opts, iter->btree_id, k);
|
|
- if (ret)
|
|
- goto err_free_pages;
|
|
+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
|
|
+ if (ret)
|
|
+ goto err_free;
|
|
+ }
|
|
|
|
- io->write.op.end_io = move_write_done;
|
|
+ io->write.rbio.bio.bi_end_io = move_read_endio;
|
|
+ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
|
|
|
if (ctxt->rate)
|
|
bch2_ratelimit_increment(ctxt->rate, k.k->size);
|
|
@@ -339,9 +364,7 @@ int bch2_move_extent(struct moving_context *ctxt,
|
|
atomic_inc(&io->b->count);
|
|
}
|
|
|
|
- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
|
|
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
|
|
- trace_move_extent_read2(c, k);
|
|
+ trace_io_move_read2(c, k);
|
|
|
|
mutex_lock(&ctxt->lock);
|
|
atomic_add(io->read_sectors, &ctxt->read_sectors);
|
|
@@ -356,39 +379,39 @@ int bch2_move_extent(struct moving_context *ctxt,
|
|
* ctxt when doing wakeup
|
|
*/
|
|
closure_get(&ctxt->cl);
|
|
- bch2_read_extent(trans, &io->rbio,
|
|
- bkey_start_pos(k.k),
|
|
- iter->btree_id, k, 0,
|
|
- BCH_READ_NODECODE|
|
|
- BCH_READ_LAST_FRAGMENT);
|
|
+ __bch2_read_extent(trans, &io->write.rbio,
|
|
+ io->write.rbio.bio.bi_iter,
|
|
+ bkey_start_pos(k.k),
|
|
+ iter->btree_id, k, 0,
|
|
+ NULL,
|
|
+ BCH_READ_last_fragment,
|
|
+ data_opts.scrub ? data_opts.read_dev : -1);
|
|
return 0;
|
|
-err_free_pages:
|
|
- bio_free_pages(&io->write.op.wbio.bio);
|
|
err_free:
|
|
kfree(io);
|
|
err:
|
|
- if (ret == -BCH_ERR_data_update_done)
|
|
+ if (bch2_err_matches(ret, BCH_ERR_data_update_done))
|
|
return 0;
|
|
|
|
if (bch2_err_matches(ret, EROFS) ||
|
|
bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
return ret;
|
|
|
|
- count_event(c, move_extent_start_fail);
|
|
+ count_event(c, io_move_start_fail);
|
|
|
|
- if (trace_move_extent_start_fail_enabled()) {
|
|
+ if (trace_io_move_start_fail_enabled()) {
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
prt_str(&buf, ": ");
|
|
prt_str(&buf, bch2_err_str(ret));
|
|
- trace_move_extent_start_fail(c, buf.buf);
|
|
+ trace_io_move_start_fail(c, buf.buf);
|
|
printbuf_exit(&buf);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
-static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
|
|
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
|
|
struct per_snapshot_io_opts *io_opts,
|
|
struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
|
|
struct btree_iter *extent_iter,
|
|
@@ -399,6 +422,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
|
|
struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
|
|
int ret = 0;
|
|
|
|
+ if (extent_iter->min_depth)
|
|
+ return opts_ret;
|
|
+
|
|
if (extent_k.k->type == KEY_TYPE_reflink_v)
|
|
goto out;
|
|
|
|
@@ -518,11 +544,42 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
|
|
return 0;
|
|
}
|
|
|
|
-static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
- struct bpos start,
|
|
- struct bpos end,
|
|
- move_pred_fn pred, void *arg,
|
|
- enum btree_id btree_id)
|
|
+/*
|
|
+ * Move requires non extents iterators, and there's also no need for it to
|
|
+ * signal indirect_extent_missing_error:
|
|
+ */
|
|
+static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_s_c_reflink_p p)
|
|
+{
|
|
+ if (unlikely(REFLINK_P_ERROR(p.v)))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
|
|
+
|
|
+ bch2_trans_iter_init(trans, iter,
|
|
+ BTREE_ID_reflink, reflink_pos,
|
|
+ BTREE_ITER_not_extents);
|
|
+
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
|
|
+ if (!k.k || bkey_err(k)) {
|
|
+ bch2_trans_iter_exit(trans, iter);
|
|
+ return k;
|
|
+ }
|
|
+
|
|
+ if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
|
|
+ bch2_trans_iter_exit(trans, iter);
|
|
+ return bkey_s_c_null;
|
|
+ }
|
|
+
|
|
+ return k;
|
|
+}
|
|
+
|
|
+int bch2_move_data_btree(struct moving_context *ctxt,
|
|
+ struct bpos start,
|
|
+ struct bpos end,
|
|
+ move_pred_fn pred, void *arg,
|
|
+ enum btree_id btree_id, unsigned level)
|
|
{
|
|
struct btree_trans *trans = ctxt->trans;
|
|
struct bch_fs *c = trans->c;
|
|
@@ -548,10 +605,56 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
ctxt->stats->pos = BBPOS(btree_id, start);
|
|
}
|
|
|
|
+retry_root:
|
|
bch2_trans_begin(trans);
|
|
- bch2_trans_iter_init(trans, &iter, btree_id, start,
|
|
- BTREE_ITER_prefetch|
|
|
- BTREE_ITER_all_snapshots);
|
|
+
|
|
+ if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
|
|
+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
|
|
+ BTREE_ITER_prefetch|
|
|
+ BTREE_ITER_not_extents|
|
|
+ BTREE_ITER_all_snapshots);
|
|
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
|
|
+ ret = PTR_ERR_OR_ZERO(b);
|
|
+ if (ret)
|
|
+ goto root_err;
|
|
+
|
|
+ if (b != btree_node_root(c, b)) {
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ goto retry_root;
|
|
+ }
|
|
+
|
|
+ k = bkey_i_to_s_c(&b->key);
|
|
+
|
|
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
|
|
+ iter.pos, &iter, k);
|
|
+ ret = PTR_ERR_OR_ZERO(io_opts);
|
|
+ if (ret)
|
|
+ goto root_err;
|
|
+
|
|
+ memset(&data_opts, 0, sizeof(data_opts));
|
|
+ if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
|
|
+ goto out;
|
|
+
|
|
+
|
|
+ if (!data_opts.scrub)
|
|
+ ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
|
|
+ k.k->p, data_opts.target, 0);
|
|
+ else
|
|
+ ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
|
|
+
|
|
+root_err:
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ goto retry_root;
|
|
+ }
|
|
+
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
|
|
+ BTREE_ITER_prefetch|
|
|
+ BTREE_ITER_not_extents|
|
|
+ BTREE_ITER_all_snapshots);
|
|
|
|
if (ctxt->rate)
|
|
bch2_ratelimit_reset(ctxt->rate);
|
|
@@ -561,7 +664,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
- k = bch2_btree_iter_peek(&iter);
|
|
+ k = bch2_btree_iter_peek(trans, &iter);
|
|
if (!k.k)
|
|
break;
|
|
|
|
@@ -571,7 +674,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
if (ret)
|
|
break;
|
|
|
|
- if (bkey_ge(bkey_start_pos(k.k), end))
|
|
+ if (bkey_gt(bkey_start_pos(k.k), end))
|
|
break;
|
|
|
|
if (ctxt->stats)
|
|
@@ -581,17 +684,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
k.k->type == KEY_TYPE_reflink_p &&
|
|
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
|
|
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
|
|
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
|
|
|
|
bch2_trans_iter_exit(trans, &reflink_iter);
|
|
- k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
|
|
+ k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
|
|
ret = bkey_err(k);
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
continue;
|
|
if (ret)
|
|
break;
|
|
|
|
- if (bkey_deleted(k.k))
|
|
+ if (!k.k)
|
|
goto next_nondata;
|
|
|
|
/*
|
|
@@ -612,7 +714,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
continue;
|
|
|
|
memset(&data_opts, 0, sizeof(data_opts));
|
|
- if (!pred(c, arg, k, io_opts, &data_opts))
|
|
+ if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
|
|
goto next;
|
|
|
|
/*
|
|
@@ -622,12 +724,19 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
bch2_bkey_buf_reassemble(&sk, c, k);
|
|
k = bkey_i_to_s_c(sk.k);
|
|
|
|
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
|
|
+ if (!level)
|
|
+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
|
|
+ else if (!data_opts.scrub)
|
|
+ ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
|
|
+ k.k->p, data_opts.target, 0);
|
|
+ else
|
|
+ ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
|
|
+
|
|
if (ret2) {
|
|
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
|
|
continue;
|
|
|
|
- if (ret2 == -ENOMEM) {
|
|
+ if (bch2_err_matches(ret2, ENOMEM)) {
|
|
/* memory allocation failure, wait for some IO to finish */
|
|
bch2_move_ctxt_wait_for_io(ctxt);
|
|
continue;
|
|
@@ -640,9 +749,10 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
|
|
if (ctxt->stats)
|
|
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
|
|
next_nondata:
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ if (!bch2_btree_iter_advance(trans, &iter))
|
|
+ break;
|
|
}
|
|
-
|
|
+out:
|
|
bch2_trans_iter_exit(trans, &reflink_iter);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
bch2_bkey_buf_exit(&sk, c);
|
|
@@ -672,7 +782,7 @@ int __bch2_move_data(struct moving_context *ctxt,
|
|
ret = bch2_move_data_btree(ctxt,
|
|
id == start.btree ? start.pos : POS_MIN,
|
|
id == end.btree ? end.pos : POS_MAX,
|
|
- pred, arg, id);
|
|
+ pred, arg, id, 0);
|
|
if (ret)
|
|
break;
|
|
}
|
|
@@ -689,21 +799,23 @@ int bch2_move_data(struct bch_fs *c,
|
|
bool wait_on_copygc,
|
|
move_pred_fn pred, void *arg)
|
|
{
|
|
-
|
|
struct moving_context ctxt;
|
|
- int ret;
|
|
|
|
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
|
|
- ret = __bch2_move_data(&ctxt, start, end, pred, arg);
|
|
+ int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
|
|
bch2_moving_ctxt_exit(&ctxt);
|
|
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_evacuate_bucket(struct moving_context *ctxt,
|
|
- struct move_bucket_in_flight *bucket_in_flight,
|
|
- struct bpos bucket, int gen,
|
|
- struct data_update_opts _data_opts)
|
|
+static int __bch2_move_data_phys(struct moving_context *ctxt,
|
|
+ struct move_bucket *bucket_in_flight,
|
|
+ unsigned dev,
|
|
+ u64 bucket_start,
|
|
+ u64 bucket_end,
|
|
+ unsigned data_types,
|
|
+ bool copygc,
|
|
+ move_pred_fn pred, void *arg)
|
|
{
|
|
struct btree_trans *trans = ctxt->trans;
|
|
struct bch_fs *c = trans->c;
|
|
@@ -712,16 +824,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
|
|
struct btree_iter iter = {}, bp_iter = {};
|
|
struct bkey_buf sk;
|
|
struct bkey_s_c k;
|
|
- struct data_update_opts data_opts;
|
|
- unsigned sectors_moved = 0;
|
|
struct bkey_buf last_flushed;
|
|
+ u64 check_mismatch_done = bucket_start;
|
|
int ret = 0;
|
|
|
|
- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
|
|
+ struct bch_dev *ca = bch2_dev_tryget(c, dev);
|
|
if (!ca)
|
|
return 0;
|
|
|
|
- trace_bucket_evacuate(c, &bucket);
|
|
+ bucket_end = min(bucket_end, ca->mi.nbuckets);
|
|
+
|
|
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
|
|
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
|
|
|
|
bch2_bkey_buf_init(&last_flushed);
|
|
bkey_init(&last_flushed.k->k);
|
|
@@ -732,15 +846,11 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
|
|
*/
|
|
bch2_trans_begin(trans);
|
|
|
|
- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
|
|
- bucket_pos_to_bp_start(ca, bucket), 0);
|
|
-
|
|
- bch_err_msg(c, ret, "looking up alloc key");
|
|
- if (ret)
|
|
- goto err;
|
|
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
|
|
|
|
ret = bch2_btree_write_buffer_tryflush(trans);
|
|
- bch_err_msg(c, ret, "flushing btree write buffer");
|
|
+ if (!bch2_err_matches(ret, EROFS))
|
|
+ bch_err_msg(c, ret, "flushing btree write buffer");
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -750,122 +860,182 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
- k = bch2_btree_iter_peek(&bp_iter);
|
|
+ k = bch2_btree_iter_peek(trans, &bp_iter);
|
|
ret = bkey_err(k);
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
continue;
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
|
|
+ if (!k.k || bkey_gt(k.k->p, bp_end))
|
|
break;
|
|
|
|
+ if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
|
|
+ while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
|
|
+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
|
|
+ copygc, &last_flushed);
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
if (k.k->type != KEY_TYPE_backpointer)
|
|
goto next;
|
|
|
|
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
|
|
|
|
- if (!bp.v->level) {
|
|
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
|
|
- ret = bkey_err(k);
|
|
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
- continue;
|
|
- if (ret)
|
|
- goto err;
|
|
- if (!k.k)
|
|
- goto next;
|
|
+ if (ctxt->stats)
|
|
+ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
|
|
|
|
- bch2_bkey_buf_reassemble(&sk, c, k);
|
|
- k = bkey_i_to_s_c(sk.k);
|
|
+ if (!(data_types & BIT(bp.v->data_type)))
|
|
+ goto next;
|
|
+
|
|
+ if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
|
|
+ goto next;
|
|
+
|
|
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
|
|
+ ret = bkey_err(k);
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
+ continue;
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ if (!k.k)
|
|
+ goto next;
|
|
|
|
+ if (!bp.v->level) {
|
|
ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
|
|
if (ret) {
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
continue;
|
|
}
|
|
+ }
|
|
|
|
- data_opts = _data_opts;
|
|
- data_opts.target = io_opts.background_target;
|
|
- data_opts.rewrite_ptrs = 0;
|
|
-
|
|
- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
|
|
- unsigned i = 0;
|
|
- const union bch_extent_entry *entry;
|
|
- struct extent_ptr_decoded p;
|
|
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
|
|
- if (p.ptr.dev == bucket.inode) {
|
|
- if (p.ptr.cached) {
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
- goto next;
|
|
- }
|
|
- data_opts.rewrite_ptrs |= 1U << i;
|
|
- break;
|
|
- }
|
|
- i++;
|
|
- }
|
|
-
|
|
- ret = bch2_move_extent(ctxt, bucket_in_flight,
|
|
- &iter, k, io_opts, data_opts);
|
|
+ struct data_update_opts data_opts = {};
|
|
+ if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) {
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
+ goto next;
|
|
+ }
|
|
|
|
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
- continue;
|
|
- if (ret == -ENOMEM) {
|
|
- /* memory allocation failure, wait for some IO to finish */
|
|
- bch2_move_ctxt_wait_for_io(ctxt);
|
|
- continue;
|
|
- }
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- if (ctxt->stats)
|
|
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
|
|
- sectors_moved += sectors;
|
|
- } else {
|
|
- struct btree *b;
|
|
+ if (data_opts.scrub &&
|
|
+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ ret = -BCH_ERR_device_offline;
|
|
+ break;
|
|
+ }
|
|
|
|
- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
|
|
- ret = PTR_ERR_OR_ZERO(b);
|
|
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
|
|
- goto next;
|
|
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
- continue;
|
|
- if (ret)
|
|
- goto err;
|
|
- if (!b)
|
|
- goto next;
|
|
+ bch2_bkey_buf_reassemble(&sk, c, k);
|
|
+ k = bkey_i_to_s_c(sk.k);
|
|
|
|
- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
|
|
+ /* move_extent will drop locks */
|
|
+ unsigned sectors = bp.v->bucket_len;
|
|
|
|
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
+ if (!bp.v->level)
|
|
+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
|
|
+ else if (!data_opts.scrub)
|
|
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
|
|
+ k.k->p, data_opts.target, 0);
|
|
+ else
|
|
+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
|
|
|
|
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
- continue;
|
|
- if (ret)
|
|
- goto err;
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
|
|
- if (ctxt->rate)
|
|
- bch2_ratelimit_increment(ctxt->rate, sectors);
|
|
- if (ctxt->stats) {
|
|
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
|
|
- atomic64_add(sectors, &ctxt->stats->sectors_moved);
|
|
- }
|
|
- sectors_moved += btree_sectors(c);
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
+ continue;
|
|
+ if (ret == -ENOMEM) {
|
|
+ /* memory allocation failure, wait for some IO to finish */
|
|
+ bch2_move_ctxt_wait_for_io(ctxt);
|
|
+ continue;
|
|
}
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (ctxt->stats)
|
|
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
|
|
next:
|
|
- bch2_btree_iter_advance(&bp_iter);
|
|
+ bch2_btree_iter_advance(trans, &bp_iter);
|
|
}
|
|
|
|
- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
|
|
+ while (check_mismatch_done < bucket_end)
|
|
+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
|
|
+ copygc, &last_flushed);
|
|
err:
|
|
bch2_trans_iter_exit(trans, &bp_iter);
|
|
- bch2_dev_put(ca);
|
|
bch2_bkey_buf_exit(&sk, c);
|
|
bch2_bkey_buf_exit(&last_flushed, c);
|
|
+ bch2_dev_put(ca);
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_move_data_phys(struct bch_fs *c,
|
|
+ unsigned dev,
|
|
+ u64 start,
|
|
+ u64 end,
|
|
+ unsigned data_types,
|
|
+ struct bch_ratelimit *rate,
|
|
+ struct bch_move_stats *stats,
|
|
+ struct write_point_specifier wp,
|
|
+ bool wait_on_copygc,
|
|
+ move_pred_fn pred, void *arg)
|
|
+{
|
|
+ struct moving_context ctxt;
|
|
+
|
|
+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
|
|
+
|
|
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
|
|
+ if (ctxt.stats) {
|
|
+ ctxt.stats->phys = true;
|
|
+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
|
|
+ }
|
|
+
|
|
+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
|
|
+ data_types, false, pred, arg);
|
|
+ bch2_moving_ctxt_exit(&ctxt);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct evacuate_bucket_arg {
|
|
+ struct bpos bucket;
|
|
+ int gen;
|
|
+ struct data_update_opts data_opts;
|
|
+};
|
|
+
|
|
+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
|
|
+ enum btree_id btree, struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_update_opts *data_opts)
|
|
+{
|
|
+ struct evacuate_bucket_arg *arg = _arg;
|
|
+
|
|
+ *data_opts = arg->data_opts;
|
|
+
|
|
+ unsigned i = 0;
|
|
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
|
|
+ if (ptr->dev == arg->bucket.inode &&
|
|
+ (arg->gen < 0 || arg->gen == ptr->gen) &&
|
|
+ !ptr->cached)
|
|
+ data_opts->rewrite_ptrs |= BIT(i);
|
|
+ i++;
|
|
+ }
|
|
+
|
|
+ return data_opts->rewrite_ptrs != 0;
|
|
+}
|
|
+
|
|
+int bch2_evacuate_bucket(struct moving_context *ctxt,
|
|
+ struct move_bucket *bucket_in_flight,
|
|
+ struct bpos bucket, int gen,
|
|
+ struct data_update_opts data_opts)
|
|
+{
|
|
+ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
|
|
+
|
|
+ return __bch2_move_data_phys(ctxt, bucket_in_flight,
|
|
+ bucket.inode,
|
|
+ bucket.offset,
|
|
+ bucket.offset + 1,
|
|
+ ~0,
|
|
+ true,
|
|
+ evacuate_bucket_pred, &arg);
|
|
+}
|
|
+
|
|
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
|
|
struct btree *, struct bch_io_opts *,
|
|
struct data_update_opts *);
|
|
@@ -906,7 +1076,7 @@ static int bch2_move_btree(struct bch_fs *c,
|
|
retry:
|
|
ret = 0;
|
|
while (bch2_trans_begin(trans),
|
|
- (b = bch2_btree_iter_peek_node(&iter)) &&
|
|
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
|
|
!(ret = PTR_ERR_OR_ZERO(b))) {
|
|
if (kthread && kthread_should_stop())
|
|
break;
|
|
@@ -920,13 +1090,13 @@ static int bch2_move_btree(struct bch_fs *c,
|
|
if (!pred(c, arg, b, &io_opts, &data_opts))
|
|
goto next;
|
|
|
|
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
|
|
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
continue;
|
|
if (ret)
|
|
break;
|
|
next:
|
|
- bch2_btree_iter_next_node(&iter);
|
|
+ bch2_btree_iter_next_node(trans, &iter);
|
|
}
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
goto retry;
|
|
@@ -945,7 +1115,7 @@ static int bch2_move_btree(struct bch_fs *c,
|
|
}
|
|
|
|
static bool rereplicate_pred(struct bch_fs *c, void *arg,
|
|
- struct bkey_s_c k,
|
|
+ enum btree_id btree, struct bkey_s_c k,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
@@ -977,7 +1147,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
|
|
}
|
|
|
|
static bool migrate_pred(struct bch_fs *c, void *arg,
|
|
- struct bkey_s_c k,
|
|
+ enum btree_id btree, struct bkey_s_c k,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
@@ -1004,15 +1174,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
- return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
|
|
-}
|
|
-
|
|
-static bool migrate_btree_pred(struct bch_fs *c, void *arg,
|
|
- struct btree *b,
|
|
- struct bch_io_opts *io_opts,
|
|
- struct data_update_opts *data_opts)
|
|
-{
|
|
- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
|
|
+ return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
|
|
}
|
|
|
|
/*
|
|
@@ -1068,7 +1230,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
|
|
}
|
|
|
|
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
|
|
- struct bkey_s_c k,
|
|
+ enum btree_id btree, struct bkey_s_c k,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
@@ -1101,7 +1263,32 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
|
|
struct bch_io_opts *io_opts,
|
|
struct data_update_opts *data_opts)
|
|
{
|
|
- return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
|
|
+ return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
|
|
+ io_opts, data_opts);
|
|
+}
|
|
+
|
|
+static bool scrub_pred(struct bch_fs *c, void *_arg,
|
|
+ enum btree_id btree, struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_update_opts *data_opts)
|
|
+{
|
|
+ struct bch_ioctl_data *arg = _arg;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ if (p.ptr.dev == arg->migrate.dev) {
|
|
+ if (!p.crc.csum_type)
|
|
+ return false;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ data_opts->scrub = true;
|
|
+ data_opts->read_dev = arg->migrate.dev;
|
|
+ return true;
|
|
}
|
|
|
|
int bch2_data_job(struct bch_fs *c,
|
|
@@ -1118,6 +1305,22 @@ int bch2_data_job(struct bch_fs *c,
|
|
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
|
|
|
|
switch (op.op) {
|
|
+ case BCH_DATA_OP_scrub:
|
|
+ /*
|
|
+ * prevent tests from spuriously failing, make sure we see all
|
|
+ * btree nodes that need to be repaired
|
|
+ */
|
|
+ bch2_btree_interior_updates_flush(c);
|
|
+
|
|
+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
|
|
+ op.scrub.data_types,
|
|
+ NULL,
|
|
+ stats,
|
|
+ writepoint_hashed((unsigned long) current),
|
|
+ false,
|
|
+ scrub_pred, &op) ?: ret;
|
|
+ break;
|
|
+
|
|
case BCH_DATA_OP_rereplicate:
|
|
stats->data_type = BCH_DATA_journal;
|
|
ret = bch2_journal_flush_device_pins(&c->journal, -1);
|
|
@@ -1137,14 +1340,14 @@ int bch2_data_job(struct bch_fs *c,
|
|
|
|
stats->data_type = BCH_DATA_journal;
|
|
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
|
|
- ret = bch2_move_btree(c, start, end,
|
|
- migrate_btree_pred, &op, stats) ?: ret;
|
|
- ret = bch2_move_data(c, start, end,
|
|
- NULL,
|
|
- stats,
|
|
- writepoint_hashed((unsigned long) current),
|
|
- true,
|
|
- migrate_pred, &op) ?: ret;
|
|
+ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
|
|
+ ~0,
|
|
+ NULL,
|
|
+ stats,
|
|
+ writepoint_hashed((unsigned long) current),
|
|
+ true,
|
|
+ migrate_pred, &op) ?: ret;
|
|
+ bch2_btree_interior_updates_flush(c);
|
|
ret = bch2_replicas_gc2(c) ?: ret;
|
|
break;
|
|
case BCH_DATA_OP_rewrite_old_nodes:
|
|
@@ -1176,17 +1379,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
|
|
prt_newline(out);
|
|
printbuf_indent_add(out, 2);
|
|
|
|
- prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
|
|
- prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
|
|
- prt_printf(out, "bytes seen: ");
|
|
+ prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
|
|
+ prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
|
|
+ prt_printf(out, "bytes seen:\t");
|
|
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
|
|
prt_newline(out);
|
|
|
|
- prt_printf(out, "bytes moved: ");
|
|
+ prt_printf(out, "bytes moved:\t");
|
|
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
|
|
prt_newline(out);
|
|
|
|
- prt_printf(out, "bytes raced: ");
|
|
+ prt_printf(out, "bytes raced:\t");
|
|
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
|
|
prt_newline(out);
|
|
|
|
@@ -1195,7 +1398,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
|
|
|
|
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
|
|
{
|
|
- struct moving_io *io;
|
|
+ if (!out->nr_tabstops)
|
|
+ printbuf_tabstop_push(out, 32);
|
|
|
|
bch2_move_stats_to_text(out, ctxt->stats);
|
|
printbuf_indent_add(out, 2);
|
|
@@ -1215,8 +1419,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
|
|
printbuf_indent_add(out, 2);
|
|
|
|
mutex_lock(&ctxt->lock);
|
|
+ struct moving_io *io;
|
|
list_for_each_entry(io, &ctxt->ios, io_list)
|
|
- bch2_write_op_to_text(out, &io->write.op);
|
|
+ bch2_data_update_inflight_to_text(out, &io->write);
|
|
mutex_unlock(&ctxt->lock);
|
|
|
|
printbuf_indent_sub(out, 4);
|
|
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
|
|
index 51e0505a8156..86b80499ac55 100644
|
|
--- a/fs/bcachefs/move.h
|
|
+++ b/fs/bcachefs/move.h
|
|
@@ -72,7 +72,7 @@ do { \
|
|
break; \
|
|
} while (1)
|
|
|
|
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
|
|
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
|
|
struct bch_io_opts *, struct data_update_opts *);
|
|
|
|
extern const char * const bch2_data_ops_strs[];
|
|
@@ -116,12 +116,18 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
|
|
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
|
|
|
|
int bch2_move_extent(struct moving_context *,
|
|
- struct move_bucket_in_flight *,
|
|
+ struct move_bucket *,
|
|
struct btree_iter *,
|
|
struct bkey_s_c,
|
|
struct bch_io_opts,
|
|
struct data_update_opts);
|
|
|
|
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
|
|
+ struct per_snapshot_io_opts *, struct bpos,
|
|
+ struct btree_iter *, struct bkey_s_c);
|
|
+
|
|
+int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
|
|
+ move_pred_fn, void *, enum btree_id, unsigned);
|
|
int __bch2_move_data(struct moving_context *,
|
|
struct bbpos,
|
|
struct bbpos,
|
|
@@ -135,8 +141,13 @@ int bch2_move_data(struct bch_fs *,
|
|
bool,
|
|
move_pred_fn, void *);
|
|
|
|
+int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
|
|
+ struct bch_ratelimit *, struct bch_move_stats *,
|
|
+ struct write_point_specifier, bool,
|
|
+ move_pred_fn, void *);
|
|
+
|
|
int bch2_evacuate_bucket(struct moving_context *,
|
|
- struct move_bucket_in_flight *,
|
|
+ struct move_bucket *,
|
|
struct bpos, int,
|
|
struct data_update_opts);
|
|
int bch2_data_job(struct bch_fs *,
|
|
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
|
|
index e22841ef31e4..c5c62cd600de 100644
|
|
--- a/fs/bcachefs/move_types.h
|
|
+++ b/fs/bcachefs/move_types.h
|
|
@@ -3,33 +3,43 @@
|
|
#define _BCACHEFS_MOVE_TYPES_H
|
|
|
|
#include "bbpos_types.h"
|
|
+#include "bcachefs_ioctl.h"
|
|
|
|
struct bch_move_stats {
|
|
- enum bch_data_type data_type;
|
|
- struct bbpos pos;
|
|
char name[32];
|
|
+ bool phys;
|
|
+ enum bch_ioctl_data_event_ret ret;
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+ enum bch_data_type data_type;
|
|
+ struct bbpos pos;
|
|
+ };
|
|
+ struct {
|
|
+ unsigned dev;
|
|
+ u64 offset;
|
|
+ };
|
|
+ };
|
|
|
|
atomic64_t keys_moved;
|
|
atomic64_t keys_raced;
|
|
atomic64_t sectors_seen;
|
|
atomic64_t sectors_moved;
|
|
atomic64_t sectors_raced;
|
|
+ atomic64_t sectors_error_corrected;
|
|
+ atomic64_t sectors_error_uncorrected;
|
|
};
|
|
|
|
struct move_bucket_key {
|
|
struct bpos bucket;
|
|
- u8 gen;
|
|
+ unsigned gen;
|
|
};
|
|
|
|
struct move_bucket {
|
|
+ struct move_bucket *next;
|
|
+ struct rhash_head hash;
|
|
struct move_bucket_key k;
|
|
unsigned sectors;
|
|
-};
|
|
-
|
|
-struct move_bucket_in_flight {
|
|
- struct move_bucket_in_flight *next;
|
|
- struct rhash_head hash;
|
|
- struct move_bucket bucket;
|
|
atomic_t count;
|
|
};
|
|
|
|
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
|
|
index 6718dc37c5a3..e7a2a13554d7 100644
|
|
--- a/fs/bcachefs/movinggc.c
|
|
+++ b/fs/bcachefs/movinggc.c
|
|
@@ -8,6 +8,7 @@
|
|
#include "bcachefs.h"
|
|
#include "alloc_background.h"
|
|
#include "alloc_foreground.h"
|
|
+#include "backpointers.h"
|
|
#include "btree_iter.h"
|
|
#include "btree_update.h"
|
|
#include "btree_write_buffer.h"
|
|
@@ -27,47 +28,32 @@
|
|
#include <linux/wait.h>
|
|
|
|
struct buckets_in_flight {
|
|
- struct rhashtable table;
|
|
- struct move_bucket_in_flight *first;
|
|
- struct move_bucket_in_flight *last;
|
|
- size_t nr;
|
|
- size_t sectors;
|
|
+ struct rhashtable table;
|
|
+ struct move_bucket *first;
|
|
+ struct move_bucket *last;
|
|
+ size_t nr;
|
|
+ size_t sectors;
|
|
+
|
|
+ DARRAY(struct move_bucket *) to_evacuate;
|
|
};
|
|
|
|
static const struct rhashtable_params bch_move_bucket_params = {
|
|
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
|
|
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
|
|
+ .head_offset = offsetof(struct move_bucket, hash),
|
|
+ .key_offset = offsetof(struct move_bucket, k),
|
|
.key_len = sizeof(struct move_bucket_key),
|
|
.automatic_shrinking = true,
|
|
};
|
|
|
|
-static struct move_bucket_in_flight *
|
|
-move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
|
|
+static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
|
|
{
|
|
- struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
|
|
- int ret;
|
|
-
|
|
- if (!new)
|
|
- return ERR_PTR(-ENOMEM);
|
|
-
|
|
- new->bucket = b;
|
|
-
|
|
- ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
|
|
- bch_move_bucket_params);
|
|
- if (ret) {
|
|
- kfree(new);
|
|
- return ERR_PTR(ret);
|
|
- }
|
|
-
|
|
if (!list->first)
|
|
- list->first = new;
|
|
+ list->first = b;
|
|
else
|
|
- list->last->next = new;
|
|
+ list->last->next = b;
|
|
|
|
- list->last = new;
|
|
+ list->last = b;
|
|
list->nr++;
|
|
- list->sectors += b.sectors;
|
|
- return new;
|
|
+ list->sectors += b->sectors;
|
|
}
|
|
|
|
static int bch2_bucket_is_movable(struct btree_trans *trans,
|
|
@@ -89,9 +75,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
|
|
if (!ca)
|
|
goto out;
|
|
|
|
+ if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
|
|
+ goto out;
|
|
+
|
|
if (ca->mi.state != BCH_MEMBER_STATE_rw ||
|
|
!bch2_dev_is_online(ca))
|
|
- goto out_put;
|
|
+ goto out;
|
|
|
|
struct bch_alloc_v4 _a;
|
|
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
|
|
@@ -100,19 +89,26 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
|
|
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
|
|
|
|
ret = lru_idx && lru_idx <= time;
|
|
-out_put:
|
|
- bch2_dev_put(ca);
|
|
out:
|
|
+ bch2_dev_put(ca);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
+static void move_bucket_free(struct buckets_in_flight *list,
|
|
+ struct move_bucket *b)
|
|
+{
|
|
+ int ret = rhashtable_remove_fast(&list->table, &b->hash,
|
|
+ bch_move_bucket_params);
|
|
+ BUG_ON(ret);
|
|
+ kfree(b);
|
|
+}
|
|
+
|
|
static void move_buckets_wait(struct moving_context *ctxt,
|
|
struct buckets_in_flight *list,
|
|
bool flush)
|
|
{
|
|
- struct move_bucket_in_flight *i;
|
|
- int ret;
|
|
+ struct move_bucket *i;
|
|
|
|
while ((i = list->first)) {
|
|
if (flush)
|
|
@@ -126,12 +122,9 @@ static void move_buckets_wait(struct moving_context *ctxt,
|
|
list->last = NULL;
|
|
|
|
list->nr--;
|
|
- list->sectors -= i->bucket.sectors;
|
|
+ list->sectors -= i->sectors;
|
|
|
|
- ret = rhashtable_remove_fast(&list->table, &i->hash,
|
|
- bch_move_bucket_params);
|
|
- BUG_ON(ret);
|
|
- kfree(i);
|
|
+ move_bucket_free(list, i);
|
|
}
|
|
|
|
bch2_trans_unlock_long(ctxt->trans);
|
|
@@ -143,11 +136,8 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
|
|
return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
|
|
}
|
|
|
|
-typedef DARRAY(struct move_bucket) move_buckets;
|
|
-
|
|
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
|
|
- struct buckets_in_flight *buckets_in_flight,
|
|
- move_buckets *buckets)
|
|
+ struct buckets_in_flight *buckets_in_flight)
|
|
{
|
|
struct btree_trans *trans = ctxt->trans;
|
|
struct bch_fs *c = trans->c;
|
|
@@ -164,11 +154,9 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
|
|
if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
|
|
return ret;
|
|
|
|
- bch2_trans_begin(trans);
|
|
-
|
|
ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
|
|
- lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
|
|
- lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
|
|
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
|
|
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
|
|
0, k, ({
|
|
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
|
|
int ret2 = 0;
|
|
@@ -184,20 +172,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
|
|
else if (bucket_in_flight(buckets_in_flight, b.k))
|
|
in_flight++;
|
|
else {
|
|
- ret2 = darray_push(buckets, b);
|
|
+ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
|
|
+ ret2 = b_i ? 0 : -ENOMEM;
|
|
if (ret2)
|
|
goto err;
|
|
+
|
|
+ *b_i = b;
|
|
+
|
|
+ ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
|
|
+ if (ret2) {
|
|
+ kfree(b_i);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash,
|
|
+ bch_move_bucket_params);
|
|
+ BUG_ON(ret2);
|
|
+
|
|
sectors += b.sectors;
|
|
}
|
|
|
|
- ret2 = buckets->nr >= nr_to_get;
|
|
+ ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
|
|
err:
|
|
ret2;
|
|
}));
|
|
|
|
pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
|
|
buckets_in_flight->nr, buckets_in_flight->sectors,
|
|
- saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
|
|
+ saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
|
|
|
|
return ret < 0 ? ret : 0;
|
|
}
|
|
@@ -212,40 +214,30 @@ static int bch2_copygc(struct moving_context *ctxt,
|
|
struct data_update_opts data_opts = {
|
|
.btree_insert_flags = BCH_WATERMARK_copygc,
|
|
};
|
|
- move_buckets buckets = { 0 };
|
|
- struct move_bucket_in_flight *f;
|
|
u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
|
|
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
|
|
int ret = 0;
|
|
|
|
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
|
|
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- darray_for_each(buckets, i) {
|
|
+ darray_for_each(buckets_in_flight->to_evacuate, i) {
|
|
if (kthread_should_stop() || freezing(current))
|
|
break;
|
|
|
|
- f = move_bucket_in_flight_add(buckets_in_flight, *i);
|
|
- ret = PTR_ERR_OR_ZERO(f);
|
|
- if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
|
|
- ret = 0;
|
|
- continue;
|
|
- }
|
|
- if (ret == -ENOMEM) { /* flush IO, continue later */
|
|
- ret = 0;
|
|
- break;
|
|
- }
|
|
+ struct move_bucket *b = *i;
|
|
+ *i = NULL;
|
|
|
|
- ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
|
|
- f->bucket.k.gen, data_opts);
|
|
+ move_bucket_in_flight_add(buckets_in_flight, b);
|
|
+
|
|
+ ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
|
|
if (ret)
|
|
goto err;
|
|
|
|
*did_work = true;
|
|
}
|
|
err:
|
|
-
|
|
/* no entries in LRU btree found, or got to end: */
|
|
if (bch2_err_matches(ret, ENOENT))
|
|
ret = 0;
|
|
@@ -255,12 +247,34 @@ static int bch2_copygc(struct moving_context *ctxt,
|
|
|
|
sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
|
|
sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
|
|
- trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
|
|
+ trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);
|
|
|
|
- darray_exit(&buckets);
|
|
+ darray_for_each(buckets_in_flight->to_evacuate, i)
|
|
+ if (*i)
|
|
+ move_bucket_free(buckets_in_flight, *i);
|
|
+ darray_exit(&buckets_in_flight->to_evacuate);
|
|
return ret;
|
|
}
|
|
|
|
+static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
|
|
+ struct bch_dev_usage usage;
|
|
+
|
|
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
|
|
+ usage.buckets[i] = usage_full.d[i].buckets;
|
|
+
|
|
+ s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
|
|
+ ca->mi.bucket_size) >> 1);
|
|
+ s64 fragmented = 0;
|
|
+
|
|
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
|
|
+ if (data_type_movable(i))
|
|
+ fragmented += usage_full.d[i].fragmented;
|
|
+
|
|
+ return max(0LL, fragmented_allowed - fragmented);
|
|
+}
|
|
+
|
|
/*
|
|
* Copygc runs when the amount of fragmented data is above some arbitrary
|
|
* threshold:
|
|
@@ -275,23 +289,14 @@ static int bch2_copygc(struct moving_context *ctxt,
|
|
* often and continually reduce the amount of fragmented space as the device
|
|
* fills up. So, we increase the threshold by half the current free space.
|
|
*/
|
|
-unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
|
|
+u64 bch2_copygc_wait_amount(struct bch_fs *c)
|
|
{
|
|
- s64 wait = S64_MAX, fragmented_allowed, fragmented;
|
|
-
|
|
- for_each_rw_member(c, ca) {
|
|
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
|
|
+ u64 wait = U64_MAX;
|
|
|
|
- fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
|
|
- ca->mi.bucket_size) >> 1);
|
|
- fragmented = 0;
|
|
-
|
|
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
|
|
- if (data_type_movable(i))
|
|
- fragmented += usage.d[i].fragmented;
|
|
-
|
|
- wait = min(wait, max(0LL, fragmented_allowed - fragmented));
|
|
- }
|
|
+ rcu_read_lock();
|
|
+ for_each_rw_member_rcu(c, ca)
|
|
+ wait = min(wait, bch2_copygc_dev_wait_amount(ca));
|
|
+ rcu_read_unlock();
|
|
|
|
return wait;
|
|
}
|
|
@@ -314,9 +319,28 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
|
|
c->copygc_wait_at) << 9);
|
|
prt_newline(out);
|
|
|
|
- prt_printf(out, "Currently calculated wait:\t");
|
|
- prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
|
|
- prt_newline(out);
|
|
+ bch2_printbuf_make_room(out, 4096);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ out->atomic++;
|
|
+
|
|
+ prt_printf(out, "Currently calculated wait:\n");
|
|
+ for_each_rw_member_rcu(c, ca) {
|
|
+ prt_printf(out, " %s:\t", ca->name);
|
|
+ prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
|
|
+ prt_newline(out);
|
|
+ }
|
|
+
|
|
+ struct task_struct *t = rcu_dereference(c->copygc_thread);
|
|
+ if (t)
|
|
+ get_task_struct(t);
|
|
+ --out->atomic;
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (t) {
|
|
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
|
|
+ put_task_struct(t);
|
|
+ }
|
|
}
|
|
|
|
static int bch2_copygc_thread(void *arg)
|
|
@@ -325,22 +349,23 @@ static int bch2_copygc_thread(void *arg)
|
|
struct moving_context ctxt;
|
|
struct bch_move_stats move_stats;
|
|
struct io_clock *clock = &c->io_clock[WRITE];
|
|
- struct buckets_in_flight *buckets;
|
|
+ struct buckets_in_flight buckets = {};
|
|
u64 last, wait;
|
|
- int ret = 0;
|
|
|
|
- buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
|
|
- if (!buckets)
|
|
- return -ENOMEM;
|
|
- ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
|
|
+ int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
|
|
bch_err_msg(c, ret, "allocating copygc buckets in flight");
|
|
- if (ret) {
|
|
- kfree(buckets);
|
|
+ if (ret)
|
|
return ret;
|
|
- }
|
|
|
|
set_freezable();
|
|
|
|
+ /*
|
|
+ * Data move operations can't run until after check_snapshots has
|
|
+ * completed, and bch2_snapshot_is_ancestor() is available.
|
|
+ */
|
|
+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
|
|
+ kthread_should_stop());
|
|
+
|
|
bch2_move_stats_init(&move_stats, "copygc");
|
|
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
|
|
writepoint_ptr(&c->copygc_write_point),
|
|
@@ -353,13 +378,13 @@ static int bch2_copygc_thread(void *arg)
|
|
cond_resched();
|
|
|
|
if (!c->opts.copygc_enabled) {
|
|
- move_buckets_wait(&ctxt, buckets, true);
|
|
+ move_buckets_wait(&ctxt, &buckets, true);
|
|
kthread_wait_freezable(c->opts.copygc_enabled ||
|
|
kthread_should_stop());
|
|
}
|
|
|
|
if (unlikely(freezing(current))) {
|
|
- move_buckets_wait(&ctxt, buckets, true);
|
|
+ move_buckets_wait(&ctxt, &buckets, true);
|
|
__refrigerator(false);
|
|
continue;
|
|
}
|
|
@@ -370,7 +395,7 @@ static int bch2_copygc_thread(void *arg)
|
|
if (wait > clock->max_slop) {
|
|
c->copygc_wait_at = last;
|
|
c->copygc_wait = last + wait;
|
|
- move_buckets_wait(&ctxt, buckets, true);
|
|
+ move_buckets_wait(&ctxt, &buckets, true);
|
|
trace_and_count(c, copygc_wait, c, wait, last + wait);
|
|
bch2_kthread_io_clock_wait(clock, last + wait,
|
|
MAX_SCHEDULE_TIMEOUT);
|
|
@@ -380,7 +405,7 @@ static int bch2_copygc_thread(void *arg)
|
|
c->copygc_wait = 0;
|
|
|
|
c->copygc_running = true;
|
|
- ret = bch2_copygc(&ctxt, buckets, &did_work);
|
|
+ ret = bch2_copygc(&ctxt, &buckets, &did_work);
|
|
c->copygc_running = false;
|
|
|
|
wake_up(&c->copygc_running_wq);
|
|
@@ -391,16 +416,14 @@ static int bch2_copygc_thread(void *arg)
|
|
if (min_member_capacity == U64_MAX)
|
|
min_member_capacity = 128 * 2048;
|
|
|
|
- move_buckets_wait(&ctxt, buckets, true);
|
|
+ move_buckets_wait(&ctxt, &buckets, true);
|
|
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
|
|
MAX_SCHEDULE_TIMEOUT);
|
|
}
|
|
}
|
|
|
|
- move_buckets_wait(&ctxt, buckets, true);
|
|
-
|
|
- rhashtable_destroy(&buckets->table);
|
|
- kfree(buckets);
|
|
+ move_buckets_wait(&ctxt, &buckets, true);
|
|
+ rhashtable_destroy(&buckets.table);
|
|
bch2_moving_ctxt_exit(&ctxt);
|
|
bch2_move_stats_exit(&move_stats, c);
|
|
|
|
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
|
|
index ea181fef5bc9..b9683d22bab0 100644
|
|
--- a/fs/bcachefs/movinggc.h
|
|
+++ b/fs/bcachefs/movinggc.h
|
|
@@ -2,9 +2,18 @@
|
|
#ifndef _BCACHEFS_MOVINGGC_H
|
|
#define _BCACHEFS_MOVINGGC_H
|
|
|
|
-unsigned long bch2_copygc_wait_amount(struct bch_fs *);
|
|
+u64 bch2_copygc_wait_amount(struct bch_fs *);
|
|
void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
|
|
|
|
+static inline void bch2_copygc_wakeup(struct bch_fs *c)
|
|
+{
|
|
+ rcu_read_lock();
|
|
+ struct task_struct *p = rcu_dereference(c->copygc_thread);
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
void bch2_copygc_stop(struct bch_fs *);
|
|
int bch2_copygc_start(struct bch_fs *);
|
|
void bch2_fs_copygc_init(struct bch_fs *);
|
|
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c
|
|
similarity index 52%
|
|
rename from fs/bcachefs/fs-common.c
|
|
rename to fs/bcachefs/namei.c
|
|
index 2c3d46ac70c6..a84b69d6caef 100644
|
|
--- a/fs/bcachefs/fs-common.c
|
|
+++ b/fs/bcachefs/namei.c
|
|
@@ -4,13 +4,21 @@
|
|
#include "acl.h"
|
|
#include "btree_update.h"
|
|
#include "dirent.h"
|
|
-#include "fs-common.h"
|
|
#include "inode.h"
|
|
+#include "namei.h"
|
|
#include "subvolume.h"
|
|
#include "xattr.h"
|
|
|
|
#include <linux/posix_acl.h>
|
|
|
|
+static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ return (subvol_inum) {
|
|
+ .subvol = inode->bi_parent_subvol ?: inum.subvol,
|
|
+ .inum = inode->bi_dir,
|
|
+ };
|
|
+}
|
|
+
|
|
static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
|
|
{
|
|
return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
|
|
@@ -28,8 +36,8 @@ int bch2_create_trans(struct btree_trans *trans,
|
|
unsigned flags)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter dir_iter = { NULL };
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter dir_iter = {};
|
|
+ struct btree_iter inode_iter = {};
|
|
subvol_inum new_inum = dir;
|
|
u64 now = bch2_current_time(c);
|
|
u64 cpu = raw_smp_processor_id();
|
|
@@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans,
|
|
|
|
if (!(flags & BCH_CREATE_SNAPSHOT)) {
|
|
/* Normal create path - allocate a new inode: */
|
|
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
|
|
+ bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
|
|
|
|
if (flags & BCH_CREATE_TMPFILE)
|
|
new_inode->bi_flags |= BCH_INODE_unlinked;
|
|
@@ -123,8 +131,8 @@ int bch2_create_trans(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
|
|
- ret = bch2_btree_iter_traverse(&dir_iter);
|
|
+ bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
|
|
+ ret = bch2_btree_iter_traverse(trans, &dir_iter);
|
|
if (ret)
|
|
goto err;
|
|
}
|
|
@@ -153,16 +161,13 @@ int bch2_create_trans(struct btree_trans *trans,
|
|
dir_u->bi_nlink++;
|
|
dir_u->bi_mtime = dir_u->bi_ctime = now;
|
|
|
|
- ret = bch2_inode_write(trans, &dir_iter, dir_u);
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- ret = bch2_dirent_create(trans, dir, &dir_hash,
|
|
- dir_type,
|
|
- name,
|
|
- dir_target,
|
|
- &dir_offset,
|
|
- STR_HASH_must_create|BTREE_ITER_with_updates);
|
|
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
|
|
+ dir_type,
|
|
+ name,
|
|
+ dir_target,
|
|
+ &dir_offset,
|
|
+ STR_HASH_must_create|BTREE_ITER_with_updates) ?:
|
|
+ bch2_inode_write(trans, &dir_iter, dir_u);
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -175,9 +180,9 @@ int bch2_create_trans(struct btree_trans *trans,
|
|
new_inode->bi_depth = dir_u->bi_depth + 1;
|
|
|
|
inode_iter.flags &= ~BTREE_ITER_all_snapshots;
|
|
- bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
|
|
|
|
- ret = bch2_btree_iter_traverse(&inode_iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
|
|
bch2_inode_write(trans, &inode_iter, new_inode);
|
|
err:
|
|
bch2_trans_iter_exit(trans, &inode_iter);
|
|
@@ -191,8 +196,8 @@ int bch2_link_trans(struct btree_trans *trans,
|
|
const struct qstr *name)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter dir_iter = { NULL };
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter dir_iter = {};
|
|
+ struct btree_iter inode_iter = {};
|
|
struct bch_hash_info dir_hash;
|
|
u64 now = bch2_current_time(c);
|
|
u64 dir_offset = 0;
|
|
@@ -225,7 +230,8 @@ int bch2_link_trans(struct btree_trans *trans,
|
|
|
|
ret = bch2_dirent_create(trans, dir, &dir_hash,
|
|
mode_to_type(inode_u->bi_mode),
|
|
- name, inum.inum, &dir_offset,
|
|
+ name, inum.inum,
|
|
+ &dir_offset,
|
|
STR_HASH_must_create);
|
|
if (ret)
|
|
goto err;
|
|
@@ -249,9 +255,9 @@ int bch2_unlink_trans(struct btree_trans *trans,
|
|
bool deleting_subvol)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter dir_iter = { NULL };
|
|
- struct btree_iter dirent_iter = { NULL };
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter dir_iter = {};
|
|
+ struct btree_iter dirent_iter = {};
|
|
+ struct btree_iter inode_iter = {};
|
|
struct bch_hash_info dir_hash;
|
|
subvol_inum inum;
|
|
u64 now = bch2_current_time(c);
|
|
@@ -297,7 +303,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
|
|
if (ret)
|
|
goto err;
|
|
|
|
- k = bch2_btree_iter_peek_slot(&dirent_iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -306,8 +312,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
|
|
* If we're deleting a subvolume, we need to really delete the
|
|
* dirent, not just emit a whiteout in the current snapshot:
|
|
*/
|
|
- bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
|
|
- ret = bch2_btree_iter_traverse(&dirent_iter);
|
|
+ bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
|
|
+ ret = bch2_btree_iter_traverse(trans, &dirent_iter);
|
|
if (ret)
|
|
goto err;
|
|
} else {
|
|
@@ -343,6 +349,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
|
|
bool ret = false;
|
|
|
|
for (id = 0; id < Inode_opt_nr; id++) {
|
|
+ if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold)
|
|
+ continue;
|
|
+
|
|
/* Skip attributes that were explicitly set on this inode */
|
|
if (dst_u->bi_fields_set & (1 << id))
|
|
continue;
|
|
@@ -386,10 +395,10 @@ int bch2_rename_trans(struct btree_trans *trans,
|
|
enum bch_rename_mode mode)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter src_dir_iter = { NULL };
|
|
- struct btree_iter dst_dir_iter = { NULL };
|
|
- struct btree_iter src_inode_iter = { NULL };
|
|
- struct btree_iter dst_inode_iter = { NULL };
|
|
+ struct btree_iter src_dir_iter = {};
|
|
+ struct btree_iter dst_dir_iter = {};
|
|
+ struct btree_iter src_inode_iter = {};
|
|
+ struct btree_iter dst_inode_iter = {};
|
|
struct bch_hash_info src_hash, dst_hash;
|
|
subvol_inum src_inum, dst_inum;
|
|
u64 src_offset, dst_offset;
|
|
@@ -403,8 +412,7 @@ int bch2_rename_trans(struct btree_trans *trans,
|
|
|
|
src_hash = bch2_hash_info_init(c, src_dir_u);
|
|
|
|
- if (dst_dir.inum != src_dir.inum ||
|
|
- dst_dir.subvol != src_dir.subvol) {
|
|
+ if (!subvol_inum_eq(dst_dir, src_dir)) {
|
|
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
|
|
BTREE_ITER_intent);
|
|
if (ret)
|
|
@@ -417,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans,
|
|
}
|
|
|
|
ret = bch2_dirent_rename(trans,
|
|
- src_dir, &src_hash,
|
|
- dst_dir, &dst_hash,
|
|
+ src_dir, &src_hash, &src_dir_u->bi_size,
|
|
+ dst_dir, &dst_hash, &dst_dir_u->bi_size,
|
|
src_name, &src_inum, &src_offset,
|
|
dst_name, &dst_inum, &dst_offset,
|
|
mode);
|
|
@@ -496,32 +504,41 @@ int bch2_rename_trans(struct btree_trans *trans,
|
|
}
|
|
}
|
|
|
|
- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
|
|
- S_ISDIR(src_inode_u->bi_mode)) {
|
|
- ret = -EXDEV;
|
|
- goto err;
|
|
- }
|
|
+ if (!subvol_inum_eq(dst_dir, src_dir)) {
|
|
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
|
|
+ S_ISDIR(src_inode_u->bi_mode)) {
|
|
+ ret = -EXDEV;
|
|
+ goto err;
|
|
+ }
|
|
|
|
- if (mode == BCH_RENAME_EXCHANGE &&
|
|
- bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
|
|
- S_ISDIR(dst_inode_u->bi_mode)) {
|
|
- ret = -EXDEV;
|
|
- goto err;
|
|
- }
|
|
+ if (mode == BCH_RENAME_EXCHANGE &&
|
|
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
|
|
+ S_ISDIR(dst_inode_u->bi_mode)) {
|
|
+ ret = -EXDEV;
|
|
+ goto err;
|
|
+ }
|
|
|
|
- if (is_subdir_for_nlink(src_inode_u)) {
|
|
- src_dir_u->bi_nlink--;
|
|
- dst_dir_u->bi_nlink++;
|
|
- }
|
|
+ ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?:
|
|
+ (mode == BCH_RENAME_EXCHANGE
|
|
+ ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u)
|
|
+ : 0);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- if (S_ISDIR(src_inode_u->bi_mode) &&
|
|
- !src_inode_u->bi_subvol)
|
|
- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
|
|
+ if (is_subdir_for_nlink(src_inode_u)) {
|
|
+ src_dir_u->bi_nlink--;
|
|
+ dst_dir_u->bi_nlink++;
|
|
+ }
|
|
|
|
- if (mode == BCH_RENAME_EXCHANGE &&
|
|
- S_ISDIR(dst_inode_u->bi_mode) &&
|
|
- !dst_inode_u->bi_subvol)
|
|
- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
|
|
+ if (S_ISDIR(src_inode_u->bi_mode) &&
|
|
+ !src_inode_u->bi_subvol)
|
|
+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
|
|
+
|
|
+ if (mode == BCH_RENAME_EXCHANGE &&
|
|
+ S_ISDIR(dst_inode_u->bi_mode) &&
|
|
+ !dst_inode_u->bi_subvol)
|
|
+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
|
|
+ }
|
|
|
|
if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
|
|
dst_dir_u->bi_nlink--;
|
|
@@ -560,6 +577,8 @@ int bch2_rename_trans(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
+/* inum_to_path */
|
|
+
|
|
static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
|
|
{
|
|
bch2_printbuf_make_room(out, n);
|
|
@@ -590,31 +609,39 @@ static inline void reverse_bytes(void *b, size_t n)
|
|
}
|
|
}
|
|
|
|
-/* XXX: we don't yet attempt to print paths when we don't know the subvol */
|
|
-int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
|
|
+static int __bch2_inum_to_path(struct btree_trans *trans,
|
|
+ u32 subvol, u64 inum, u32 snapshot,
|
|
+ struct printbuf *path)
|
|
{
|
|
unsigned orig_pos = path->pos;
|
|
int ret = 0;
|
|
|
|
- while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
|
|
- inum.inum == BCACHEFS_ROOT_INO)) {
|
|
+ while (true) {
|
|
+ if (!snapshot) {
|
|
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
|
|
+ if (ret)
|
|
+ goto disconnected;
|
|
+ }
|
|
+
|
|
struct bch_inode_unpacked inode;
|
|
- ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
|
|
+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
|
|
if (ret)
|
|
goto disconnected;
|
|
|
|
+ if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL &&
|
|
+ inode.bi_inum == BCACHEFS_ROOT_INO)
|
|
+ break;
|
|
+
|
|
if (!inode.bi_dir && !inode.bi_dir_offset) {
|
|
ret = -BCH_ERR_ENOENT_inode_no_backpointer;
|
|
goto disconnected;
|
|
}
|
|
|
|
- inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
|
|
- inum.inum = inode.bi_dir;
|
|
-
|
|
- u32 snapshot;
|
|
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
|
|
- if (ret)
|
|
- goto disconnected;
|
|
+ inum = inode.bi_dir;
|
|
+ if (inode.bi_parent_subvol) {
|
|
+ subvol = inode.bi_parent_subvol;
|
|
+ snapshot = 0;
|
|
+ }
|
|
|
|
struct btree_iter d_iter;
|
|
struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
|
|
@@ -650,3 +677,339 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb
|
|
prt_str_reversed(path, "(disconnected)");
|
|
goto out;
|
|
}
|
|
+
|
|
+int bch2_inum_to_path(struct btree_trans *trans,
|
|
+ subvol_inum inum,
|
|
+ struct printbuf *path)
|
|
+{
|
|
+ return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path);
|
|
+}
|
|
+
|
|
+int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot,
|
|
+ snapshot_id_list *snapshot_overwrites,
|
|
+ struct printbuf *path)
|
|
+{
|
|
+ return __bch2_inum_to_path(trans, 0, inum, snapshot, path);
|
|
+}
|
|
+
|
|
+/* fsck */
|
|
+
|
|
+static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
|
|
+ struct bkey_s_c_dirent d,
|
|
+ struct bch_inode_unpacked *target,
|
|
+ bool in_fsck)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ struct btree_iter bp_iter = {};
|
|
+ int ret = 0;
|
|
+
|
|
+ if (inode_points_to_dirent(target, d))
|
|
+ return 0;
|
|
+
|
|
+ if (!target->bi_dir &&
|
|
+ !target->bi_dir_offset) {
|
|
+ fsck_err_on(S_ISDIR(target->bi_mode),
|
|
+ trans, inode_dir_missing_backpointer,
|
|
+ "directory with missing backpointer\n%s",
|
|
+ (printbuf_reset(&buf),
|
|
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
|
|
+ prt_printf(&buf, "\n"),
|
|
+ bch2_inode_unpacked_to_text(&buf, target),
|
|
+ buf.buf));
|
|
+
|
|
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
|
|
+ trans, inode_unlinked_but_has_dirent,
|
|
+ "inode unlinked but has dirent\n%s",
|
|
+ (printbuf_reset(&buf),
|
|
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
|
|
+ prt_printf(&buf, "\n"),
|
|
+ bch2_inode_unpacked_to_text(&buf, target),
|
|
+ buf.buf));
|
|
+
|
|
+ target->bi_flags &= ~BCH_INODE_unlinked;
|
|
+ target->bi_dir = d.k->p.inode;
|
|
+ target->bi_dir_offset = d.k->p.offset;
|
|
+ return __bch2_fsck_write_inode(trans, target);
|
|
+ }
|
|
+
|
|
+ if (bch2_inode_should_have_single_bp(target) &&
|
|
+ !fsck_err(trans, inode_wrong_backpointer,
|
|
+ "dirent points to inode that does not point back:\n%s",
|
|
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
|
|
+ prt_newline(&buf),
|
|
+ bch2_inode_unpacked_to_text(&buf, target),
|
|
+ buf.buf)))
|
|
+ goto err;
|
|
+
|
|
+ struct bkey_s_c_dirent bp_dirent =
|
|
+ bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
|
|
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
|
|
+ 0, dirent);
|
|
+ ret = bkey_err(bp_dirent);
|
|
+ if (ret && !bch2_err_matches(ret, ENOENT))
|
|
+ goto err;
|
|
+
|
|
+ bool backpointer_exists = !ret;
|
|
+ ret = 0;
|
|
+
|
|
+ if (!backpointer_exists) {
|
|
+ if (fsck_err(trans, inode_wrong_backpointer,
|
|
+ "inode %llu:%u has wrong backpointer:\n"
|
|
+ "got %llu:%llu\n"
|
|
+ "should be %llu:%llu",
|
|
+ target->bi_inum, target->bi_snapshot,
|
|
+ target->bi_dir,
|
|
+ target->bi_dir_offset,
|
|
+ d.k->p.inode,
|
|
+ d.k->p.offset)) {
|
|
+ target->bi_dir = d.k->p.inode;
|
|
+ target->bi_dir_offset = d.k->p.offset;
|
|
+ ret = __bch2_fsck_write_inode(trans, target);
|
|
+ }
|
|
+ } else {
|
|
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
|
|
+ prt_newline(&buf);
|
|
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
|
|
+
|
|
+ if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
|
|
+ /*
|
|
+ * XXX: verify connectivity of the other dirent
|
|
+ * up to the root before removing this one
|
|
+ *
|
|
+ * Additionally, bch2_lookup would need to cope with the
|
|
+ * dirent it found being removed - or should we remove
|
|
+ * the other one, even though the inode points to it?
|
|
+ */
|
|
+ if (in_fsck) {
|
|
+ if (fsck_err(trans, inode_dir_multiple_links,
|
|
+ "%s %llu:%u with multiple links\n%s",
|
|
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
|
|
+ target->bi_inum, target->bi_snapshot, buf.buf))
|
|
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
|
|
+ } else {
|
|
+ bch2_fs_inconsistent(c,
|
|
+ "%s %llu:%u with multiple links\n%s",
|
|
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
|
|
+ target->bi_inum, target->bi_snapshot, buf.buf);
|
|
+ }
|
|
+
|
|
+ goto out;
|
|
+ } else {
|
|
+ /*
|
|
+ * hardlinked file with nlink 0:
|
|
+ * We're just adjusting nlink here so check_nlinks() will pick
|
|
+ * it up, it ignores inodes with nlink 0
|
|
+ */
|
|
+ if (fsck_err_on(!target->bi_nlink,
|
|
+ trans, inode_multiple_links_but_nlink_0,
|
|
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
|
|
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
|
|
+ target->bi_nlink++;
|
|
+ target->bi_flags &= ~BCH_INODE_unlinked;
|
|
+ ret = __bch2_fsck_write_inode(trans, target);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+out:
|
|
+err:
|
|
+fsck_err:
|
|
+ bch2_trans_iter_exit(trans, &bp_iter);
|
|
+ printbuf_exit(&buf);
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __bch2_check_dirent_target(struct btree_trans *trans,
|
|
+ struct btree_iter *dirent_iter,
|
|
+ struct bkey_s_c_dirent d,
|
|
+ struct bch_inode_unpacked *target,
|
|
+ bool in_fsck)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ int ret = 0;
|
|
+
|
|
+ ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
|
|
+ trans, dirent_d_type_wrong,
|
|
+ "incorrect d_type: got %s, should be %s:\n%s",
|
|
+ bch2_d_type_str(d.v->d_type),
|
|
+ bch2_d_type_str(inode_d_type(target)),
|
|
+ (printbuf_reset(&buf),
|
|
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
|
|
+ struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
|
|
+ ret = PTR_ERR_OR_ZERO(n);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bkey_reassemble(&n->k_i, d.s_c);
|
|
+ n->v.d_type = inode_d_type(target);
|
|
+ if (n->v.d_type == DT_SUBVOL) {
|
|
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
|
|
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
|
|
+ } else {
|
|
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+err:
|
|
+fsck_err:
|
|
+ printbuf_exit(&buf);
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * BCH_INODE_has_case_insensitive:
|
|
+ * We have to track whether directories have any descendent directory that is
|
|
+ * casefolded - for overlayfs:
|
|
+ */
|
|
+
|
|
+static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum)
|
|
+{
|
|
+ struct btree_iter iter = {};
|
|
+ int ret = 0;
|
|
+
|
|
+ while (true) {
|
|
+ struct bch_inode_unpacked inode;
|
|
+ ret = bch2_inode_peek(trans, &iter, &inode, inum,
|
|
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ if (inode.bi_flags & BCH_INODE_has_case_insensitive)
|
|
+ break;
|
|
+
|
|
+ inode.bi_flags |= BCH_INODE_has_case_insensitive;
|
|
+ ret = bch2_inode_write(trans, &iter, &inode);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM))
|
|
+ break;
|
|
+
|
|
+ inum = parent_inum(inum, &inode);
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ if (!bch2_inode_casefold(trans->c, inode))
|
|
+ return 0;
|
|
+
|
|
+ inode->bi_flags |= BCH_INODE_has_case_insensitive;
|
|
+
|
|
+ return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode));
|
|
+}
|
|
+
|
|
+int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
|
|
+ struct bch_inode_unpacked *inode,
|
|
+ snapshot_id_list *snapshot_overwrites,
|
|
+ bool *do_update)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bool repairing_parents = false;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (!S_ISDIR(inode->bi_mode)) {
|
|
+ /*
|
|
+ * Old versions set bi_casefold for non dirs, but that's
|
|
+ * unnecessary and wasteful
|
|
+ */
|
|
+ if (inode->bi_casefold) {
|
|
+ inode->bi_casefold = 0;
|
|
+ *do_update = true;
|
|
+ }
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive)
|
|
+ return 0;
|
|
+
|
|
+ if (bch2_inode_casefold(trans->c, inode) &&
|
|
+ !(inode->bi_flags & BCH_INODE_has_case_insensitive)) {
|
|
+ prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
|
|
+ inode->bi_inum, inode->bi_snapshot);
|
|
+
|
|
+ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
|
|
+ snapshot_overwrites, &buf);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
|
|
+ inode->bi_flags |= BCH_INODE_has_case_insensitive;
|
|
+ *do_update = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
|
|
+ goto out;
|
|
+
|
|
+ struct bch_inode_unpacked dir = *inode;
|
|
+ u32 snapshot = dir.bi_snapshot;
|
|
+
|
|
+ while (!(dir.bi_inum == BCACHEFS_ROOT_INO &&
|
|
+ dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
|
|
+ if (dir.bi_parent_subvol) {
|
|
+ ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ snapshot_overwrites = NULL;
|
|
+ }
|
|
+
|
|
+ ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
|
|
+ prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
|
|
+
|
|
+ ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
|
|
+ snapshot_overwrites, &buf);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
|
|
+ dir.bi_flags |= BCH_INODE_has_case_insensitive;
|
|
+ ret = __bch2_fsck_write_inode(trans, &dir);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We only need to check the first parent, unless we find an
|
|
+ * inconsistency
|
|
+ */
|
|
+ if (!repairing_parents)
|
|
+ break;
|
|
+ }
|
|
+out:
|
|
+err:
|
|
+fsck_err:
|
|
+ printbuf_exit(&buf);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (repairing_parents) {
|
|
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
|
|
+ -BCH_ERR_transaction_restart_nested;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h
|
|
similarity index 52%
|
|
rename from fs/bcachefs/fs-common.h
|
|
rename to fs/bcachefs/namei.h
|
|
index 2b59210bb5e8..ae6ebc2d0785 100644
|
|
--- a/fs/bcachefs/fs-common.h
|
|
+++ b/fs/bcachefs/namei.h
|
|
@@ -1,6 +1,6 @@
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
-#ifndef _BCACHEFS_FS_COMMON_H
|
|
-#define _BCACHEFS_FS_COMMON_H
|
|
+#ifndef _BCACHEFS_NAMEI_H
|
|
+#define _BCACHEFS_NAMEI_H
|
|
|
|
#include "dirent.h"
|
|
|
|
@@ -43,5 +43,37 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
|
|
struct bch_inode_unpacked *);
|
|
|
|
int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
|
|
+int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32,
|
|
+ snapshot_id_list *, struct printbuf *);
|
|
|
|
-#endif /* _BCACHEFS_FS_COMMON_H */
|
|
+int __bch2_check_dirent_target(struct btree_trans *,
|
|
+ struct btree_iter *,
|
|
+ struct bkey_s_c_dirent,
|
|
+ struct bch_inode_unpacked *, bool);
|
|
+
|
|
+static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
|
|
+ struct bkey_s_c_dirent d)
|
|
+{
|
|
+ return inode->bi_dir == d.k->p.inode &&
|
|
+ inode->bi_dir_offset == d.k->p.offset;
|
|
+}
|
|
+
|
|
+static inline int bch2_check_dirent_target(struct btree_trans *trans,
|
|
+ struct btree_iter *dirent_iter,
|
|
+ struct bkey_s_c_dirent d,
|
|
+ struct bch_inode_unpacked *target,
|
|
+ bool in_fsck)
|
|
+{
|
|
+ if (likely(inode_points_to_dirent(target, d) &&
|
|
+ d.v->d_type == inode_d_type(target)))
|
|
+ return 0;
|
|
+
|
|
+ return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
|
|
+}
|
|
+
|
|
+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum,
|
|
+ struct bch_inode_unpacked *);
|
|
+int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *,
|
|
+ snapshot_id_list *, bool *);
|
|
+
|
|
+#endif /* _BCACHEFS_NAMEI_H */
|
|
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
|
|
index 3c21981a4a1c..962218fa68ec 100644
|
|
--- a/fs/bcachefs/nocow_locking.c
|
|
+++ b/fs/bcachefs/nocow_locking.c
|
|
@@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c)
|
|
BUG_ON(atomic_read(&l->l[j]));
|
|
}
|
|
|
|
-int bch2_fs_nocow_locking_init(struct bch_fs *c)
|
|
+void bch2_fs_nocow_locking_init_early(struct bch_fs *c)
|
|
{
|
|
struct bucket_nocow_lock_table *t = &c->nocow_locks;
|
|
|
|
for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
|
|
spin_lock_init(&l->lock);
|
|
-
|
|
- return 0;
|
|
}
|
|
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
|
|
index f9d6a426a960..48b8a003c0d2 100644
|
|
--- a/fs/bcachefs/nocow_locking.h
|
|
+++ b/fs/bcachefs/nocow_locking.h
|
|
@@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
|
|
void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
|
|
|
|
void bch2_fs_nocow_locking_exit(struct bch_fs *);
|
|
-int bch2_fs_nocow_locking_init(struct bch_fs *);
|
|
+void bch2_fs_nocow_locking_init_early(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_NOCOW_LOCKING_H */
|
|
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
|
|
index 6772faf385a5..b1cf88905b81 100644
|
|
--- a/fs/bcachefs/opts.c
|
|
+++ b/fs/bcachefs/opts.c
|
|
@@ -7,7 +7,9 @@
|
|
#include "compress.h"
|
|
#include "disk_groups.h"
|
|
#include "error.h"
|
|
+#include "movinggc.h"
|
|
#include "opts.h"
|
|
+#include "rebalance.h"
|
|
#include "recovery_passes.h"
|
|
#include "super-io.h"
|
|
#include "util.h"
|
|
@@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = {
|
|
NULL
|
|
};
|
|
|
|
+const char * const bch2_degraded_actions[] = {
|
|
+ BCH_DEGRADED_ACTIONS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
const char * const bch2_fsck_fix_opts[] = {
|
|
BCH_FIX_ERRORS_OPTS()
|
|
NULL
|
|
@@ -44,7 +51,7 @@ const char * const __bch2_btree_ids[] = {
|
|
NULL
|
|
};
|
|
|
|
-static const char * const __bch2_csum_types[] = {
|
|
+const char * const __bch2_csum_types[] = {
|
|
BCH_CSUM_TYPES()
|
|
NULL
|
|
};
|
|
@@ -163,16 +170,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
|
|
[DT_SUBVOL] = "subvol",
|
|
};
|
|
|
|
-u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
|
|
-{
|
|
- BUG();
|
|
-}
|
|
-
|
|
-void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
|
|
-{
|
|
- BUG();
|
|
-}
|
|
-
|
|
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
|
|
{
|
|
#define x(_name, ...) \
|
|
@@ -223,6 +220,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
|
|
}
|
|
}
|
|
|
|
+/* dummy option, for options that aren't stored in the superblock */
|
|
+typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
|
|
+typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
|
|
+typedef u64 (*member_opt_get_fn)(const struct bch_member *);
|
|
+typedef void (*member_opt_set_fn)(struct bch_member *, u64);
|
|
+
|
|
+__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
|
|
+__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
|
|
+__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
|
|
+__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
|
|
+
|
|
+#define type_compatible_or_null(_p, _type) \
|
|
+ __builtin_choose_expr( \
|
|
+ __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
|
|
+
|
|
const struct bch_option bch2_opt_table[] = {
|
|
#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
|
|
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
|
|
@@ -239,15 +251,15 @@ const struct bch_option bch2_opt_table[] = {
|
|
|
|
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
|
|
[Opt_##_name] = { \
|
|
- .attr = { \
|
|
- .name = #_name, \
|
|
- .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
|
|
- }, \
|
|
- .flags = _flags, \
|
|
- .hint = _hint, \
|
|
- .help = _help, \
|
|
- .get_sb = _sb_opt, \
|
|
- .set_sb = SET_##_sb_opt, \
|
|
+ .attr.name = #_name, \
|
|
+ .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
|
|
+ .flags = _flags, \
|
|
+ .hint = _hint, \
|
|
+ .help = _help, \
|
|
+ .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
|
|
+ .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
|
|
+ .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
|
|
+ .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
|
|
_type \
|
|
},
|
|
|
|
@@ -268,20 +280,20 @@ int bch2_opt_lookup(const char *name)
|
|
return -1;
|
|
}
|
|
|
|
-struct synonym {
|
|
+struct opt_synonym {
|
|
const char *s1, *s2;
|
|
};
|
|
|
|
-static const struct synonym bch_opt_synonyms[] = {
|
|
+static const struct opt_synonym bch2_opt_synonyms[] = {
|
|
{ "quota", "usrquota" },
|
|
};
|
|
|
|
static int bch2_mount_opt_lookup(const char *name)
|
|
{
|
|
- const struct synonym *i;
|
|
+ const struct opt_synonym *i;
|
|
|
|
- for (i = bch_opt_synonyms;
|
|
- i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
|
|
+ for (i = bch2_opt_synonyms;
|
|
+ i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms);
|
|
i++)
|
|
if (!strcmp(name, i->s1))
|
|
name = i->s2;
|
|
@@ -289,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name)
|
|
return bch2_opt_lookup(name);
|
|
}
|
|
|
|
+struct opt_val_synonym {
|
|
+ const char *opt, *v1, *v2;
|
|
+};
|
|
+
|
|
+static const struct opt_val_synonym bch2_opt_val_synonyms[] = {
|
|
+ { "degraded", "true", "yes" },
|
|
+ { "degraded", "false", "no" },
|
|
+ { "degraded", "1", "yes" },
|
|
+ { "degraded", "0", "no" },
|
|
+};
|
|
+
|
|
+static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val)
|
|
+{
|
|
+ const struct opt_val_synonym *i;
|
|
+
|
|
+ for (i = bch2_opt_val_synonyms;
|
|
+ i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms);
|
|
+ i++)
|
|
+ if (!strcmp(opt, i->opt) && !strcmp(val, i->v1))
|
|
+ return i->v2;
|
|
+
|
|
+ return val;
|
|
+}
|
|
+
|
|
int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
|
|
{
|
|
if (v < opt->min) {
|
|
@@ -332,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c,
|
|
{
|
|
ssize_t ret;
|
|
|
|
+ if (err)
|
|
+ printbuf_indent_add_nextline(err, 2);
|
|
+
|
|
switch (opt->type) {
|
|
case BCH_OPT_BOOL:
|
|
- if (val) {
|
|
- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
|
|
- if (ret != -BCH_ERR_option_not_bool) {
|
|
- *res = ret;
|
|
- } else {
|
|
- if (err)
|
|
- prt_printf(err, "%s: must be bool", opt->attr.name);
|
|
- return ret;
|
|
- }
|
|
+ if (!val)
|
|
+ val = "1";
|
|
+
|
|
+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
|
|
+ if (ret != -BCH_ERR_option_not_bool) {
|
|
+ *res = ret;
|
|
} else {
|
|
- *res = 1;
|
|
+ if (err)
|
|
+ prt_printf(err, "%s: must be bool", opt->attr.name);
|
|
+ return ret;
|
|
}
|
|
-
|
|
break;
|
|
case BCH_OPT_UINT:
|
|
if (!val) {
|
|
@@ -355,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c,
|
|
return -EINVAL;
|
|
}
|
|
|
|
- ret = opt->flags & OPT_HUMAN_READABLE
|
|
- ? bch2_strtou64_h(val, res)
|
|
- : kstrtou64(val, 10, res);
|
|
+ if (*val != '-') {
|
|
+ ret = opt->flags & OPT_HUMAN_READABLE
|
|
+ ? bch2_strtou64_h(val, res)
|
|
+ : kstrtou64(val, 10, res);
|
|
+ } else {
|
|
+ prt_printf(err, "%s: must be a non-negative number", opt->attr.name);
|
|
+ return -BCH_ERR_option_negative;
|
|
+ }
|
|
+
|
|
if (ret < 0) {
|
|
if (err)
|
|
prt_printf(err, "%s: must be a number",
|
|
@@ -475,11 +518,16 @@ void bch2_opts_to_text(struct printbuf *out,
|
|
}
|
|
}
|
|
|
|
-int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
|
|
+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
|
|
{
|
|
int ret = 0;
|
|
|
|
switch (id) {
|
|
+ case Opt_state:
|
|
+ if (ca)
|
|
+ return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
|
|
+ break;
|
|
+
|
|
case Opt_compression:
|
|
case Opt_background_compression:
|
|
ret = bch2_check_set_has_compressed_data(c, v);
|
|
@@ -488,19 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
|
|
if (v)
|
|
bch2_check_set_feature(c, BCH_FEATURE_ec);
|
|
break;
|
|
+ default:
|
|
+ break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_opts_check_may_set(struct bch_fs *c)
|
|
+int bch2_opts_hooks_pre_set(struct bch_fs *c)
|
|
{
|
|
- unsigned i;
|
|
- int ret;
|
|
-
|
|
- for (i = 0; i < bch2_opts_nr; i++) {
|
|
- ret = bch2_opt_check_may_set(c, i,
|
|
- bch2_opt_get_by_id(&c->opts, i));
|
|
+ for (unsigned i = 0; i < bch2_opts_nr; i++) {
|
|
+ int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -508,6 +554,61 @@ int bch2_opts_check_may_set(struct bch_fs *c)
|
|
return 0;
|
|
}
|
|
|
|
+void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
|
|
+ struct bch_opts *new_opts, enum bch_opt_id id)
|
|
+{
|
|
+ switch (id) {
|
|
+ case Opt_foreground_target:
|
|
+ if (new_opts->foreground_target &&
|
|
+ !new_opts->background_target)
|
|
+ bch2_set_rebalance_needs_scan(c, inum);
|
|
+ break;
|
|
+ case Opt_compression:
|
|
+ if (new_opts->compression &&
|
|
+ !new_opts->background_compression)
|
|
+ bch2_set_rebalance_needs_scan(c, inum);
|
|
+ break;
|
|
+ case Opt_background_target:
|
|
+ if (new_opts->background_target)
|
|
+ bch2_set_rebalance_needs_scan(c, inum);
|
|
+ break;
|
|
+ case Opt_background_compression:
|
|
+ if (new_opts->background_compression)
|
|
+ bch2_set_rebalance_needs_scan(c, inum);
|
|
+ break;
|
|
+ case Opt_rebalance_enabled:
|
|
+ bch2_rebalance_wakeup(c);
|
|
+ break;
|
|
+ case Opt_copygc_enabled:
|
|
+ bch2_copygc_wakeup(c);
|
|
+ break;
|
|
+ case Opt_discard:
|
|
+ if (!ca) {
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ for_each_member_device(c, ca) {
|
|
+ struct bch_member *m =
|
|
+ bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx);
|
|
+ SET_BCH_MEMBER_DISCARD(m, c->opts.discard);
|
|
+ }
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+ break;
|
|
+ case Opt_version_upgrade:
|
|
+ /*
|
|
+ * XXX: in the future we'll likely want to do compatible
|
|
+ * upgrades at runtime as well, but right now there's nothing
|
|
+ * that does that:
|
|
+ */
|
|
+ if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
|
|
+ bch2_sb_upgrade_incompat(c);
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
|
|
struct printbuf *parse_later,
|
|
const char *name, const char *val)
|
|
@@ -530,6 +631,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
|
|
if (id < 0)
|
|
return 0;
|
|
|
|
+ /* must have a value for synonym lookup - but OPT_FN is weird */
|
|
+ if (!val && bch2_opt_table[id].type != BCH_OPT_FN)
|
|
+ val = "1";
|
|
+
|
|
+ val = bch2_opt_val_synonym_lookup(name, val);
|
|
+
|
|
if (!(bch2_opt_table[id].flags & OPT_MOUNT))
|
|
goto bad_opt;
|
|
|
|
@@ -543,14 +650,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
|
|
goto bad_opt;
|
|
|
|
ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
|
|
- if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
|
|
- prt_printf(parse_later, "%s=%s,", name, val);
|
|
- if (parse_later->allocation_failure) {
|
|
- ret = -ENOMEM;
|
|
- goto out;
|
|
+ if (ret == -BCH_ERR_option_needs_open_fs) {
|
|
+ ret = 0;
|
|
+
|
|
+ if (parse_later) {
|
|
+ prt_printf(parse_later, "%s=%s,", name, val);
|
|
+ if (parse_later->allocation_failure)
|
|
+ ret = -ENOMEM;
|
|
}
|
|
|
|
- ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
@@ -561,28 +669,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
|
|
bch2_opt_set_by_id(opts, id, v);
|
|
|
|
ret = 0;
|
|
- goto out;
|
|
-
|
|
+out:
|
|
+ printbuf_exit(&err);
|
|
+ return ret;
|
|
bad_opt:
|
|
- pr_err("Bad mount option %s", name);
|
|
ret = -BCH_ERR_option_name;
|
|
goto out;
|
|
-
|
|
bad_val:
|
|
- pr_err("Invalid mount option %s", err.buf);
|
|
ret = -BCH_ERR_option_value;
|
|
-
|
|
-out:
|
|
- printbuf_exit(&err);
|
|
- return ret;
|
|
+ goto out;
|
|
}
|
|
|
|
int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
|
|
- struct printbuf *parse_later, char *options)
|
|
+ struct printbuf *parse_later, char *options,
|
|
+ bool ignore_unknown)
|
|
{
|
|
char *copied_opts, *copied_opts_start;
|
|
char *opt, *name, *val;
|
|
- int ret;
|
|
+ int ret = 0;
|
|
|
|
if (!options)
|
|
return 0;
|
|
@@ -607,24 +711,37 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
|
|
val = opt;
|
|
|
|
ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
|
|
- if (ret < 0)
|
|
- goto out;
|
|
+ if (ret == -BCH_ERR_option_name && ignore_unknown)
|
|
+ ret = 0;
|
|
+ if (ret) {
|
|
+ pr_err("Error parsing option %s: %s", name, bch2_err_str(ret));
|
|
+ break;
|
|
+ }
|
|
}
|
|
|
|
- ret = 0;
|
|
- goto out;
|
|
-
|
|
-out:
|
|
kfree(copied_opts_start);
|
|
return ret;
|
|
}
|
|
|
|
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
|
|
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
|
|
{
|
|
const struct bch_option *opt = bch2_opt_table + id;
|
|
u64 v;
|
|
|
|
- v = opt->get_sb(sb);
|
|
+ if (dev_idx < 0) {
|
|
+ v = opt->get_sb(sb);
|
|
+ } else {
|
|
+ if (WARN(!bch2_member_exists(sb, dev_idx),
|
|
+ "tried to set device option %s on nonexistent device %i",
|
|
+ opt->attr.name, dev_idx))
|
|
+ return 0;
|
|
+
|
|
+ struct bch_member m = bch2_sb_member_get(sb, dev_idx);
|
|
+ v = opt->get_member(&m);
|
|
+ }
|
|
+
|
|
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
|
|
+ --v;
|
|
|
|
if (opt->flags & OPT_SB_FIELD_ILOG2)
|
|
v = 1ULL << v;
|
|
@@ -641,34 +758,20 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
|
|
*/
|
|
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
|
|
{
|
|
- unsigned id;
|
|
-
|
|
- for (id = 0; id < bch2_opts_nr; id++) {
|
|
+ for (unsigned id = 0; id < bch2_opts_nr; id++) {
|
|
const struct bch_option *opt = bch2_opt_table + id;
|
|
|
|
- if (opt->get_sb == BCH2_NO_SB_OPT)
|
|
- continue;
|
|
-
|
|
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
|
|
+ if (opt->get_sb)
|
|
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
-struct bch_dev_sb_opt_set {
|
|
- void (*set_sb)(struct bch_member *, u64);
|
|
-};
|
|
-
|
|
-static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
|
|
-#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
|
|
- BCH_DEV_OPT_SETTERS()
|
|
-#undef x
|
|
-};
|
|
-
|
|
-void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
|
|
+bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
|
|
const struct bch_option *opt, u64 v)
|
|
{
|
|
- enum bch_opt_id id = opt - bch2_opt_table;
|
|
+ bool changed = false;
|
|
|
|
if (opt->flags & OPT_SB_FIELD_SECTORS)
|
|
v >>= 9;
|
|
@@ -679,34 +782,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
|
|
if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
|
|
v++;
|
|
|
|
- if (opt->flags & OPT_FS) {
|
|
- if (opt->set_sb != SET_BCH2_NO_SB_OPT)
|
|
- opt->set_sb(sb, v);
|
|
+ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) {
|
|
+ changed = v != opt->get_sb(sb);
|
|
+
|
|
+ opt->set_sb(sb, v);
|
|
}
|
|
|
|
- if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
|
|
+ if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
|
|
if (WARN(!bch2_member_exists(sb, dev_idx),
|
|
"tried to set device option %s on nonexistent device %i",
|
|
opt->attr.name, dev_idx))
|
|
- return;
|
|
+ return false;
|
|
|
|
struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
|
|
-
|
|
- const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
|
|
- if (set->set_sb)
|
|
- set->set_sb(m, v);
|
|
- else
|
|
- pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
|
|
+ changed = v != opt->get_member(m);
|
|
+ opt->set_member(m, v);
|
|
}
|
|
+
|
|
+ return changed;
|
|
}
|
|
|
|
-void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
|
|
+bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
|
|
const struct bch_option *opt, u64 v)
|
|
{
|
|
mutex_lock(&c->sb_lock);
|
|
- __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
|
|
- bch2_write_super(c);
|
|
+ bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
|
|
+ if (changed)
|
|
+ bch2_write_super(c);
|
|
mutex_unlock(&c->sb_lock);
|
|
+ return changed;
|
|
}
|
|
|
|
/* io opts: */
|
|
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
|
|
index 9d397fc2a1f0..2a02606254b3 100644
|
|
--- a/fs/bcachefs/opts.h
|
|
+++ b/fs/bcachefs/opts.h
|
|
@@ -11,11 +11,13 @@
|
|
struct bch_fs;
|
|
|
|
extern const char * const bch2_error_actions[];
|
|
+extern const char * const bch2_degraded_actions[];
|
|
extern const char * const bch2_fsck_fix_opts[];
|
|
extern const char * const bch2_version_upgrade_opts[];
|
|
extern const char * const bch2_sb_features[];
|
|
extern const char * const bch2_sb_compat[];
|
|
extern const char * const __bch2_btree_ids[];
|
|
+extern const char * const __bch2_csum_types[];
|
|
extern const char * const __bch2_csum_opts[];
|
|
extern const char * const __bch2_compression_types[];
|
|
extern const char * const bch2_compression_opts[];
|
|
@@ -50,10 +52,6 @@ static inline const char *bch2_d_type_str(unsigned d_type)
|
|
* apply the options from that struct that are defined.
|
|
*/
|
|
|
|
-/* dummy option, for options that aren't stored in the superblock */
|
|
-u64 BCH2_NO_SB_OPT(const struct bch_sb *);
|
|
-void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
|
|
-
|
|
/* When can be set: */
|
|
enum opt_flags {
|
|
OPT_FS = BIT(0), /* Filesystem option */
|
|
@@ -132,19 +130,24 @@ enum fsck_err_opts {
|
|
OPT_FS|OPT_FORMAT| \
|
|
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
|
|
OPT_UINT(512, 1U << 16), \
|
|
- BCH_SB_BLOCK_SIZE, 8, \
|
|
+ BCH_SB_BLOCK_SIZE, 4 << 10, \
|
|
"size", NULL) \
|
|
x(btree_node_size, u32, \
|
|
OPT_FS|OPT_FORMAT| \
|
|
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
|
|
OPT_UINT(512, 1U << 20), \
|
|
- BCH_SB_BTREE_NODE_SIZE, 512, \
|
|
+ BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
|
|
"size", "Btree node size, default 256k") \
|
|
x(errors, u8, \
|
|
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_STR(bch2_error_actions), \
|
|
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
|
|
NULL, "Action to take on filesystem error") \
|
|
+ x(write_error_timeout, u16, \
|
|
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_UINT(1, 300), \
|
|
+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
|
|
+ NULL, "Number of consecutive write errors allowed before kicking out a device")\
|
|
x(metadata_replicas, u8, \
|
|
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
|
@@ -181,6 +184,11 @@ enum fsck_err_opts {
|
|
OPT_STR(__bch2_csum_opts), \
|
|
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
|
|
NULL, NULL) \
|
|
+ x(checksum_err_retry_nr, u8, \
|
|
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_UINT(0, 32), \
|
|
+ BCH_SB_CSUM_ERR_RETRY_NR, 3, \
|
|
+ NULL, NULL) \
|
|
x(compression, u8, \
|
|
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_FN(bch2_opt_compression), \
|
|
@@ -197,7 +205,7 @@ enum fsck_err_opts {
|
|
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
|
|
NULL, "Hash function for directory entries and xattrs")\
|
|
x(metadata_target, u16, \
|
|
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_FN(bch2_opt_target), \
|
|
BCH_SB_METADATA_TARGET, 0, \
|
|
"(target)", "Device or label for metadata writes") \
|
|
@@ -221,6 +229,11 @@ enum fsck_err_opts {
|
|
OPT_BOOL(), \
|
|
BCH_SB_ERASURE_CODE, false, \
|
|
NULL, "Enable erasure coding (DO NOT USE YET)") \
|
|
+ x(casefold, u8, \
|
|
+ OPT_FS|OPT_INODE|OPT_FORMAT, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_CASEFOLD, false, \
|
|
+ NULL, "Dirent lookups are casefolded") \
|
|
x(inodes_32bit, u8, \
|
|
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_BOOL(), \
|
|
@@ -295,24 +308,14 @@ enum fsck_err_opts {
|
|
NULL, "Enable project quotas") \
|
|
x(degraded, u8, \
|
|
OPT_FS|OPT_MOUNT, \
|
|
- OPT_BOOL(), \
|
|
- BCH2_NO_SB_OPT, false, \
|
|
+ OPT_STR(bch2_degraded_actions), \
|
|
+ BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \
|
|
NULL, "Allow mounting in degraded mode") \
|
|
- x(very_degraded, u8, \
|
|
- OPT_FS|OPT_MOUNT, \
|
|
- OPT_BOOL(), \
|
|
- BCH2_NO_SB_OPT, false, \
|
|
- NULL, "Allow mounting in when data will be missing") \
|
|
x(no_splitbrain_check, u8, \
|
|
OPT_FS|OPT_MOUNT, \
|
|
OPT_BOOL(), \
|
|
BCH2_NO_SB_OPT, false, \
|
|
NULL, "Don't kick drives out when splitbrain detected")\
|
|
- x(discard, u8, \
|
|
- OPT_FS|OPT_MOUNT|OPT_DEVICE, \
|
|
- OPT_BOOL(), \
|
|
- BCH2_NO_SB_OPT, true, \
|
|
- NULL, "Enable discard/TRIM support") \
|
|
x(verbose, u8, \
|
|
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_BOOL(), \
|
|
@@ -447,7 +450,7 @@ enum fsck_err_opts {
|
|
BCH2_NO_SB_OPT, false, \
|
|
NULL, "Reconstruct alloc btree") \
|
|
x(version_upgrade, u8, \
|
|
- OPT_FS|OPT_MOUNT, \
|
|
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_STR(bch2_version_upgrade_opts), \
|
|
BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
|
|
NULL, "Set superblock to latest version,\n" \
|
|
@@ -487,45 +490,56 @@ enum fsck_err_opts {
|
|
BCH2_NO_SB_OPT, true, \
|
|
NULL, "Enable rebalance: disable for debugging, or to\n"\
|
|
"quiet the system when doing performance testing\n")\
|
|
+ x(rebalance_on_ac_only, u8, \
|
|
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_REBALANCE_AC_ONLY, false, \
|
|
+ NULL, "Enable rebalance while on mains power only\n") \
|
|
+ x(auto_snapshot_deletion, u8, \
|
|
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH2_NO_SB_OPT, true, \
|
|
+ NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\
|
|
+ "quiet the system when doing performance testing\n")\
|
|
x(no_data_io, u8, \
|
|
OPT_MOUNT, \
|
|
OPT_BOOL(), \
|
|
BCH2_NO_SB_OPT, false, \
|
|
NULL, "Skip submit_bio() for data reads and writes, " \
|
|
"for performance testing purposes") \
|
|
- x(fs_size, u64, \
|
|
- OPT_DEVICE, \
|
|
+ x(state, u64, \
|
|
+ OPT_DEVICE|OPT_RUNTIME, \
|
|
+ OPT_STR(bch2_member_states), \
|
|
+ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
|
|
+ "state", "rw,ro,failed,spare") \
|
|
+ x(bucket_size, u32, \
|
|
+ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
|
|
OPT_UINT(0, S64_MAX), \
|
|
- BCH2_NO_SB_OPT, 0, \
|
|
- "size", "Size of filesystem on device") \
|
|
- x(bucket, u32, \
|
|
- OPT_DEVICE, \
|
|
- OPT_UINT(0, S64_MAX), \
|
|
- BCH2_NO_SB_OPT, 0, \
|
|
+ BCH_MEMBER_BUCKET_SIZE, 0, \
|
|
"size", "Specifies the bucket size; must be greater than the btree node size")\
|
|
x(durability, u8, \
|
|
- OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
|
|
+ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
|
|
OPT_UINT(0, BCH_REPLICAS_MAX), \
|
|
- BCH2_NO_SB_OPT, 1, \
|
|
+ BCH_MEMBER_DURABILITY, 1, \
|
|
"n", "Data written to this device will be considered\n"\
|
|
"to have already been replicated n times") \
|
|
x(data_allowed, u8, \
|
|
OPT_DEVICE, \
|
|
OPT_BITFIELD(__bch2_data_types), \
|
|
- BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
|
|
+ BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
|
|
"types", "Allowed data types for this device: journal, btree, and/or user")\
|
|
+ x(discard, u8, \
|
|
+ OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_MEMBER_DISCARD, true, \
|
|
+ NULL, "Enable discard/TRIM support") \
|
|
x(btree_node_prefetch, u8, \
|
|
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
|
|
OPT_BOOL(), \
|
|
BCH2_NO_SB_OPT, true, \
|
|
- NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
|
|
+ NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
|
|
" prefetched sequentially")
|
|
|
|
-#define BCH_DEV_OPT_SETTERS() \
|
|
- x(discard, BCH_MEMBER_DISCARD) \
|
|
- x(durability, BCH_MEMBER_DURABILITY) \
|
|
- x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
|
|
-
|
|
struct bch_opts {
|
|
#define x(_name, _bits, ...) unsigned _name##_defined:1;
|
|
BCH_OPTS()
|
|
@@ -582,8 +596,6 @@ struct printbuf;
|
|
|
|
struct bch_option {
|
|
struct attribute attr;
|
|
- u64 (*get_sb)(const struct bch_sb *);
|
|
- void (*set_sb)(struct bch_sb *, u64);
|
|
enum opt_type type;
|
|
enum opt_flags flags;
|
|
u64 min, max;
|
|
@@ -595,6 +607,12 @@ struct bch_option {
|
|
const char *hint;
|
|
const char *help;
|
|
|
|
+ u64 (*get_sb)(const struct bch_sb *);
|
|
+ void (*set_sb)(struct bch_sb *, u64);
|
|
+
|
|
+ u64 (*get_member)(const struct bch_member *);
|
|
+ void (*set_member)(struct bch_member *, u64);
|
|
+
|
|
};
|
|
|
|
extern const struct bch_option bch2_opt_table[];
|
|
@@ -603,12 +621,12 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
|
|
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
|
|
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
|
|
|
|
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
|
|
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
|
|
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
|
|
-void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
|
|
+bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
|
|
|
|
struct bch_dev;
|
|
-void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
|
|
+bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
|
|
|
|
int bch2_opt_lookup(const char *);
|
|
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
|
|
@@ -625,12 +643,15 @@ void bch2_opts_to_text(struct printbuf *,
|
|
struct bch_fs *, struct bch_sb *,
|
|
unsigned, unsigned, unsigned);
|
|
|
|
-int bch2_opt_check_may_set(struct bch_fs *, int, u64);
|
|
-int bch2_opts_check_may_set(struct bch_fs *);
|
|
+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
|
|
+int bch2_opts_hooks_pre_set(struct bch_fs *);
|
|
+void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
|
|
+ struct bch_opts *, enum bch_opt_id);
|
|
+
|
|
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
|
|
struct printbuf *, const char *, const char *);
|
|
int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
|
|
- char *);
|
|
+ char *, bool);
|
|
|
|
/* inode opts: */
|
|
|
|
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
|
|
index 4cf5a2af1e6f..3302bbc78a09 100644
|
|
--- a/fs/bcachefs/printbuf.c
|
|
+++ b/fs/bcachefs/printbuf.c
|
|
@@ -276,6 +276,25 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
|
|
buf->has_indent_or_tabstops = true;
|
|
}
|
|
|
|
+/**
|
|
+ * bch2_printbuf_indent_add_nextline() - add to the current indent level for
|
|
+ * subsequent lines
|
|
+ *
|
|
+ * @buf: printbuf to control
|
|
+ * @spaces: number of spaces to add to the current indent level
|
|
+ *
|
|
+ * Subsequent lines - not the current line - will be indented by @spaces more
|
|
+ * spaces.
|
|
+ */
|
|
+void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces)
|
|
+{
|
|
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
|
|
+ spaces = 0;
|
|
+
|
|
+ buf->indent += spaces;
|
|
+ buf->has_indent_or_tabstops = true;
|
|
+}
|
|
+
|
|
/**
|
|
* bch2_printbuf_indent_sub() - subtract from the current indent level
|
|
*
|
|
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
|
|
index d0dd398baa2b..1ca476adbf6f 100644
|
|
--- a/fs/bcachefs/printbuf.h
|
|
+++ b/fs/bcachefs/printbuf.h
|
|
@@ -112,6 +112,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *);
|
|
int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
|
|
|
|
void bch2_printbuf_indent_add(struct printbuf *, unsigned);
|
|
+void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned);
|
|
void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
|
|
|
|
void bch2_prt_newline(struct printbuf *);
|
|
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
|
|
new file mode 100644
|
|
index 000000000000..d09898566abe
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/progress.c
|
|
@@ -0,0 +1,61 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "bbpos.h"
|
|
+#include "disk_accounting.h"
|
|
+#include "progress.h"
|
|
+
|
|
+void bch2_progress_init(struct progress_indicator_state *s,
|
|
+ struct bch_fs *c,
|
|
+ u64 btree_id_mask)
|
|
+{
|
|
+ memset(s, 0, sizeof(*s));
|
|
+
|
|
+ s->next_print = jiffies + HZ * 10;
|
|
+
|
|
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
|
|
+ if (!(btree_id_mask & BIT_ULL(i)))
|
|
+ continue;
|
|
+
|
|
+ struct disk_accounting_pos acc;
|
|
+ disk_accounting_key_init(acc, btree, .id = i);
|
|
+
|
|
+ u64 v;
|
|
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
|
|
+ s->nodes_total += div64_ul(v, btree_sectors(c));
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline bool progress_update_p(struct progress_indicator_state *s)
|
|
+{
|
|
+ bool ret = time_after_eq(jiffies, s->next_print);
|
|
+
|
|
+ if (ret)
|
|
+ s->next_print = jiffies + HZ * 10;
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_progress_update_iter(struct btree_trans *trans,
|
|
+ struct progress_indicator_state *s,
|
|
+ struct btree_iter *iter,
|
|
+ const char *msg)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
|
|
+
|
|
+ s->nodes_seen += b != s->last_node;
|
|
+ s->last_node = b;
|
|
+
|
|
+ if (progress_update_p(s)) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ unsigned percent = s->nodes_total
|
|
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
|
|
+ : 0;
|
|
+
|
|
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
|
|
+ msg, percent, s->nodes_seen, s->nodes_total);
|
|
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
|
|
+
|
|
+ bch_info(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+}
|
|
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
|
|
new file mode 100644
|
|
index 000000000000..23fb1811f943
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/progress.h
|
|
@@ -0,0 +1,29 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_PROGRESS_H
|
|
+#define _BCACHEFS_PROGRESS_H
|
|
+
|
|
+/*
|
|
+ * Lame progress indicators
|
|
+ *
|
|
+ * We don't like to use these because they print to the dmesg console, which is
|
|
+ * spammy - we much prefer to be wired up to a userspace program (e.g. via
|
|
+ * thread_with_file) and have it print the progress indicator.
|
|
+ *
|
|
+ * But some code is old and doesn't support that, or runs in a context where
|
|
+ * that's not yet practical (mount).
|
|
+ */
|
|
+
|
|
+struct progress_indicator_state {
|
|
+ unsigned long next_print;
|
|
+ u64 nodes_seen;
|
|
+ u64 nodes_total;
|
|
+ struct btree *last_node;
|
|
+};
|
|
+
|
|
+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
|
|
+void bch2_progress_update_iter(struct btree_trans *,
|
|
+ struct progress_indicator_state *,
|
|
+ struct btree_iter *,
|
|
+ const char *);
|
|
+
|
|
+#endif /* _BCACHEFS_PROGRESS_H */
|
|
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
|
|
index 8b857fc33244..3d4755d73af7 100644
|
|
--- a/fs/bcachefs/quota.c
|
|
+++ b/fs/bcachefs/quota.c
|
|
@@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
|
|
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
|
|
KEY_TYPE_QUOTA_NOCHECK);
|
|
advance:
|
|
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
|
|
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
|
|
index bef2aa1b8bcd..2cf3d55d0bbc 100644
|
|
--- a/fs/bcachefs/rcu_pending.c
|
|
+++ b/fs/bcachefs/rcu_pending.c
|
|
@@ -1,6 +1,7 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
|
|
|
|
+#include <linux/darray.h>
|
|
#include <linux/generic-radix-tree.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/percpu.h>
|
|
@@ -9,8 +10,6 @@
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include "rcu_pending.h"
|
|
-#include "darray.h"
|
|
-#include "util.h"
|
|
|
|
#define static_array_for_each(_a, _i) \
|
|
for (typeof(&(_a)[0]) _i = _a; \
|
|
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
|
|
index d0a1f5cd5c2b..de1ec9e0caa0 100644
|
|
--- a/fs/bcachefs/rebalance.c
|
|
+++ b/fs/bcachefs/rebalance.c
|
|
@@ -26,9 +26,8 @@
|
|
|
|
/* bch_extent_rebalance: */
|
|
|
|
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
|
|
+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
|
|
{
|
|
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
const union bch_extent_entry *entry;
|
|
|
|
bkey_extent_entry_for_each(ptrs, entry)
|
|
@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
|
|
return NULL;
|
|
}
|
|
|
|
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
|
|
+{
|
|
+ return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
|
|
+}
|
|
+
|
|
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
|
|
struct bch_io_opts *opts,
|
|
struct bkey_s_c k,
|
|
@@ -76,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
|
|
unsigned ptr_bit = 1;
|
|
unsigned rewrite_ptrs = 0;
|
|
|
|
+ rcu_read_lock();
|
|
bkey_for_each_ptr(ptrs, ptr) {
|
|
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
|
|
rewrite_ptrs |= ptr_bit;
|
|
ptr_bit <<= 1;
|
|
}
|
|
+ rcu_read_unlock();
|
|
|
|
return rewrite_ptrs;
|
|
}
|
|
@@ -91,17 +97,24 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
|
|
{
|
|
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
|
|
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
|
|
+ return 0;
|
|
+
|
|
return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
|
|
bch2_bkey_ptrs_need_move(c, opts, ptrs);
|
|
}
|
|
|
|
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
|
|
{
|
|
- const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+
|
|
+ const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
|
|
if (!opts)
|
|
return 0;
|
|
|
|
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
|
|
+ return 0;
|
|
+
|
|
const union bch_extent_entry *entry;
|
|
struct extent_ptr_decoded p;
|
|
u64 sectors = 0;
|
|
@@ -121,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
|
|
}
|
|
}
|
|
incompressible:
|
|
- if (opts->background_target)
|
|
+ if (opts->background_target) {
|
|
+ rcu_read_lock();
|
|
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
|
|
+ if (!p.ptr.cached &&
|
|
+ !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
|
|
sectors += p.crc.compressed_size;
|
|
+ rcu_read_unlock();
|
|
+ }
|
|
|
|
return sectors;
|
|
}
|
|
@@ -228,7 +245,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
|
|
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
|
|
BTREE_ITER_intent);
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -257,7 +274,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
|
|
int ret = bch2_trans_commit_do(c, NULL, NULL,
|
|
BCH_TRANS_COMMIT_no_enospc,
|
|
bch2_set_rebalance_needs_scan_trans(trans, inum));
|
|
- rebalance_wakeup(c);
|
|
+ bch2_rebalance_wakeup(c);
|
|
return ret;
|
|
}
|
|
|
|
@@ -276,7 +293,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
|
|
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
|
|
BTREE_ITER_intent);
|
|
- k = bch2_btree_iter_peek_slot(&iter);
|
|
+ k = bch2_btree_iter_peek_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -296,7 +313,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
|
|
struct btree_iter *work_iter)
|
|
{
|
|
return !kthread_should_stop()
|
|
- ? bch2_btree_iter_peek(work_iter)
|
|
+ ? bch2_btree_iter_peek(trans, work_iter)
|
|
: bkey_s_c_null;
|
|
}
|
|
|
|
@@ -304,7 +321,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
struct bkey_s_c k)
|
|
{
|
|
- if (!bch2_bkey_rebalance_opts(k))
|
|
+ if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
|
|
return 0;
|
|
|
|
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
|
|
@@ -330,7 +347,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
|
|
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
|
|
work_pos,
|
|
BTREE_ITER_all_snapshots);
|
|
- struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
|
|
if (bkey_err(k))
|
|
return k;
|
|
|
|
@@ -341,7 +358,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
|
|
memset(data_opts, 0, sizeof(*data_opts));
|
|
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
|
|
data_opts->target = io_opts->background_target;
|
|
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
|
|
+ data_opts->write_flags |= BCH_WRITE_only_specified_devs;
|
|
|
|
if (!data_opts->rewrite_ptrs) {
|
|
/*
|
|
@@ -442,22 +459,11 @@ static int do_rebalance_extent(struct moving_context *ctxt,
|
|
return ret;
|
|
}
|
|
|
|
-static bool rebalance_pred(struct bch_fs *c, void *arg,
|
|
- struct bkey_s_c k,
|
|
- struct bch_io_opts *io_opts,
|
|
- struct data_update_opts *data_opts)
|
|
-{
|
|
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
|
|
- data_opts->target = io_opts->background_target;
|
|
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
|
|
- return data_opts->rewrite_ptrs != 0;
|
|
-}
|
|
-
|
|
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
|
|
{
|
|
struct btree_trans *trans = ctxt->trans;
|
|
+ struct bch_fs *c = trans->c;
|
|
struct bch_fs_rebalance *r = &trans->c->rebalance;
|
|
- int ret;
|
|
|
|
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
|
|
ctxt->stats = &r->scan_stats;
|
|
@@ -472,11 +478,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
|
|
|
|
r->state = BCH_REBALANCE_scanning;
|
|
|
|
- ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
|
|
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
|
|
+ struct per_snapshot_io_opts snapshot_io_opts;
|
|
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
|
|
+
|
|
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
|
|
+ r->scan_start.pos, r->scan_end.pos,
|
|
+ BTREE_ITER_all_snapshots|
|
|
+ BTREE_ITER_not_extents|
|
|
+ BTREE_ITER_prefetch, k, ({
|
|
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
|
|
|
|
+ struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
|
|
+ &snapshot_io_opts, iter.pos, &iter, k);
|
|
+ PTR_ERR_OR_ZERO(io_opts);
|
|
+ })) ?:
|
|
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
|
|
+
|
|
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
|
|
bch2_move_stats_exit(&r->scan_stats, trans->c);
|
|
+
|
|
+ /*
|
|
+ * Ensure that the rebalance_work entries we created are seen by the
|
|
+ * next iteration of do_rebalance(), so we don't end up stuck in
|
|
+ * rebalance_wait():
|
|
+ */
|
|
+ atomic64_inc(&r->scan_stats.sectors_seen);
|
|
+ bch2_btree_write_buffer_flush_sync(trans);
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
@@ -501,12 +530,19 @@ static void rebalance_wait(struct bch_fs *c)
|
|
bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
|
|
}
|
|
|
|
+static bool bch2_rebalance_enabled(struct bch_fs *c)
|
|
+{
|
|
+ return c->opts.rebalance_enabled &&
|
|
+ !(c->opts.rebalance_on_ac_only &&
|
|
+ c->rebalance.on_battery);
|
|
+}
|
|
+
|
|
static int do_rebalance(struct moving_context *ctxt)
|
|
{
|
|
struct btree_trans *trans = ctxt->trans;
|
|
struct bch_fs *c = trans->c;
|
|
struct bch_fs_rebalance *r = &c->rebalance;
|
|
- struct btree_iter rebalance_work_iter, extent_iter = { NULL };
|
|
+ struct btree_iter rebalance_work_iter, extent_iter = {};
|
|
struct bkey_s_c k;
|
|
int ret = 0;
|
|
|
|
@@ -520,9 +556,9 @@ static int do_rebalance(struct moving_context *ctxt)
|
|
BTREE_ITER_all_snapshots);
|
|
|
|
while (!bch2_move_ratelimit(ctxt)) {
|
|
- if (!c->opts.rebalance_enabled) {
|
|
+ if (!bch2_rebalance_enabled(c)) {
|
|
bch2_moving_ctxt_flush_all(ctxt);
|
|
- kthread_wait_freezable(c->opts.rebalance_enabled ||
|
|
+ kthread_wait_freezable(bch2_rebalance_enabled(c) ||
|
|
kthread_should_stop());
|
|
}
|
|
|
|
@@ -547,7 +583,7 @@ static int do_rebalance(struct moving_context *ctxt)
|
|
if (ret)
|
|
break;
|
|
|
|
- bch2_btree_iter_advance(&rebalance_work_iter);
|
|
+ bch2_btree_iter_advance(trans, &rebalance_work_iter);
|
|
}
|
|
|
|
bch2_trans_iter_exit(trans, &extent_iter);
|
|
@@ -576,6 +612,13 @@ static int bch2_rebalance_thread(void *arg)
|
|
|
|
set_freezable();
|
|
|
|
+ /*
|
|
+ * Data move operations can't run until after check_snapshots has
|
|
+ * completed, and bch2_snapshot_is_ancestor() is available.
|
|
+ */
|
|
+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
|
|
+ kthread_should_stop());
|
|
+
|
|
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
|
|
writepoint_ptr(&c->rebalance_write_point),
|
|
true);
|
|
@@ -590,8 +633,20 @@ static int bch2_rebalance_thread(void *arg)
|
|
|
|
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
|
|
{
|
|
+ printbuf_tabstop_push(out, 32);
|
|
+
|
|
struct bch_fs_rebalance *r = &c->rebalance;
|
|
|
|
+ /* print pending work */
|
|
+ struct disk_accounting_pos acc;
|
|
+ disk_accounting_key_init(acc, rebalance_work);
|
|
+ u64 v;
|
|
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
|
|
+
|
|
+ prt_printf(out, "pending work:\t");
|
|
+ prt_human_readable_u64(out, v << 9);
|
|
+ prt_printf(out, "\n\n");
|
|
+
|
|
prt_str(out, bch2_rebalance_state_strs[r->state]);
|
|
prt_newline(out);
|
|
printbuf_indent_add(out, 2);
|
|
@@ -600,15 +655,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
|
|
case BCH_REBALANCE_waiting: {
|
|
u64 now = atomic64_read(&c->io_clock[WRITE].now);
|
|
|
|
- prt_str(out, "io wait duration: ");
|
|
+ prt_printf(out, "io wait duration:\t");
|
|
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
|
|
prt_newline(out);
|
|
|
|
- prt_str(out, "io wait remaining: ");
|
|
+ prt_printf(out, "io wait remaining:\t");
|
|
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
|
|
prt_newline(out);
|
|
|
|
- prt_str(out, "duration waited: ");
|
|
+ prt_printf(out, "duration waited:\t");
|
|
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
|
|
prt_newline(out);
|
|
break;
|
|
@@ -621,6 +676,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
|
|
break;
|
|
}
|
|
prt_newline(out);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ struct task_struct *t = rcu_dereference(c->rebalance.thread);
|
|
+ if (t)
|
|
+ get_task_struct(t);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (t) {
|
|
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
|
|
+ put_task_struct(t);
|
|
+ }
|
|
+
|
|
printbuf_indent_sub(out, 2);
|
|
}
|
|
|
|
@@ -635,7 +702,7 @@ void bch2_rebalance_stop(struct bch_fs *c)
|
|
c->rebalance.thread = NULL;
|
|
|
|
if (p) {
|
|
- /* for sychronizing with rebalance_wakeup() */
|
|
+	/* for synchronizing with bch2_rebalance_wakeup() */
|
|
synchronize_rcu();
|
|
|
|
kthread_stop(p);
|
|
@@ -666,7 +733,156 @@ int bch2_rebalance_start(struct bch_fs *c)
|
|
return 0;
|
|
}
|
|
|
|
-void bch2_fs_rebalance_init(struct bch_fs *c)
|
|
+#ifdef CONFIG_POWER_SUPPLY
|
|
+#include <linux/power_supply.h>
|
|
+
|
|
+static int bch2_rebalance_power_notifier(struct notifier_block *nb,
|
|
+ unsigned long event, void *data)
|
|
+{
|
|
+ struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
|
|
+
|
|
+ c->rebalance.on_battery = !power_supply_is_system_supplied();
|
|
+ bch2_rebalance_wakeup(c);
|
|
+ return NOTIFY_OK;
|
|
+}
|
|
+#endif
|
|
+
|
|
+void bch2_fs_rebalance_exit(struct bch_fs *c)
|
|
+{
|
|
+#ifdef CONFIG_POWER_SUPPLY
|
|
+ power_supply_unreg_notifier(&c->rebalance.power_notifier);
|
|
+#endif
|
|
+}
|
|
+
|
|
+int bch2_fs_rebalance_init(struct bch_fs *c)
|
|
{
|
|
- bch2_pd_controller_init(&c->rebalance.pd);
|
|
+ struct bch_fs_rebalance *r = &c->rebalance;
|
|
+
|
|
+ bch2_pd_controller_init(&r->pd);
|
|
+
|
|
+#ifdef CONFIG_POWER_SUPPLY
|
|
+ r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
|
|
+ int ret = power_supply_reg_notifier(&r->power_notifier);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ r->on_battery = !power_supply_is_system_supplied();
|
|
+#endif
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int check_rebalance_work_one(struct btree_trans *trans,
|
|
+ struct btree_iter *extent_iter,
|
|
+ struct btree_iter *rebalance_iter,
|
|
+ struct bkey_buf *last_flushed)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct bkey_s_c extent_k, rebalance_k;
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
|
|
+ bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (!extent_k.k &&
|
|
+ extent_iter->btree_id == BTREE_ID_reflink &&
|
|
+ (!rebalance_k.k ||
|
|
+ rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
|
|
+ bch2_trans_iter_exit(trans, extent_iter);
|
|
+ bch2_trans_iter_init(trans, extent_iter,
|
|
+ BTREE_ID_extents, POS_MIN,
|
|
+ BTREE_ITER_prefetch|
|
|
+ BTREE_ITER_all_snapshots);
|
|
+ return -BCH_ERR_transaction_restart_nested;
|
|
+ }
|
|
+
|
|
+ if (!extent_k.k && !rebalance_k.k)
|
|
+ return 1;
|
|
+
|
|
+ int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
|
|
+ rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
|
|
+
|
|
+ struct bkey deleted;
|
|
+ bkey_init(&deleted);
|
|
+
|
|
+ if (cmp < 0) {
|
|
+ deleted.p = extent_k.k->p;
|
|
+ rebalance_k.k = &deleted;
|
|
+ } else if (cmp > 0) {
|
|
+ deleted.p = rebalance_k.k->p;
|
|
+ extent_k.k = &deleted;
|
|
+ }
|
|
+
|
|
+ bool should_have_rebalance =
|
|
+ bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
|
|
+ bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
|
|
+
|
|
+ if (should_have_rebalance != have_rebalance) {
|
|
+ ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bch2_bkey_val_to_text(&buf, c, extent_k);
|
|
+ }
|
|
+
|
|
+ if (fsck_err_on(!should_have_rebalance && have_rebalance,
|
|
+ trans, rebalance_work_incorrectly_set,
|
|
+ "rebalance work incorrectly set\n%s", buf.buf)) {
|
|
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
|
|
+ extent_k.k->p, false);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (fsck_err_on(should_have_rebalance && !have_rebalance,
|
|
+ trans, rebalance_work_incorrectly_unset,
|
|
+ "rebalance work incorrectly unset\n%s", buf.buf)) {
|
|
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
|
|
+ extent_k.k->p, true);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (cmp <= 0)
|
|
+ bch2_btree_iter_advance(trans, extent_iter);
|
|
+ if (cmp >= 0)
|
|
+ bch2_btree_iter_advance(trans, rebalance_iter);
|
|
+err:
|
|
+fsck_err:
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_check_rebalance_work(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_trans *trans = bch2_trans_get(c);
|
|
+ struct btree_iter rebalance_iter, extent_iter;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_iter_init(trans, &extent_iter,
|
|
+ BTREE_ID_reflink, POS_MIN,
|
|
+ BTREE_ITER_prefetch);
|
|
+ bch2_trans_iter_init(trans, &rebalance_iter,
|
|
+ BTREE_ID_rebalance_work, POS_MIN,
|
|
+ BTREE_ITER_prefetch);
|
|
+
|
|
+ struct bkey_buf last_flushed;
|
|
+ bch2_bkey_buf_init(&last_flushed);
|
|
+ bkey_init(&last_flushed.k->k);
|
|
+
|
|
+ while (!ret) {
|
|
+ bch2_trans_begin(trans);
|
|
+
|
|
+ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
|
|
+
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
+ ret = 0;
|
|
+ }
|
|
+
|
|
+ bch2_bkey_buf_exit(&last_flushed, c);
|
|
+ bch2_trans_iter_exit(trans, &extent_iter);
|
|
+ bch2_trans_iter_exit(trans, &rebalance_iter);
|
|
+ bch2_trans_put(trans);
|
|
+ return ret < 0 ? ret : 0;
|
|
}
|
|
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
|
|
index 62a3859d3823..5d9214fe1a22 100644
|
|
--- a/fs/bcachefs/rebalance.h
|
|
+++ b/fs/bcachefs/rebalance.h
|
|
@@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
|
|
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
|
|
int bch2_set_fs_needs_rebalance(struct bch_fs *);
|
|
|
|
-static inline void rebalance_wakeup(struct bch_fs *c)
|
|
+static inline void bch2_rebalance_wakeup(struct bch_fs *c)
|
|
{
|
|
struct task_struct *p;
|
|
|
|
@@ -52,6 +52,10 @@ void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
|
|
|
|
void bch2_rebalance_stop(struct bch_fs *);
|
|
int bch2_rebalance_start(struct bch_fs *);
|
|
-void bch2_fs_rebalance_init(struct bch_fs *);
|
|
+
|
|
+void bch2_fs_rebalance_exit(struct bch_fs *);
|
|
+int bch2_fs_rebalance_init(struct bch_fs *);
|
|
+
|
|
+int bch2_check_rebalance_work(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_REBALANCE_H */
|
|
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
|
|
index fe5098c17dfc..33d77286f1d5 100644
|
|
--- a/fs/bcachefs/rebalance_types.h
|
|
+++ b/fs/bcachefs/rebalance_types.h
|
|
@@ -30,6 +30,11 @@ struct bch_fs_rebalance {
|
|
struct bbpos scan_start;
|
|
struct bbpos scan_end;
|
|
struct bch_move_stats scan_stats;
|
|
+
|
|
+ bool on_battery;
|
|
+#ifdef CONFIG_POWER_SUPPLY
|
|
+ struct notifier_block power_notifier;
|
|
+#endif
|
|
};
|
|
|
|
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
|
|
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
|
|
index 71c786cdb192..4fca57575565 100644
|
|
--- a/fs/bcachefs/recovery.c
|
|
+++ b/fs/bcachefs/recovery.c
|
|
@@ -13,12 +13,13 @@
|
|
#include "disk_accounting.h"
|
|
#include "errcode.h"
|
|
#include "error.h"
|
|
-#include "fs-common.h"
|
|
#include "journal_io.h"
|
|
#include "journal_reclaim.h"
|
|
#include "journal_seq_blacklist.h"
|
|
#include "logged_ops.h"
|
|
#include "move.h"
|
|
+#include "movinggc.h"
|
|
+#include "namei.h"
|
|
#include "quota.h"
|
|
#include "rebalance.h"
|
|
#include "recovery.h"
|
|
@@ -32,8 +33,9 @@
|
|
#include <linux/sort.h>
|
|
#include <linux/stat.h>
|
|
|
|
-
|
|
-int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
|
|
+int bch2_btree_lost_data(struct bch_fs *c,
|
|
+ struct printbuf *msg,
|
|
+ enum btree_id btree)
|
|
{
|
|
u64 b = BIT_ULL(btree);
|
|
int ret = 0;
|
|
@@ -42,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
|
|
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
|
|
if (!(c->sb.btrees_lost_data & b)) {
|
|
- struct printbuf buf = PRINTBUF;
|
|
- bch2_btree_id_to_text(&buf, btree);
|
|
- bch_err(c, "flagging btree %s lost data", buf.buf);
|
|
- printbuf_exit(&buf);
|
|
+ prt_printf(msg, "flagging btree ");
|
|
+ bch2_btree_id_to_text(msg, btree);
|
|
+ prt_printf(msg, " lost data\n");
|
|
+
|
|
ext->btrees_lost_data |= cpu_to_le64(b);
|
|
}
|
|
|
|
/* Once we have runtime self healing for topology errors we won't need this: */
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
|
|
|
|
/* Btree node accounting will be off: */
|
|
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
|
|
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
/*
|
|
* These are much more minor, and don't need to be corrected right away,
|
|
* but in debug mode we want the next fsck run to be clean:
|
|
*/
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
|
|
#endif
|
|
|
|
switch (btree) {
|
|
case BTREE_ID_alloc:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
|
|
|
|
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
|
|
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
|
|
@@ -77,26 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
|
|
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
|
|
goto out;
|
|
case BTREE_ID_backpointers:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
|
|
goto out;
|
|
case BTREE_ID_need_discard:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
|
|
goto out;
|
|
case BTREE_ID_freespace:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
|
|
goto out;
|
|
case BTREE_ID_bucket_gens:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
|
|
goto out;
|
|
case BTREE_ID_lru:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
|
|
goto out;
|
|
case BTREE_ID_accounting:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
|
|
+ goto out;
|
|
+ case BTREE_ID_snapshots:
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
|
|
goto out;
|
|
default:
|
|
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
|
|
goto out;
|
|
}
|
|
out:
|
|
@@ -113,11 +119,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree)
|
|
}
|
|
|
|
/* for -o reconstruct_alloc: */
|
|
-static void bch2_reconstruct_alloc(struct bch_fs *c)
|
|
+void bch2_reconstruct_alloc(struct bch_fs *c)
|
|
{
|
|
- bch2_journal_log_msg(c, "dropping alloc info");
|
|
- bch_info(c, "dropping and reconstructing all alloc info");
|
|
-
|
|
mutex_lock(&c->sb_lock);
|
|
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
|
|
@@ -159,6 +162,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
|
|
|
|
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
|
|
|
|
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
|
|
+
|
|
bch2_write_super(c);
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
@@ -198,7 +203,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
|
|
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
|
|
BTREE_MAX_DEPTH, k->level,
|
|
BTREE_ITER_intent);
|
|
- int ret = bch2_btree_iter_traverse(&iter);
|
|
+ int ret = bch2_btree_iter_traverse(trans, &iter);
|
|
if (ret)
|
|
goto out;
|
|
|
|
@@ -261,7 +266,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
|
|
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
|
|
BTREE_MAX_DEPTH, k->level,
|
|
iter_flags);
|
|
- ret = bch2_btree_iter_traverse(&iter);
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter);
|
|
if (ret)
|
|
goto out;
|
|
|
|
@@ -270,7 +275,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
|
|
BTREE_MAX_DEPTH, 0, iter_flags);
|
|
- ret = bch2_btree_iter_traverse(&iter) ?:
|
|
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_btree_increase_depth(trans, iter.path, 0) ?:
|
|
-BCH_ERR_transaction_restart_nested;
|
|
goto out;
|
|
@@ -281,7 +286,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
|
|
goto out;
|
|
|
|
if (k->k->k.type == KEY_TYPE_accounting) {
|
|
- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
|
|
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
|
|
+ ret = PTR_ERR_OR_ZERO(n);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ bkey_copy(n, k->k);
|
|
goto out;
|
|
}
|
|
|
|
@@ -389,9 +399,9 @@ int bch2_journal_replay(struct bch_fs *c)
|
|
* Now, replay any remaining keys in the order in which they appear in
|
|
* the journal, unpinning those journal entries as we go:
|
|
*/
|
|
- sort(keys_sorted.data, keys_sorted.nr,
|
|
- sizeof(keys_sorted.data[0]),
|
|
- journal_sort_seq_cmp, NULL);
|
|
+ sort_nonatomic(keys_sorted.data, keys_sorted.nr,
|
|
+ sizeof(keys_sorted.data[0]),
|
|
+ journal_sort_seq_cmp, NULL);
|
|
|
|
darray_for_each(keys_sorted, kp) {
|
|
cond_resched();
|
|
@@ -429,7 +439,7 @@ int bch2_journal_replay(struct bch_fs *c)
|
|
trans = NULL;
|
|
|
|
if (!c->opts.retain_recovery_info &&
|
|
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
|
|
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
|
|
bch2_journal_keys_put_initial(c);
|
|
|
|
replay_now_at(j, j->replay_journal_seq_end);
|
|
@@ -584,9 +594,6 @@ static int read_btree_roots(struct bch_fs *c)
|
|
buf.buf, bch2_err_str(ret))) {
|
|
if (btree_id_is_alloc(i))
|
|
r->error = 0;
|
|
-
|
|
- ret = bch2_btree_lost_data(c, i);
|
|
- BUG_ON(ret);
|
|
}
|
|
}
|
|
|
|
@@ -666,7 +673,7 @@ static bool check_version_upgrade(struct bch_fs *c)
|
|
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
|
|
}
|
|
|
|
- bch_info(c, "%s", buf.buf);
|
|
+ bch_notice(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
|
|
ret = true;
|
|
@@ -682,7 +689,7 @@ static bool check_version_upgrade(struct bch_fs *c)
|
|
bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
|
|
prt_newline(&buf);
|
|
|
|
- bch_info(c, "%s", buf.buf);
|
|
+ bch_notice(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
|
|
ret = true;
|
|
@@ -789,11 +796,11 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
bch2_write_super(c);
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
- if (c->opts.fsck)
|
|
- set_bit(BCH_FS_fsck_running, &c->flags);
|
|
if (c->sb.clean)
|
|
set_bit(BCH_FS_clean_recovery, &c->flags);
|
|
- set_bit(BCH_FS_recovery_running, &c->flags);
|
|
+ if (c->opts.fsck)
|
|
+ set_bit(BCH_FS_in_fsck, &c->flags);
|
|
+ set_bit(BCH_FS_in_recovery, &c->flags);
|
|
|
|
ret = bch2_blacklist_table_initialize(c);
|
|
if (ret) {
|
|
@@ -888,8 +895,37 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (c->opts.reconstruct_alloc)
|
|
+ ret = bch2_fs_resize_on_mount(c);
|
|
+ if (ret) {
|
|
+ up_write(&c->state_lock);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
|
|
+ bch_info(c, "filesystem is an unresized image file, mounting ro");
|
|
+ c->opts.read_only = true;
|
|
+ }
|
|
+
|
|
+ if (!c->opts.read_only &&
|
|
+ (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
|
|
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
|
|
+
|
|
bch2_reconstruct_alloc(c);
|
|
+ } else if (c->opts.reconstruct_alloc) {
|
|
+ bch2_journal_log_msg(c, "dropping alloc info");
|
|
+ bch_info(c, "dropping and reconstructing all alloc info");
|
|
+
|
|
+ bch2_reconstruct_alloc(c);
|
|
+ }
|
|
+
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
|
|
+ /* We can't go RW to fix errors without alloc info */
|
|
+ if (c->opts.fix_errors == FSCK_FIX_yes ||
|
|
+ c->opts.fix_errors == FSCK_FIX_ask)
|
|
+ c->opts.fix_errors = FSCK_FIX_no;
|
|
+ if (c->opts.errors == BCH_ON_ERROR_fix_safe)
|
|
+ c->opts.errors = BCH_ON_ERROR_continue;
|
|
+ }
|
|
|
|
/*
|
|
* After an unclean shutdown, skip then next few journal sequence
|
|
@@ -899,7 +935,7 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
* journal sequence numbers:
|
|
*/
|
|
if (!c->sb.clean)
|
|
- journal_seq += 8;
|
|
+ journal_seq += JOURNAL_BUF_NR * 4;
|
|
|
|
if (blacklist_seq != journal_seq) {
|
|
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
|
|
@@ -932,8 +968,10 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
set_bit(BCH_FS_btree_running, &c->flags);
|
|
|
|
ret = bch2_sb_set_upgrade_extra(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- ret = bch2_run_recovery_passes(c);
|
|
+ ret = bch2_run_recovery_passes(c, 0);
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -944,8 +982,7 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
* multithreaded use:
|
|
*/
|
|
set_bit(BCH_FS_may_go_rw, &c->flags);
|
|
- clear_bit(BCH_FS_fsck_running, &c->flags);
|
|
- clear_bit(BCH_FS_recovery_running, &c->flags);
|
|
+ clear_bit(BCH_FS_in_fsck, &c->flags);
|
|
|
|
/* in case we don't run journal replay, i.e. norecovery mode */
|
|
set_bit(BCH_FS_accounting_replay_done, &c->flags);
|
|
@@ -968,9 +1005,8 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
|
|
clear_bit(BCH_FS_errors_fixed, &c->flags);
|
|
|
|
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
|
|
-
|
|
- ret = bch2_run_recovery_passes(c);
|
|
+ ret = bch2_run_recovery_passes(c,
|
|
+ BCH_RECOVERY_PASS_check_alloc_info);
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -1014,7 +1050,7 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
|
|
if (c->opts.fsck &&
|
|
!test_bit(BCH_FS_error, &c->flags) &&
|
|
- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
|
|
+ c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
|
|
ext->btrees_lost_data) {
|
|
ext->btrees_lost_data = 0;
|
|
write_sb = true;
|
|
@@ -1075,8 +1111,17 @@ int bch2_fs_recovery(struct bch_fs *c)
|
|
return ret;
|
|
err:
|
|
fsck_err:
|
|
- bch2_fs_emergency_read_only(c);
|
|
- goto out;
|
|
+ {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret));
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ }
|
|
+ return ret;
|
|
}
|
|
|
|
int bch2_fs_initialize(struct bch_fs *c)
|
|
@@ -1125,14 +1170,17 @@ int bch2_fs_initialize(struct bch_fs *c)
|
|
* journal_res_get() will crash if called before this has
|
|
* set up the journal.pin FIFO and journal.cur pointer:
|
|
*/
|
|
- bch2_fs_journal_start(&c->journal, 1);
|
|
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
|
|
- bch2_journal_set_replay_done(&c->journal);
|
|
+ ret = bch2_fs_journal_start(&c->journal, 1);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
ret = bch2_fs_read_write_early(c);
|
|
if (ret)
|
|
goto err;
|
|
|
|
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
|
|
+ bch2_journal_set_replay_done(&c->journal);
|
|
+
|
|
for_each_member_device(c, ca) {
|
|
ret = bch2_dev_usage_init(ca, false);
|
|
if (ret) {
|
|
@@ -1189,7 +1237,10 @@ int bch2_fs_initialize(struct bch_fs *c)
|
|
if (ret)
|
|
goto err;
|
|
|
|
- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
|
|
+ c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
|
|
+
|
|
+ bch2_copygc_wakeup(c);
|
|
+ bch2_rebalance_wakeup(c);
|
|
|
|
if (enabled_qtypes(c)) {
|
|
ret = bch2_fs_quota_read(c);
|
|
@@ -1209,7 +1260,7 @@ int bch2_fs_initialize(struct bch_fs *c)
|
|
bch2_write_super(c);
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
|
|
+ c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
|
|
return 0;
|
|
err:
|
|
bch_err_fn(c, ret);
|
|
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
|
|
index b0d55754b21b..c023f52fc2d6 100644
|
|
--- a/fs/bcachefs/recovery.h
|
|
+++ b/fs/bcachefs/recovery.h
|
|
@@ -2,7 +2,8 @@
|
|
#ifndef _BCACHEFS_RECOVERY_H
|
|
#define _BCACHEFS_RECOVERY_H
|
|
|
|
-int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
|
|
+int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
|
|
+void bch2_reconstruct_alloc(struct bch_fs *);
|
|
|
|
int bch2_journal_replay(struct bch_fs *);
|
|
|
|
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
|
|
index 0b3c951c32da..f74f14227137 100644
|
|
--- a/fs/bcachefs/recovery_passes.c
|
|
+++ b/fs/bcachefs/recovery_passes.c
|
|
@@ -12,6 +12,7 @@
|
|
#include "journal.h"
|
|
#include "lru.h"
|
|
#include "logged_ops.h"
|
|
+#include "movinggc.h"
|
|
#include "rebalance.h"
|
|
#include "recovery.h"
|
|
#include "recovery_passes.h"
|
|
@@ -27,6 +28,145 @@ const char * const bch2_recovery_passes[] = {
|
|
NULL
|
|
};
|
|
|
|
+static const u8 passes_to_stable_map[] = {
|
|
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
|
|
+ BCH_RECOVERY_PASSES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+static const u8 passes_from_stable_map[] = {
|
|
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
|
|
+ BCH_RECOVERY_PASSES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
|
|
+{
|
|
+ return passes_to_stable_map[pass];
|
|
+}
|
|
+
|
|
+u64 bch2_recovery_passes_to_stable(u64 v)
|
|
+{
|
|
+ u64 ret = 0;
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
|
|
+ if (v & BIT_ULL(i))
|
|
+ ret |= BIT_ULL(passes_to_stable_map[i]);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
|
|
+{
|
|
+ return pass < ARRAY_SIZE(passes_from_stable_map)
|
|
+ ? passes_from_stable_map[pass]
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+u64 bch2_recovery_passes_from_stable(u64 v)
|
|
+{
|
|
+ u64 ret = 0;
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
|
|
+ if (v & BIT_ULL(i))
|
|
+ ret |= BIT_ULL(passes_from_stable_map[i]);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
|
|
+ enum bch_validate_flags flags, struct printbuf *err)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
|
|
+ struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ struct bch_sb_field_recovery_passes *r =
|
|
+ field_to_type(f, recovery_passes);
|
|
+ unsigned nr = recovery_passes_nr_entries(r);
|
|
+
|
|
+ if (out->nr_tabstops < 1)
|
|
+ printbuf_tabstop_push(out, 32);
|
|
+ if (out->nr_tabstops < 2)
|
|
+ printbuf_tabstop_push(out, 16);
|
|
+
|
|
+ prt_printf(out, "Pass\tLast run\tLast runtime\n");
|
|
+
|
|
+ for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
|
|
+ if (!i->last_run)
|
|
+ continue;
|
|
+
|
|
+ unsigned idx = i - r->start;
|
|
+
|
|
+ prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);
|
|
+
|
|
+ bch2_prt_datetime(out, le64_to_cpu(i->last_run));
|
|
+ prt_tab(out);
|
|
+
|
|
+ bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
|
|
+ prt_newline(out);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
|
|
+ enum bch_recovery_pass pass,
|
|
+ s64 start_time)
|
|
+{
|
|
+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
|
|
+ s64 end_time = ktime_get_real_seconds();
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
+ __clear_bit_le64(stable, ext->recovery_passes_required);
|
|
+
|
|
+ struct bch_sb_field_recovery_passes *r =
|
|
+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
|
|
+
|
|
+ if (stable >= recovery_passes_nr_entries(r)) {
|
|
+ unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);
|
|
+
|
|
+ r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
|
|
+ if (!r) {
|
|
+ bch_err(c, "error creating recovery_passes sb section");
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ r->start[stable].last_run = cpu_to_le64(end_time);
|
|
+ r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time));
|
|
+out:
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+}
|
|
+
|
|
+static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
|
|
+{
|
|
+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
|
|
+ bool ret = false;
|
|
+
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
+
|
|
+ struct bch_sb_field_recovery_passes *r =
|
|
+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
|
|
+
|
|
+ if (stable < recovery_passes_nr_entries(r)) {
|
|
+ struct recovery_pass_entry *i = r->start + stable;
|
|
+
|
|
+ /*
|
|
+ * Ratelimit if the last runtime was more than 1% of the time
|
|
+ * since we last ran
|
|
+ */
|
|
+ ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
|
|
+ ktime_get_real_seconds() - le64_to_cpu(i->last_run);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
|
|
+ .validate = bch2_sb_recovery_passes_validate,
|
|
+ .to_text = bch2_sb_recovery_passes_to_text
|
|
+};
|
|
+
|
|
/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
|
|
static int bch2_recovery_pass_empty(struct bch_fs *c)
|
|
{
|
|
@@ -46,11 +186,36 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
|
|
|
|
set_bit(BCH_FS_may_go_rw, &c->flags);
|
|
|
|
- if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
|
|
+ if (keys->nr ||
|
|
+ !c->opts.read_only ||
|
|
+ !c->sb.clean ||
|
|
+ c->opts.recovery_passes ||
|
|
+ (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) {
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
|
|
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
|
|
+ bch2_reconstruct_alloc(c);
|
|
+ }
|
|
+
|
|
return bch2_fs_read_write_early(c);
|
|
+ }
|
|
return 0;
|
|
}
|
|
|
|
+/*
|
|
+ * Make sure root inode is readable while we're still in recovery and can rewind
|
|
+ * for repair:
|
|
+ */
|
|
+static int bch2_lookup_root_inode(struct bch_fs *c)
|
|
+{
|
|
+ subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ struct bch_subvolume subvol;
|
|
+
|
|
+ return bch2_trans_do(c,
|
|
+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
|
|
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
|
|
+}
|
|
+
|
|
struct recovery_pass_fn {
|
|
int (*fn)(struct bch_fs *);
|
|
unsigned when;
|
|
@@ -62,255 +227,348 @@ static struct recovery_pass_fn recovery_pass_fns[] = {
|
|
#undef x
|
|
};
|
|
|
|
-static const u8 passes_to_stable_map[] = {
|
|
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
|
|
- BCH_RECOVERY_PASSES()
|
|
-#undef x
|
|
-};
|
|
+static u64 bch2_recovery_passes_match(unsigned flags)
|
|
+{
|
|
+ u64 ret = 0;
|
|
|
|
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
|
|
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
|
|
+ if (recovery_pass_fns[i].when & flags)
|
|
+ ret |= BIT_ULL(i);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+u64 bch2_fsck_recovery_passes(void)
|
|
{
|
|
- return passes_to_stable_map[pass];
|
|
+ return bch2_recovery_passes_match(PASS_FSCK);
|
|
}
|
|
|
|
-u64 bch2_recovery_passes_to_stable(u64 v)
|
|
+static void bch2_run_async_recovery_passes(struct bch_fs *c)
|
|
{
|
|
- u64 ret = 0;
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
|
|
- if (v & BIT_ULL(i))
|
|
- ret |= BIT_ULL(passes_to_stable_map[i]);
|
|
- return ret;
|
|
+ if (!down_trylock(&c->recovery.run_lock))
|
|
+ return;
|
|
+
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
|
|
+ goto unlock;
|
|
+
|
|
+ if (queue_work(system_long_wq, &c->recovery.work))
|
|
+ return;
|
|
+
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
|
|
+unlock:
|
|
+ up(&c->recovery.run_lock);
|
|
}
|
|
|
|
-u64 bch2_recovery_passes_from_stable(u64 v)
|
|
+static bool recovery_pass_needs_set(struct bch_fs *c,
|
|
+ enum bch_recovery_pass pass,
|
|
+ enum bch_run_recovery_pass_flags *flags)
|
|
{
|
|
- static const u8 map[] = {
|
|
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
|
|
- BCH_RECOVERY_PASSES()
|
|
-#undef x
|
|
- };
|
|
+ struct bch_fs_recovery *r = &c->recovery;
|
|
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
|
|
+ bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
|
|
|
|
- u64 ret = 0;
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
|
|
- if (v & BIT_ULL(i))
|
|
- ret |= BIT_ULL(map[i]);
|
|
- return ret;
|
|
+ if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
|
|
+ !bch2_recovery_pass_want_ratelimit(c, pass))
|
|
+ *flags &= ~RUN_RECOVERY_PASS_ratelimit;
|
|
+
|
|
+ /*
|
|
+ * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
|
|
+ * anything if the pass has already run: these mean we need a prior pass
|
|
+ * to run before we continue to repair, we don't expect that pass to fix
|
|
+ * the damage we encountered.
|
|
+ *
|
|
+ * Otherwise, we run run_explicit_recovery_pass when we find damage, so
|
|
+ * it should run again even if it's already run:
|
|
+ */
|
|
+
|
|
+ if (persistent
|
|
+ ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
|
|
+ : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
|
|
+ return true;
|
|
+
|
|
+ if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
|
|
+ (r->passes_ratelimiting & BIT_ULL(pass)))
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
}
|
|
|
|
/*
|
|
* For when we need to rewind recovery passes and run a pass we skipped:
|
|
*/
|
|
-static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
|
|
- enum bch_recovery_pass pass)
|
|
+int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
|
|
+ struct printbuf *out,
|
|
+ enum bch_recovery_pass pass,
|
|
+ enum bch_run_recovery_pass_flags flags)
|
|
{
|
|
- if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
|
|
- return -BCH_ERR_not_in_recovery;
|
|
+ struct bch_fs_recovery *r = &c->recovery;
|
|
+ int ret = 0;
|
|
|
|
- if (c->recovery_passes_complete & BIT_ULL(pass))
|
|
- return 0;
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
|
|
- bool print = !(c->opts.recovery_passes & BIT_ULL(pass));
|
|
+ bch2_printbuf_make_room(out, 1024);
|
|
+ out->atomic++;
|
|
|
|
- if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
|
|
- c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
|
|
- if (print)
|
|
- bch_info(c, "need recovery pass %s (%u), but already rw",
|
|
- bch2_recovery_passes[pass], pass);
|
|
- return -BCH_ERR_cannot_rewind_recovery;
|
|
- }
|
|
+ unsigned long lockflags;
|
|
+ spin_lock_irqsave(&r->lock, lockflags);
|
|
|
|
- if (print)
|
|
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
|
|
- bch2_recovery_passes[pass], pass,
|
|
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
|
|
+ if (!recovery_pass_needs_set(c, pass, &flags))
|
|
+ goto out;
|
|
|
|
- c->opts.recovery_passes |= BIT_ULL(pass);
|
|
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
|
|
+ bool rewind = in_recovery && r->curr_pass > pass;
|
|
+ bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
|
|
|
|
- if (c->curr_recovery_pass > pass) {
|
|
- c->next_recovery_pass = pass;
|
|
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
|
|
- return -BCH_ERR_restart_recovery;
|
|
- } else {
|
|
- return 0;
|
|
+ if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) {
|
|
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
|
|
}
|
|
-}
|
|
|
|
-int bch2_run_explicit_recovery_pass(struct bch_fs *c,
|
|
- enum bch_recovery_pass pass)
|
|
-{
|
|
- unsigned long flags;
|
|
- spin_lock_irqsave(&c->recovery_pass_lock, flags);
|
|
- int ret = __bch2_run_explicit_recovery_pass(c, pass);
|
|
- spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
|
|
- return ret;
|
|
-}
|
|
+ if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
|
|
+ (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
|
|
+ prt_printf(out, "need recovery pass %s (%u), but already rw\n",
|
|
+ bch2_recovery_passes[pass], pass);
|
|
+ ret = -BCH_ERR_cannot_rewind_recovery;
|
|
+ goto out;
|
|
+ }
|
|
|
|
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
|
|
- enum bch_recovery_pass pass)
|
|
-{
|
|
- lockdep_assert_held(&c->sb_lock);
|
|
+ if (ratelimit)
|
|
+ r->passes_ratelimiting |= BIT_ULL(pass);
|
|
+ else
|
|
+ r->passes_ratelimiting &= ~BIT_ULL(pass);
|
|
|
|
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
|
|
+ if (in_recovery && !ratelimit) {
|
|
+ prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
|
|
+ bch2_recovery_passes[pass], pass,
|
|
+ bch2_recovery_passes[r->curr_pass], r->curr_pass,
|
|
+ rewind ? " - rewinding" : "");
|
|
|
|
- return bch2_run_explicit_recovery_pass(c, pass);
|
|
-}
|
|
-
|
|
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
|
|
- enum bch_recovery_pass pass)
|
|
-{
|
|
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
|
|
+ r->passes_to_run |= BIT_ULL(pass);
|
|
|
|
- mutex_lock(&c->sb_lock);
|
|
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
+ if (rewind) {
|
|
+ r->next_pass = pass;
|
|
+ r->passes_complete &= (1ULL << pass) >> 1;
|
|
+ ret = -BCH_ERR_restart_recovery;
|
|
+ }
|
|
+ } else {
|
|
+ prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
|
|
+ bch2_recovery_passes[pass], pass,
|
|
+ ratelimit ? " - ratelimiting" : "");
|
|
|
|
- if (!test_bit_le64(s, ext->recovery_passes_required)) {
|
|
- __set_bit_le64(s, ext->recovery_passes_required);
|
|
- bch2_write_super(c);
|
|
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
|
|
+ if (p->when & PASS_ONLINE)
|
|
+ bch2_run_async_recovery_passes(c);
|
|
}
|
|
- mutex_unlock(&c->sb_lock);
|
|
-
|
|
- return bch2_run_explicit_recovery_pass(c, pass);
|
|
+out:
|
|
+ spin_unlock_irqrestore(&r->lock, lockflags);
|
|
+ --out->atomic;
|
|
+ return ret;
|
|
}
|
|
|
|
-static void bch2_clear_recovery_pass_required(struct bch_fs *c,
|
|
- enum bch_recovery_pass pass)
|
|
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
|
|
+ struct printbuf *out,
|
|
+ enum bch_recovery_pass pass,
|
|
+ enum bch_run_recovery_pass_flags flags)
|
|
{
|
|
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
|
|
+ int ret = 0;
|
|
|
|
- mutex_lock(&c->sb_lock);
|
|
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
+ scoped_guard(mutex, &c->sb_lock) {
|
|
+ if (!recovery_pass_needs_set(c, pass, &flags))
|
|
+ return 0;
|
|
|
|
- if (test_bit_le64(s, ext->recovery_passes_required)) {
|
|
- __clear_bit_le64(s, ext->recovery_passes_required);
|
|
+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
|
|
bch2_write_super(c);
|
|
}
|
|
- mutex_unlock(&c->sb_lock);
|
|
-}
|
|
-
|
|
-u64 bch2_fsck_recovery_passes(void)
|
|
-{
|
|
- u64 ret = 0;
|
|
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
|
|
- if (recovery_pass_fns[i].when & PASS_FSCK)
|
|
- ret |= BIT_ULL(i);
|
|
return ret;
|
|
}
|
|
|
|
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
|
|
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
|
|
{
|
|
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
|
|
+ enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent;
|
|
|
|
- if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
|
|
- return false;
|
|
- if (c->opts.recovery_passes & BIT_ULL(pass))
|
|
- return true;
|
|
- if ((p->when & PASS_FSCK) && c->opts.fsck)
|
|
- return true;
|
|
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
|
|
- return true;
|
|
- if (p->when & PASS_ALWAYS)
|
|
- return true;
|
|
- return false;
|
|
+ if (!recovery_pass_needs_set(c, pass, &flags))
|
|
+ return 0;
|
|
+
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
|
|
+ RUN_RECOVERY_PASS_nopersistent);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ bch2_print_str(c, KERN_NOTICE, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
}
|
|
|
|
static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
|
|
{
|
|
+ struct bch_fs_recovery *r = &c->recovery;
|
|
struct recovery_pass_fn *p = recovery_pass_fns + pass;
|
|
- int ret;
|
|
|
|
if (!(p->when & PASS_SILENT))
|
|
bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
|
|
bch2_recovery_passes[pass]);
|
|
- ret = p->fn(c);
|
|
- if (ret)
|
|
+
|
|
+ s64 start_time = ktime_get_real_seconds();
|
|
+ int ret = p->fn(c);
|
|
+
|
|
+ r->passes_to_run &= ~BIT_ULL(pass);
|
|
+
|
|
+ if (ret) {
|
|
+ r->passes_failing |= BIT_ULL(pass);
|
|
return ret;
|
|
+ }
|
|
+
|
|
+ r->passes_failing = 0;
|
|
+
|
|
+ if (!test_bit(BCH_FS_error, &c->flags))
|
|
+ bch2_sb_recovery_pass_complete(c, pass, start_time);
|
|
+
|
|
if (!(p->when & PASS_SILENT))
|
|
bch2_print(c, KERN_CONT " done\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
-int bch2_run_online_recovery_passes(struct bch_fs *c)
|
|
+static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
|
|
+ bool online)
|
|
{
|
|
+ struct bch_fs_recovery *r = &c->recovery;
|
|
int ret = 0;
|
|
|
|
- down_read(&c->state_lock);
|
|
+ spin_lock_irq(&r->lock);
|
|
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
|
|
- struct recovery_pass_fn *p = recovery_pass_fns + i;
|
|
+ if (online)
|
|
+ orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
|
|
|
|
- if (!(p->when & PASS_ONLINE))
|
|
- continue;
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
|
|
+ orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
|
|
|
|
- ret = bch2_run_recovery_pass(c, i);
|
|
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
|
|
- i = c->curr_recovery_pass;
|
|
- continue;
|
|
+ /*
|
|
+ * A failed recovery pass will be retried after another pass succeeds -
|
|
+ * but not this iteration.
|
|
+ *
|
|
+ * This is because some passes depend on repair done by other passes: we
|
|
+ * may want to retry, but we don't want to loop on failing passes.
|
|
+ */
|
|
+
|
|
+ orig_passes_to_run &= ~r->passes_failing;
|
|
+
|
|
+ r->passes_to_run = orig_passes_to_run;
|
|
+
|
|
+ while (r->passes_to_run) {
|
|
+ unsigned prev_done = r->pass_done;
|
|
+ unsigned pass = __ffs64(r->passes_to_run);
|
|
+ r->curr_pass = pass;
|
|
+ r->next_pass = r->curr_pass + 1;
|
|
+ r->passes_to_run &= ~BIT_ULL(pass);
|
|
+
|
|
+ spin_unlock_irq(&r->lock);
|
|
+
|
|
+ int ret2 = bch2_run_recovery_pass(c, pass) ?:
|
|
+ bch2_journal_flush(&c->journal);
|
|
+
|
|
+ spin_lock_irq(&r->lock);
|
|
+
|
|
+ if (r->next_pass < r->curr_pass) {
|
|
+ /* Rewind: */
|
|
+ r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
|
|
+ } else if (!ret2) {
|
|
+ r->pass_done = max(r->pass_done, pass);
|
|
+ r->passes_complete |= BIT_ULL(pass);
|
|
+ } else {
|
|
+ ret = ret2;
|
|
}
|
|
- if (ret)
|
|
+
|
|
+ if (ret && !online)
|
|
break;
|
|
+
|
|
+ if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
|
|
+ r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
|
|
+ bch2_copygc_wakeup(c);
|
|
+ bch2_rebalance_wakeup(c);
|
|
+ }
|
|
}
|
|
|
|
- up_read(&c->state_lock);
|
|
+ clear_bit(BCH_FS_in_recovery, &c->flags);
|
|
+ spin_unlock_irq(&r->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_run_recovery_passes(struct bch_fs *c)
|
|
+static void bch2_async_recovery_passes_work(struct work_struct *work)
|
|
{
|
|
- int ret = 0;
|
|
+ struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
|
|
+ struct bch_fs_recovery *r = &c->recovery;
|
|
+
|
|
+ __bch2_run_recovery_passes(c,
|
|
+ c->sb.recovery_passes_required & ~r->passes_ratelimiting,
|
|
+ true);
|
|
+
|
|
+ up(&r->run_lock);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
|
|
+}
|
|
+
|
|
+int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
|
|
+{
|
|
+ return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
|
|
+}
|
|
+
|
|
+int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
|
|
+{
|
|
+ u64 passes =
|
|
+ bch2_recovery_passes_match(PASS_ALWAYS) |
|
|
+ (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
|
|
+ (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
|
|
+ c->opts.recovery_passes |
|
|
+ c->sb.recovery_passes_required;
|
|
|
|
/*
|
|
* We can't allow set_may_go_rw to be excluded; that would cause us to
|
|
* use the journal replay keys for updates where it's not expected.
|
|
*/
|
|
c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
|
|
+ passes &= ~c->opts.recovery_passes_exclude;
|
|
|
|
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
|
|
- c->next_recovery_pass = c->curr_recovery_pass + 1;
|
|
+ passes &= ~(BIT_ULL(from) - 1);
|
|
|
|
- spin_lock_irq(&c->recovery_pass_lock);
|
|
- unsigned pass = c->curr_recovery_pass;
|
|
-
|
|
- if (c->opts.recovery_pass_last &&
|
|
- c->curr_recovery_pass > c->opts.recovery_pass_last) {
|
|
- spin_unlock_irq(&c->recovery_pass_lock);
|
|
- break;
|
|
- }
|
|
+ down(&c->recovery.run_lock);
|
|
+ int ret = __bch2_run_recovery_passes(c, passes, false);
|
|
+ up(&c->recovery.run_lock);
|
|
|
|
- if (!should_run_recovery_pass(c, pass)) {
|
|
- c->curr_recovery_pass++;
|
|
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
|
|
- spin_unlock_irq(&c->recovery_pass_lock);
|
|
- continue;
|
|
- }
|
|
- spin_unlock_irq(&c->recovery_pass_lock);
|
|
+ return ret;
|
|
+}
|
|
|
|
- ret = bch2_run_recovery_pass(c, pass) ?:
|
|
- bch2_journal_flush(&c->journal);
|
|
+static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
|
|
+{
|
|
+ prt_printf(out, "%s:\t", msg);
|
|
+ prt_bitflags(out, bch2_recovery_passes, passes);
|
|
+ prt_newline(out);
|
|
+}
|
|
|
|
- if (!ret && !test_bit(BCH_FS_error, &c->flags))
|
|
- bch2_clear_recovery_pass_required(c, pass);
|
|
-
|
|
- spin_lock_irq(&c->recovery_pass_lock);
|
|
- if (c->next_recovery_pass < c->curr_recovery_pass) {
|
|
- /*
|
|
- * bch2_run_explicit_recovery_pass() was called: we
|
|
- * can't always catch -BCH_ERR_restart_recovery because
|
|
- * it may have been called from another thread (btree
|
|
- * node read completion)
|
|
- */
|
|
- ret = 0;
|
|
- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
|
|
- } else {
|
|
- c->recovery_passes_complete |= BIT_ULL(pass);
|
|
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
|
|
- }
|
|
- c->curr_recovery_pass = c->next_recovery_pass;
|
|
- spin_unlock_irq(&c->recovery_pass_lock);
|
|
+void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_recovery *r = &c->recovery;
|
|
+
|
|
+ printbuf_tabstop_push(out, 32);
|
|
+ prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
|
|
+ prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
|
|
+ bch2_recovery_passes_match(PASS_ONLINE));
|
|
+ prt_passes(out, "Complete passes", r->passes_complete);
|
|
+ prt_passes(out, "Failing passes", r->passes_failing);
|
|
+
|
|
+ if (r->curr_pass) {
|
|
+ prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
|
|
+ prt_passes(out, "Current passes", r->passes_to_run);
|
|
}
|
|
+}
|
|
|
|
- return ret;
|
|
+void bch2_fs_recovery_passes_init(struct bch_fs *c)
|
|
+{
|
|
+ spin_lock_init(&c->recovery.lock);
|
|
+ sema_init(&c->recovery.run_lock, 1);
|
|
+
|
|
+ INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
|
|
}
|
|
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
|
|
index 7d7339c8fa29..dc0d2014ff9b 100644
|
|
--- a/fs/bcachefs/recovery_passes.h
|
|
+++ b/fs/bcachefs/recovery_passes.h
|
|
@@ -3,16 +3,32 @@
|
|
|
|
extern const char * const bch2_recovery_passes[];
|
|
|
|
+extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
|
|
+
|
|
u64 bch2_recovery_passes_to_stable(u64 v);
|
|
u64 bch2_recovery_passes_from_stable(u64 v);
|
|
|
|
u64 bch2_fsck_recovery_passes(void);
|
|
|
|
-int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
|
|
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
|
|
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
|
|
+enum bch_run_recovery_pass_flags {
|
|
+ RUN_RECOVERY_PASS_nopersistent = BIT(0),
|
|
+ RUN_RECOVERY_PASS_ratelimit = BIT(1),
|
|
+};
|
|
+
|
|
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
|
|
+
|
|
+int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
|
|
+ enum bch_recovery_pass,
|
|
+ enum bch_run_recovery_pass_flags);
|
|
+int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
|
|
+ enum bch_recovery_pass,
|
|
+ enum bch_run_recovery_pass_flags);
|
|
+
|
|
+int bch2_run_online_recovery_passes(struct bch_fs *, u64);
|
|
+int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
|
|
+
|
|
+void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
|
|
|
|
-int bch2_run_online_recovery_passes(struct bch_fs *);
|
|
-int bch2_run_recovery_passes(struct bch_fs *);
|
|
+void bch2_fs_recovery_passes_init(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_RECOVERY_PASSES_H */
|
|
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
|
|
new file mode 100644
|
|
index 000000000000..c434eafbca19
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/recovery_passes_format.h
|
|
@@ -0,0 +1,104 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
|
|
+#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
|
|
+
|
|
+#define PASS_SILENT BIT(0)
|
|
+#define PASS_FSCK BIT(1)
|
|
+#define PASS_UNCLEAN BIT(2)
|
|
+#define PASS_ALWAYS BIT(3)
|
|
+#define PASS_ONLINE BIT(4)
|
|
+#define PASS_ALLOC BIT(5)
|
|
+#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC)
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+#define PASS_FSCK_DEBUG BIT(1)
|
|
+#else
|
|
+#define PASS_FSCK_DEBUG 0
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Passes may be reordered, but the second field is a persistent identifier and
|
|
+ * must never change:
|
|
+ */
|
|
+#define BCH_RECOVERY_PASSES() \
|
|
+ x(recovery_pass_empty, 41, PASS_SILENT) \
|
|
+ x(scan_for_btree_nodes, 37, 0) \
|
|
+ x(check_topology, 4, 0) \
|
|
+ x(accounting_read, 39, PASS_ALWAYS) \
|
|
+ x(alloc_read, 0, PASS_ALWAYS) \
|
|
+ x(stripes_read, 1, 0) \
|
|
+ x(initialize_subvolumes, 2, 0) \
|
|
+ x(snapshots_read, 3, PASS_ALWAYS) \
|
|
+ x(check_allocations, 5, PASS_FSCK_ALLOC) \
|
|
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
|
|
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
|
|
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
|
|
+ x(journal_replay, 9, PASS_ALWAYS) \
|
|
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
|
|
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
|
|
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
|
|
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
|
|
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
|
|
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
|
|
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
|
|
+ x(bucket_gens_init, 17, 0) \
|
|
+ x(reconstruct_snapshots, 38, 0) \
|
|
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(fs_upgrade_for_subvolumes, 22, 0) \
|
|
+ x(check_inodes, 24, PASS_FSCK) \
|
|
+ x(check_extents, 25, PASS_FSCK) \
|
|
+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_dirents, 27, PASS_FSCK) \
|
|
+ x(check_xattrs, 28, PASS_FSCK) \
|
|
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_unreachable_inodes, 40, PASS_FSCK) \
|
|
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(check_nlinks, 31, PASS_FSCK) \
|
|
+ x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \
|
|
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
|
|
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
|
|
+ x(fix_reflink_p, 33, 0) \
|
|
+ x(set_fs_needs_rebalance, 34, 0) \
|
|
+ x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT)
|
|
+
|
|
+/* We normally enumerate recovery passes in the order we run them: */
|
|
+enum bch_recovery_pass {
|
|
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
|
|
+ BCH_RECOVERY_PASSES()
|
|
+#undef x
|
|
+ BCH_RECOVERY_PASS_NR
|
|
+};
|
|
+
|
|
+/* But we also need stable identifiers that can be used in the superblock */
|
|
+enum bch_recovery_pass_stable {
|
|
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
|
|
+ BCH_RECOVERY_PASSES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+struct recovery_pass_entry {
|
|
+ __le64 last_run;
|
|
+ __le32 last_runtime;
|
|
+ __le32 flags;
|
|
+};
|
|
+
|
|
+struct bch_sb_field_recovery_passes {
|
|
+ struct bch_sb_field field;
|
|
+ struct recovery_pass_entry start[];
|
|
+};
|
|
+
|
|
+static inline unsigned
|
|
+recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
|
|
+{
|
|
+ return r
|
|
+ ? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
|
|
+ sizeof(struct recovery_pass_entry))
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
|
|
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
|
|
index 418557960ed6..aa9526938cc3 100644
|
|
--- a/fs/bcachefs/recovery_passes_types.h
|
|
+++ b/fs/bcachefs/recovery_passes_types.h
|
|
@@ -2,79 +2,26 @@
|
|
#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
|
|
#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
|
|
|
|
-#define PASS_SILENT BIT(0)
|
|
-#define PASS_FSCK BIT(1)
|
|
-#define PASS_UNCLEAN BIT(2)
|
|
-#define PASS_ALWAYS BIT(3)
|
|
-#define PASS_ONLINE BIT(4)
|
|
-
|
|
-#ifdef CONFIG_BCACHEFS_DEBUG
|
|
-#define PASS_FSCK_DEBUG BIT(1)
|
|
-#else
|
|
-#define PASS_FSCK_DEBUG 0
|
|
-#endif
|
|
-
|
|
-/*
|
|
- * Passes may be reordered, but the second field is a persistent identifier and
|
|
- * must never change:
|
|
- */
|
|
-#define BCH_RECOVERY_PASSES() \
|
|
- x(recovery_pass_empty, 41, PASS_SILENT) \
|
|
- x(scan_for_btree_nodes, 37, 0) \
|
|
- x(check_topology, 4, 0) \
|
|
- x(accounting_read, 39, PASS_ALWAYS) \
|
|
- x(alloc_read, 0, PASS_ALWAYS) \
|
|
- x(stripes_read, 1, PASS_ALWAYS) \
|
|
- x(initialize_subvolumes, 2, 0) \
|
|
- x(snapshots_read, 3, PASS_ALWAYS) \
|
|
- x(check_allocations, 5, PASS_FSCK) \
|
|
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
|
|
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
|
|
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
|
|
- x(journal_replay, 9, PASS_ALWAYS) \
|
|
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
|
|
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
|
|
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
|
|
- x(bucket_gens_init, 17, 0) \
|
|
- x(reconstruct_snapshots, 38, 0) \
|
|
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
|
|
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
|
|
- x(fs_upgrade_for_subvolumes, 22, 0) \
|
|
- x(check_inodes, 24, PASS_FSCK) \
|
|
- x(check_extents, 25, PASS_FSCK) \
|
|
- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_dirents, 27, PASS_FSCK) \
|
|
- x(check_xattrs, 28, PASS_FSCK) \
|
|
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_unreachable_inodes, 40, PASS_FSCK) \
|
|
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
|
|
- x(check_nlinks, 31, PASS_FSCK) \
|
|
- x(resume_logged_ops, 23, PASS_ALWAYS) \
|
|
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
|
|
- x(fix_reflink_p, 33, 0) \
|
|
- x(set_fs_needs_rebalance, 34, 0)
|
|
-
|
|
-/* We normally enumerate recovery passes in the order we run them: */
|
|
-enum bch_recovery_pass {
|
|
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
|
|
- BCH_RECOVERY_PASSES()
|
|
-#undef x
|
|
- BCH_RECOVERY_PASS_NR
|
|
-};
|
|
-
|
|
-/* But we also need stable identifiers that can be used in the superblock */
|
|
-enum bch_recovery_pass_stable {
|
|
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
|
|
- BCH_RECOVERY_PASSES()
|
|
-#undef x
|
|
+struct bch_fs_recovery {
|
|
+ /*
|
|
+ * Two different uses:
|
|
+ * "Has this fsck pass?" - i.e. should this type of error be an
|
|
+ * emergency read-only
|
|
+ * And, in certain situations fsck will rewind to an earlier pass: used
|
|
+ * for signaling to the toplevel code which pass we want to run now.
|
|
+ */
|
|
+ enum bch_recovery_pass curr_pass;
|
|
+ enum bch_recovery_pass next_pass;
|
|
+ /* never rewinds version of curr_pass */
|
|
+ enum bch_recovery_pass pass_done;
|
|
+ u64 passes_to_run;
|
|
+ /* bitmask of recovery passes that we actually ran */
|
|
+ u64 passes_complete;
|
|
+ u64 passes_failing;
|
|
+ u64 passes_ratelimiting;
|
|
+ spinlock_t lock;
|
|
+ struct semaphore run_lock;
|
|
+ struct work_struct work;
|
|
};
|
|
|
|
#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
|
|
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
|
|
index 441e648f28b5..3a13dbcab6ba 100644
|
|
--- a/fs/bcachefs/reflink.c
|
|
+++ b/fs/bcachefs/reflink.c
|
|
@@ -3,6 +3,7 @@
|
|
#include "bkey_buf.h"
|
|
#include "btree_update.h"
|
|
#include "buckets.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "error.h"
|
|
#include "extents.h"
|
|
#include "inode.h"
|
|
@@ -185,12 +186,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
|
|
BUG_ON(missing_start < refd_start);
|
|
BUG_ON(missing_end > refd_end);
|
|
|
|
- if (fsck_err(trans, reflink_p_to_missing_reflink_v,
|
|
- "pointer to missing indirect extent\n"
|
|
- " %s\n"
|
|
- " missing range %llu-%llu",
|
|
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
|
|
- missing_start, missing_end)) {
|
|
+ struct bpos missing_pos = bkey_start_pos(p.k);
|
|
+ missing_pos.offset += missing_start - live_start;
|
|
+
|
|
+ prt_printf(&buf, "pointer to missing indirect extent in ");
|
|
+ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9);
|
|
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
|
|
+
|
|
+ prt_printf(&buf, "\nmissing reflink btree range %llu-%llu",
|
|
+ missing_start, missing_end);
|
|
+
|
|
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
|
|
struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
|
|
ret = PTR_ERR_OR_ZERO(new);
|
|
if (ret)
|
|
@@ -314,10 +324,10 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
|
|
__le64 *refcount = bkey_refcount(bkey_i_to_s(new));
|
|
if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
|
|
bch2_bkey_val_to_text(&buf, c, p.s_c);
|
|
- prt_printf(&buf, "\n ");
|
|
+ prt_newline(&buf);
|
|
bch2_bkey_val_to_text(&buf, c, k);
|
|
log_fsck_err(trans, reflink_refcount_underflow,
|
|
- "indirect extent refcount underflow while marking\n %s",
|
|
+ "indirect extent refcount underflow while marking\n%s",
|
|
buf.buf);
|
|
goto next;
|
|
}
|
|
@@ -486,7 +496,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
|
|
bool reflink_p_may_update_opts_field)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter reflink_iter = { NULL };
|
|
+ struct btree_iter reflink_iter = {};
|
|
struct bkey_s_c k;
|
|
struct bkey_i *r_v;
|
|
struct bkey_i_reflink_p *r_p;
|
|
@@ -498,7 +508,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
|
|
|
|
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
|
|
BTREE_ITER_intent);
|
|
- k = bch2_btree_iter_peek_prev(&reflink_iter);
|
|
+ k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -560,12 +570,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
-static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
|
|
+static struct bkey_s_c get_next_src(struct btree_trans *trans,
|
|
+ struct btree_iter *iter, struct bpos end)
|
|
{
|
|
struct bkey_s_c k;
|
|
int ret;
|
|
|
|
- for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
|
|
+ for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
|
|
if (bkey_extent_is_unwritten(k))
|
|
continue;
|
|
|
|
@@ -574,7 +585,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
|
|
}
|
|
|
|
if (bkey_ge(iter->pos, end))
|
|
- bch2_btree_iter_set_pos(iter, end);
|
|
+ bch2_btree_iter_set_pos(trans, iter, end);
|
|
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
|
|
}
|
|
|
|
@@ -597,10 +608,10 @@ s64 bch2_remap_range(struct bch_fs *c,
|
|
u64 dst_done = 0;
|
|
u32 dst_snapshot, src_snapshot;
|
|
bool reflink_p_may_update_opts_field =
|
|
- bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
|
|
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
|
|
int ret = 0, ret2 = 0;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
|
|
return -BCH_ERR_erofs_no_writes;
|
|
|
|
bch2_check_set_feature(c, BCH_FEATURE_reflink);
|
|
@@ -638,27 +649,27 @@ s64 bch2_remap_range(struct bch_fs *c,
|
|
if (ret)
|
|
continue;
|
|
|
|
- bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
|
|
|
|
ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
|
|
&dst_snapshot);
|
|
if (ret)
|
|
continue;
|
|
|
|
- bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
|
|
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
|
|
|
|
if (dst_inum.inum < src_inum.inum) {
|
|
/* Avoid some lock cycle transaction restarts */
|
|
- ret = bch2_btree_iter_traverse(&dst_iter);
|
|
+ ret = bch2_btree_iter_traverse(trans, &dst_iter);
|
|
if (ret)
|
|
continue;
|
|
}
|
|
|
|
dst_done = dst_iter.pos.offset - dst_start.offset;
|
|
src_want = POS(src_start.inode, src_start.offset + dst_done);
|
|
- bch2_btree_iter_set_pos(&src_iter, src_want);
|
|
+ bch2_btree_iter_set_pos(trans, &src_iter, src_want);
|
|
|
|
- src_k = get_next_src(&src_iter, src_end);
|
|
+ src_k = get_next_src(trans, &src_iter, src_end);
|
|
ret = bkey_err(src_k);
|
|
if (ret)
|
|
continue;
|
|
@@ -729,7 +740,7 @@ s64 bch2_remap_range(struct bch_fs *c,
|
|
|
|
do {
|
|
struct bch_inode_unpacked inode_u;
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter inode_iter = {};
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
@@ -751,7 +762,7 @@ s64 bch2_remap_range(struct bch_fs *c,
|
|
bch2_bkey_buf_exit(&new_src, c);
|
|
bch2_bkey_buf_exit(&new_dst, c);
|
|
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
|
|
|
|
return dst_done ?: ret ?: ret2;
|
|
}
|
|
@@ -786,8 +797,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
|
|
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
|
|
trans, reflink_v_refcount_wrong,
|
|
"reflink key has wrong refcount:\n"
|
|
- " %s\n"
|
|
- " should be %u",
|
|
+ "%s\n"
|
|
+ "should be %u",
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
|
|
r->refcount)) {
|
|
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
|
|
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
|
|
index 6992e7469112..2b4b8445d418 100644
|
|
--- a/fs/bcachefs/sb-counters.c
|
|
+++ b/fs/bcachefs/sb-counters.c
|
|
@@ -5,7 +5,13 @@
|
|
|
|
/* BCH_SB_FIELD_counters */
|
|
|
|
-static const char * const bch2_counter_names[] = {
|
|
+static const u8 counters_to_stable_map[] = {
|
|
+#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
|
|
+ BCH_PERSISTENT_COUNTERS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+const char * const bch2_counter_names[] = {
|
|
#define x(t, n, ...) (#t),
|
|
BCH_PERSISTENT_COUNTERS()
|
|
#undef x
|
|
@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
|
|
return 0;
|
|
|
|
return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
|
|
-};
|
|
+}
|
|
|
|
static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
|
|
enum bch_validate_flags flags, struct printbuf *err)
|
|
{
|
|
return 0;
|
|
-};
|
|
+}
|
|
|
|
static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
|
|
struct bch_sb_field *f)
|
|
@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
|
|
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
|
|
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
|
|
|
|
- for (unsigned i = 0; i < nr; i++)
|
|
- prt_printf(out, "%s \t%llu\n",
|
|
- i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
|
|
- le64_to_cpu(ctrs->d[i]));
|
|
-};
|
|
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
|
|
+ unsigned stable = counters_to_stable_map[i];
|
|
+ if (stable < nr)
|
|
+ prt_printf(out, "%s \t%llu\n",
|
|
+ bch2_counter_names[i],
|
|
+ le64_to_cpu(ctrs->d[stable]));
|
|
+ }
|
|
+}
|
|
|
|
int bch2_sb_counters_to_cpu(struct bch_fs *c)
|
|
{
|
|
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
|
|
- unsigned int i;
|
|
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
|
|
- u64 val = 0;
|
|
|
|
- for (i = 0; i < BCH_COUNTER_NR; i++)
|
|
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
|
|
c->counters_on_mount[i] = 0;
|
|
|
|
- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
|
|
- val = le64_to_cpu(ctrs->d[i]);
|
|
- percpu_u64_set(&c->counters[i], val);
|
|
- c->counters_on_mount[i] = val;
|
|
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
|
|
+ unsigned stable = counters_to_stable_map[i];
|
|
+ if (stable < nr) {
|
|
+ u64 v = le64_to_cpu(ctrs->d[stable]);
|
|
+ percpu_u64_set(&c->counters[i], v);
|
|
+ c->counters_on_mount[i] = v;
|
|
+ }
|
|
}
|
|
+
|
|
return 0;
|
|
-};
|
|
+}
|
|
|
|
int bch2_sb_counters_from_cpu(struct bch_fs *c)
|
|
{
|
|
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
|
|
struct bch_sb_field_counters *ret;
|
|
- unsigned int i;
|
|
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
|
|
|
|
if (nr < BCH_COUNTER_NR) {
|
|
ret = bch2_sb_field_resize(&c->disk_sb, counters,
|
|
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
|
|
-
|
|
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
|
|
if (ret) {
|
|
ctrs = ret;
|
|
nr = bch2_sb_counter_nr_entries(ctrs);
|
|
}
|
|
}
|
|
|
|
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
|
|
+ unsigned stable = counters_to_stable_map[i];
|
|
+ if (stable < nr)
|
|
+ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
|
|
+ }
|
|
|
|
- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
|
|
- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
|
|
return 0;
|
|
}
|
|
|
|
@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = {
|
|
.validate = bch2_sb_counters_validate,
|
|
.to_text = bch2_sb_counters_to_text,
|
|
};
|
|
+
|
|
+#ifndef NO_BCACHEFS_CHARDEV
|
|
+long bch2_ioctl_query_counters(struct bch_fs *c,
|
|
+ struct bch_ioctl_query_counters __user *user_arg)
|
|
+{
|
|
+ struct bch_ioctl_query_counters arg;
|
|
+ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
|
|
+ arg.pad)
|
|
+ return -EINVAL;
|
|
+
|
|
+ arg.nr = min(arg.nr, BCH_COUNTER_NR);
|
|
+ ret = put_user(arg.nr, &user_arg->nr);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
|
|
+ unsigned stable = counters_to_stable_map[i];
|
|
+
|
|
+ if (stable < arg.nr) {
|
|
+ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
|
|
+ ? percpu_u64_get(&c->counters[i])
|
|
+ : c->counters_on_mount[i];
|
|
+
|
|
+ ret = put_user(v, &user_arg->d[stable]);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+#endif
|
|
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
|
|
index 81f8aec9fcb1..a4329ad8dd1b 100644
|
|
--- a/fs/bcachefs/sb-counters.h
|
|
+++ b/fs/bcachefs/sb-counters.h
|
|
@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *);
|
|
void bch2_fs_counters_exit(struct bch_fs *);
|
|
int bch2_fs_counters_init(struct bch_fs *);
|
|
|
|
+extern const char * const bch2_counter_names[];
|
|
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
|
|
|
|
+long bch2_ioctl_query_counters(struct bch_fs *,
|
|
+ struct bch_ioctl_query_counters __user *);
|
|
+
|
|
#endif // _BCACHEFS_SB_COUNTERS_H
|
|
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
|
|
index fdcf598f08b1..7c0c9c842b4e 100644
|
|
--- a/fs/bcachefs/sb-counters_format.h
|
|
+++ b/fs/bcachefs/sb-counters_format.h
|
|
@@ -9,10 +9,26 @@ enum counters_flags {
|
|
|
|
#define BCH_PERSISTENT_COUNTERS() \
|
|
x(io_read, 0, TYPE_SECTORS) \
|
|
+ x(io_read_inline, 80, TYPE_SECTORS) \
|
|
+ x(io_read_hole, 81, TYPE_SECTORS) \
|
|
+ x(io_read_promote, 30, TYPE_COUNTER) \
|
|
+ x(io_read_bounce, 31, TYPE_COUNTER) \
|
|
+ x(io_read_split, 33, TYPE_COUNTER) \
|
|
+ x(io_read_reuse_race, 34, TYPE_COUNTER) \
|
|
+ x(io_read_retry, 32, TYPE_COUNTER) \
|
|
+ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
|
|
x(io_write, 1, TYPE_SECTORS) \
|
|
x(io_move, 2, TYPE_SECTORS) \
|
|
+ x(io_move_read, 35, TYPE_SECTORS) \
|
|
+ x(io_move_write, 36, TYPE_SECTORS) \
|
|
+ x(io_move_finish, 37, TYPE_SECTORS) \
|
|
+ x(io_move_fail, 38, TYPE_COUNTER) \
|
|
+ x(io_move_write_fail, 82, TYPE_COUNTER) \
|
|
+ x(io_move_start_fail, 39, TYPE_COUNTER) \
|
|
+ x(io_move_created_rebalance, 83, TYPE_COUNTER) \
|
|
x(bucket_invalidate, 3, TYPE_COUNTER) \
|
|
x(bucket_discard, 4, TYPE_COUNTER) \
|
|
+ x(bucket_discard_fast, 79, TYPE_COUNTER) \
|
|
x(bucket_alloc, 5, TYPE_COUNTER) \
|
|
x(bucket_alloc_fail, 6, TYPE_COUNTER) \
|
|
x(btree_cache_scan, 7, TYPE_COUNTER) \
|
|
@@ -38,16 +54,6 @@ enum counters_flags {
|
|
x(journal_reclaim_finish, 27, TYPE_COUNTER) \
|
|
x(journal_reclaim_start, 28, TYPE_COUNTER) \
|
|
x(journal_write, 29, TYPE_COUNTER) \
|
|
- x(read_promote, 30, TYPE_COUNTER) \
|
|
- x(read_bounce, 31, TYPE_COUNTER) \
|
|
- x(read_split, 33, TYPE_COUNTER) \
|
|
- x(read_retry, 32, TYPE_COUNTER) \
|
|
- x(read_reuse_race, 34, TYPE_COUNTER) \
|
|
- x(move_extent_read, 35, TYPE_SECTORS) \
|
|
- x(move_extent_write, 36, TYPE_SECTORS) \
|
|
- x(move_extent_finish, 37, TYPE_SECTORS) \
|
|
- x(move_extent_fail, 38, TYPE_COUNTER) \
|
|
- x(move_extent_start_fail, 39, TYPE_COUNTER) \
|
|
x(copygc, 40, TYPE_COUNTER) \
|
|
x(copygc_wait, 41, TYPE_COUNTER) \
|
|
x(gc_gens_end, 42, TYPE_COUNTER) \
|
|
@@ -95,6 +101,13 @@ enum bch_persistent_counters {
|
|
BCH_COUNTER_NR
|
|
};
|
|
|
|
+enum bch_persistent_counters_stable {
|
|
+#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
|
|
+ BCH_PERSISTENT_COUNTERS()
|
|
+#undef x
|
|
+ BCH_COUNTER_STABLE_NR
|
|
+};
|
|
+
|
|
struct bch_sb_field_counters {
|
|
struct bch_sb_field field;
|
|
__le64 d[];
|
|
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
|
|
index 051214fdc735..d4e22c43eeb2 100644
|
|
--- a/fs/bcachefs/sb-downgrade.c
|
|
+++ b/fs/bcachefs/sb-downgrade.c
|
|
@@ -6,12 +6,13 @@
|
|
*/
|
|
|
|
#include "bcachefs.h"
|
|
-#include "darray.h"
|
|
#include "recovery_passes.h"
|
|
#include "sb-downgrade.h"
|
|
#include "sb-errors.h"
|
|
#include "super-io.h"
|
|
|
|
+#include <linux/darray.h>
|
|
+
|
|
#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
|
|
|
|
/*
|
|
@@ -20,6 +21,10 @@
|
|
* x(version, recovery_passes, errors...)
|
|
*/
|
|
#define UPGRADE_TABLE() \
|
|
+ x(snapshot_2, \
|
|
+ RECOVERY_PASS_ALL_FSCK, \
|
|
+ BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \
|
|
+ BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \
|
|
x(backpointers, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(inode_v3, \
|
|
@@ -90,7 +95,17 @@
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_accounting_mismatch, \
|
|
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
|
|
- BCH_FSCK_ERR_accounting_key_junk_at_end)
|
|
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
|
|
+ x(cached_backpointers, \
|
|
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
|
|
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
|
|
+ x(stripe_backpointers, \
|
|
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
|
|
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
|
|
+ x(inode_has_case_insensitive, \
|
|
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
|
|
+ BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \
|
|
+ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
|
|
|
|
#define DOWNGRADE_TABLE() \
|
|
x(bucket_stripe_sectors, \
|
|
@@ -364,6 +379,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
|
|
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
|
|
continue;
|
|
|
|
+ if (src->version < c->sb.version_incompat)
|
|
+ continue;
|
|
+
|
|
struct bch_sb_field_downgrade_entry *dst;
|
|
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
|
|
|
|
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
|
|
index b86ec013d7d7..4036a20c6adc 100644
|
|
--- a/fs/bcachefs/sb-errors_format.h
|
|
+++ b/fs/bcachefs/sb-errors_format.h
|
|
@@ -5,8 +5,7 @@
|
|
enum bch_fsck_flags {
|
|
FSCK_CAN_FIX = 1 << 0,
|
|
FSCK_CAN_IGNORE = 1 << 1,
|
|
- FSCK_NO_RATELIMIT = 1 << 2,
|
|
- FSCK_AUTOFIX = 1 << 3,
|
|
+ FSCK_AUTOFIX = 1 << 2,
|
|
};
|
|
|
|
#define BCH_SB_ERRS() \
|
|
@@ -47,7 +46,7 @@ enum bch_fsck_flags {
|
|
x(btree_node_unsupported_version, 34, 0) \
|
|
x(btree_node_bset_older_than_sb_min, 35, 0) \
|
|
x(btree_node_bset_newer_than_sb, 36, 0) \
|
|
- x(btree_node_data_missing, 37, 0) \
|
|
+ x(btree_node_data_missing, 37, FSCK_AUTOFIX) \
|
|
x(btree_node_bset_after_end, 38, 0) \
|
|
x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
|
|
x(btree_node_replicas_data_mismatch, 40, 0) \
|
|
@@ -179,6 +178,7 @@ enum bch_fsck_flags {
|
|
x(ptr_crc_redundant, 160, 0) \
|
|
x(ptr_crc_nonce_mismatch, 162, 0) \
|
|
x(ptr_stripe_redundant, 163, 0) \
|
|
+ x(extent_flags_not_at_start, 306, 0) \
|
|
x(reservation_key_nr_replicas_invalid, 164, 0) \
|
|
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
|
|
x(reflink_v_pos_bad, 292, 0) \
|
|
@@ -205,10 +205,11 @@ enum bch_fsck_flags {
|
|
x(snapshot_bad_depth, 184, 0) \
|
|
x(snapshot_bad_skiplist, 185, 0) \
|
|
x(subvol_pos_bad, 186, 0) \
|
|
- x(subvol_not_master_and_not_snapshot, 187, 0) \
|
|
+ x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \
|
|
x(subvol_to_missing_root, 188, 0) \
|
|
- x(subvol_root_wrong_bi_subvol, 189, 0) \
|
|
+ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
|
|
x(bkey_in_missing_snapshot, 190, 0) \
|
|
+ x(bkey_in_deleted_snapshot, 315, 0) \
|
|
x(inode_pos_inode_nonzero, 191, 0) \
|
|
x(inode_pos_blockdev_range, 192, 0) \
|
|
x(inode_alloc_cursor_inode_bad, 301, 0) \
|
|
@@ -216,6 +217,7 @@ enum bch_fsck_flags {
|
|
x(inode_str_hash_invalid, 194, 0) \
|
|
x(inode_v3_fields_start_bad, 195, 0) \
|
|
x(inode_snapshot_mismatch, 196, 0) \
|
|
+ x(snapshot_key_missing_inode_snapshot, 314, 0) \
|
|
x(inode_unlinked_but_clean, 197, 0) \
|
|
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
|
|
x(inode_unlinked_and_not_open, 281, 0) \
|
|
@@ -236,6 +238,11 @@ enum bch_fsck_flags {
|
|
x(inode_has_child_snapshots_wrong, 287, 0) \
|
|
x(inode_unreachable, 210, FSCK_AUTOFIX) \
|
|
x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
|
|
+ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
|
|
+ x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \
|
|
+ x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
|
|
+ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
|
|
+ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
|
|
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
|
|
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
|
|
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
|
|
@@ -259,6 +266,7 @@ enum bch_fsck_flags {
|
|
x(dirent_to_overwritten_inode, 302, 0) \
|
|
x(dirent_to_missing_subvol, 230, 0) \
|
|
x(dirent_to_itself, 231, 0) \
|
|
+ x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \
|
|
x(quota_type_invalid, 232, 0) \
|
|
x(xattr_val_size_too_small, 233, 0) \
|
|
x(xattr_val_size_too_big, 234, 0) \
|
|
@@ -290,14 +298,15 @@ enum bch_fsck_flags {
|
|
x(btree_node_bkey_bad_u64s, 260, 0) \
|
|
x(btree_node_topology_empty_interior_node, 261, 0) \
|
|
x(btree_ptr_v2_min_key_bad, 262, 0) \
|
|
- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
|
|
- x(snapshot_node_missing, 264, 0) \
|
|
+ x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \
|
|
+ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
|
|
x(dup_backpointer_to_bad_csum_extent, 265, 0) \
|
|
x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
|
|
x(sb_clean_entry_overrun, 267, 0) \
|
|
x(btree_ptr_v2_written_0, 268, 0) \
|
|
x(subvol_snapshot_bad, 269, 0) \
|
|
x(subvol_inode_bad, 270, 0) \
|
|
+ x(subvol_missing, 308, FSCK_AUTOFIX) \
|
|
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
|
|
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
|
|
x(accounting_replicas_not_marked, 273, 0) \
|
|
@@ -310,11 +319,16 @@ enum bch_fsck_flags {
|
|
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
|
|
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
|
|
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
|
|
+ x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
|
|
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
|
|
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
|
|
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
|
|
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
|
|
- x(MAX, 304, 0)
|
|
+ x(dirent_cf_name_too_big, 304, 0) \
|
|
+ x(dirent_stray_data_after_cf_name, 305, 0) \
|
|
+ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
|
|
+ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
|
|
+ x(MAX, 319, 0)
|
|
|
|
enum bch_sb_error_id {
|
|
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
|
|
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
|
|
index 40325239c3b0..3b28871d23ed 100644
|
|
--- a/fs/bcachefs/sb-errors_types.h
|
|
+++ b/fs/bcachefs/sb-errors_types.h
|
|
@@ -2,7 +2,7 @@
|
|
#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
|
|
#define _BCACHEFS_SB_ERRORS_TYPES_H
|
|
|
|
-#include "darray.h"
|
|
+#include <linux/darray_types.h>
|
|
|
|
struct bch_sb_error_entry_cpu {
|
|
u64 id:16,
|
|
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
|
|
index 116131f95815..3398906660a5 100644
|
|
--- a/fs/bcachefs/sb-members.c
|
|
+++ b/fs/bcachefs/sb-members.c
|
|
@@ -5,19 +5,41 @@
|
|
#include "disk_groups.h"
|
|
#include "error.h"
|
|
#include "opts.h"
|
|
+#include "recovery_passes.h"
|
|
#include "replicas.h"
|
|
#include "sb-members.h"
|
|
#include "super-io.h"
|
|
|
|
-void bch2_dev_missing(struct bch_fs *c, unsigned dev)
|
|
+int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
|
|
+ bch2_bkey_val_to_text(&buf, c, k);
|
|
+
|
|
+ bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
|
|
+
|
|
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
|
|
+ BCH_RECOVERY_PASS_check_allocations, 0);
|
|
+
|
|
+ if (print)
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
|
|
{
|
|
if (dev != BCH_SB_MEMBER_INVALID)
|
|
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
|
|
}
|
|
|
|
-void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
|
|
+void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
|
|
{
|
|
- bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
|
|
+ bch2_fs_inconsistent(ca->fs,
|
|
+ "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
|
|
+ bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
|
|
}
|
|
|
|
#define x(t, n, ...) [n] = #t,
|
|
@@ -117,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
|
|
struct bch_sb_field_members_v1 *mi1;
|
|
struct bch_sb_field_members_v2 *mi2;
|
|
|
|
+ if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
|
|
+ bch2_sb_field_resize(disk_sb, members_v1, 0);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
mi1 = bch2_sb_field_resize(disk_sb, members_v1,
|
|
DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
|
|
disk_sb->sb->nr_devices, sizeof(u64)));
|
|
@@ -168,6 +195,12 @@ static int validate_member(struct printbuf *err,
|
|
return -BCH_ERR_invalid_sb_members;
|
|
}
|
|
|
|
+ if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
|
|
+ sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
|
|
+ prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
|
|
+ return -BCH_ERR_invalid_sb_members;
|
|
+ }
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
@@ -189,17 +222,11 @@ static void member_to_text(struct printbuf *out,
|
|
printbuf_indent_add(out, 2);
|
|
|
|
prt_printf(out, "Label:\t");
|
|
- if (BCH_MEMBER_GROUP(&m)) {
|
|
- unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
|
|
-
|
|
- if (idx < disk_groups_nr(gi))
|
|
- prt_printf(out, "%s (%u)",
|
|
- gi->entries[idx].label, idx);
|
|
- else
|
|
- prt_printf(out, "(bad disk labels section)");
|
|
- } else {
|
|
+ if (BCH_MEMBER_GROUP(&m))
|
|
+ bch2_disk_path_to_text_sb(out, sb,
|
|
+ BCH_MEMBER_GROUP(&m) - 1);
|
|
+ else
|
|
prt_printf(out, "(none)");
|
|
- }
|
|
prt_newline(out);
|
|
|
|
prt_printf(out, "UUID:\t");
|
|
@@ -266,6 +293,7 @@ static void member_to_text(struct printbuf *out,
|
|
|
|
prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
|
|
prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
|
|
+ prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
|
|
|
|
printbuf_indent_sub(out, 2);
|
|
}
|
|
@@ -491,6 +519,7 @@ int bch2_sb_member_alloc(struct bch_fs *c)
|
|
unsigned u64s;
|
|
int best = -1;
|
|
u64 best_last_mount = 0;
|
|
+ unsigned nr_deleted = 0;
|
|
|
|
if (dev_idx < BCH_SB_MEMBERS_MAX)
|
|
goto have_slot;
|
|
@@ -501,7 +530,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
|
|
continue;
|
|
|
|
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
|
|
- if (bch2_member_alive(&m))
|
|
+
|
|
+ nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
|
|
+
|
|
+ if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
|
|
continue;
|
|
|
|
u64 last_mount = le64_to_cpu(m.last_mount);
|
|
@@ -515,6 +547,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
|
|
goto have_slot;
|
|
}
|
|
|
|
+ if (nr_deleted)
|
|
+ bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
|
|
+ nr_deleted);
|
|
+
|
|
return -BCH_ERR_ENOSPC_sb_members;
|
|
have_slot:
|
|
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
|
|
@@ -530,3 +566,22 @@ int bch2_sb_member_alloc(struct bch_fs *c)
|
|
c->disk_sb.sb->nr_devices = nr_devices;
|
|
return dev_idx;
|
|
}
|
|
+
|
|
+void bch2_sb_members_clean_deleted(struct bch_fs *c)
|
|
+{
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ bool write_sb = false;
|
|
+
|
|
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
|
|
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
|
|
+
|
|
+ if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
|
|
+ memset(&m->uuid, 0, sizeof(m->uuid));
|
|
+ write_sb = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (write_sb)
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
|
|
index 762083b564ee..09b751a75020 100644
|
|
--- a/fs/bcachefs/sb-members.h
|
|
+++ b/fs/bcachefs/sb-members.h
|
|
@@ -2,8 +2,10 @@
|
|
#ifndef _BCACHEFS_SB_MEMBERS_H
|
|
#define _BCACHEFS_SB_MEMBERS_H
|
|
|
|
-#include "darray.h"
|
|
#include "bkey_types.h"
|
|
+#include "enumerated_ref.h"
|
|
+
|
|
+#include <linux/darray.h>
|
|
|
|
extern char * const bch2_member_error_strs[];
|
|
|
|
@@ -20,10 +22,22 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
|
|
|
|
static inline bool bch2_dev_is_online(struct bch_dev *ca)
|
|
{
|
|
- return !percpu_ref_is_zero(&ca->io_ref);
|
|
+ return !enumerated_ref_is_zero(&ca->io_ref[READ]);
|
|
+}
|
|
+
|
|
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
|
|
+
|
|
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
|
|
+{
|
|
+ rcu_read_lock();
|
|
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
|
|
+ bool ret = ca && bch2_dev_is_online(ca);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return ret;
|
|
}
|
|
|
|
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
|
|
+static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
|
|
{
|
|
return bch2_dev_is_online(ca) &&
|
|
ca->mi.state != BCH_MEMBER_STATE_failed;
|
|
@@ -92,6 +106,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
|
|
for (struct bch_dev *_ca = NULL; \
|
|
(_ca = __bch2_next_dev((_c), _ca, (_mask)));)
|
|
|
|
+#define for_each_online_member_rcu(_c, _ca) \
|
|
+ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs)
|
|
+
|
|
+#define for_each_rw_member_rcu(_c, _ca) \
|
|
+ for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free])
|
|
+
|
|
static inline void bch2_dev_get(struct bch_dev *ca)
|
|
{
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
@@ -144,33 +164,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev
|
|
|
|
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
|
|
struct bch_dev *ca,
|
|
- unsigned state_mask)
|
|
+ unsigned state_mask,
|
|
+ int rw, unsigned ref_idx)
|
|
{
|
|
rcu_read_lock();
|
|
if (ca)
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
|
|
|
|
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
|
|
(!((1 << ca->mi.state) & state_mask) ||
|
|
- !percpu_ref_tryget(&ca->io_ref)))
|
|
+ !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
|
|
;
|
|
rcu_read_unlock();
|
|
|
|
return ca;
|
|
}
|
|
|
|
-#define __for_each_online_member(_c, _ca, state_mask) \
|
|
+#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \
|
|
for (struct bch_dev *_ca = NULL; \
|
|
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
|
|
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));)
|
|
|
|
-#define for_each_online_member(c, ca) \
|
|
- __for_each_online_member(c, ca, ~0)
|
|
+#define for_each_online_member(c, ca, ref_idx) \
|
|
+ __for_each_online_member(c, ca, ~0, READ, ref_idx)
|
|
|
|
-#define for_each_rw_member(c, ca) \
|
|
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
|
|
+#define for_each_rw_member(c, ca, ref_idx) \
|
|
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx)
|
|
|
|
-#define for_each_readable_member(c, ca) \
|
|
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
|
|
+#define for_each_readable_member(c, ca, ref_idx) \
|
|
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx)
|
|
|
|
static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
|
|
{
|
|
@@ -205,13 +226,15 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de
|
|
: NULL;
|
|
}
|
|
|
|
-void bch2_dev_missing(struct bch_fs *, unsigned);
|
|
+int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned);
|
|
+
|
|
+void bch2_dev_missing_atomic(struct bch_fs *, unsigned);
|
|
|
|
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
|
|
{
|
|
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
|
|
if (unlikely(!ca))
|
|
- bch2_dev_missing(c, dev);
|
|
+ bch2_dev_missing_atomic(c, dev);
|
|
return ca;
|
|
}
|
|
|
|
@@ -229,27 +252,30 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
|
|
{
|
|
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
|
|
if (unlikely(!ca))
|
|
- bch2_dev_missing(c, dev);
|
|
+ bch2_dev_missing_atomic(c, dev);
|
|
return ca;
|
|
}
|
|
|
|
static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
|
|
{
|
|
struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
|
|
- if (ca && !bucket_valid(ca, bucket.offset)) {
|
|
+ if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
|
|
bch2_dev_put(ca);
|
|
ca = NULL;
|
|
}
|
|
return ca;
|
|
}
|
|
|
|
-void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
|
|
+void bch2_dev_bucket_missing(struct bch_dev *, u64);
|
|
|
|
static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
|
|
{
|
|
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
|
|
- if (!ca)
|
|
- bch2_dev_bucket_missing(c, bucket);
|
|
+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
|
|
+ if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
|
|
+ bch2_dev_bucket_missing(ca, bucket.offset);
|
|
+ bch2_dev_put(ca);
|
|
+ ca = NULL;
|
|
+ }
|
|
return ca;
|
|
}
|
|
|
|
@@ -269,11 +295,14 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
|
|
return bch2_dev_tryget(c, dev_idx);
|
|
}
|
|
|
|
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
|
|
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
|
|
+ int rw, unsigned ref_idx)
|
|
{
|
|
+ might_sleep();
|
|
+
|
|
rcu_read_lock();
|
|
struct bch_dev *ca = bch2_dev_rcu(c, dev);
|
|
- if (ca && !percpu_ref_tryget(&ca->io_ref))
|
|
+ if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
|
|
ca = NULL;
|
|
rcu_read_unlock();
|
|
|
|
@@ -283,27 +312,17 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
|
|
return ca;
|
|
|
|
if (ca)
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
|
|
return NULL;
|
|
}
|
|
|
|
-/* XXX kill, move to struct bch_fs */
|
|
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
|
|
-{
|
|
- struct bch_devs_mask devs;
|
|
-
|
|
- memset(&devs, 0, sizeof(devs));
|
|
- for_each_online_member(c, ca)
|
|
- __set_bit(ca->dev_idx, devs.d);
|
|
- return devs;
|
|
-}
|
|
-
|
|
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
|
|
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
|
|
|
|
static inline bool bch2_member_alive(struct bch_member *m)
|
|
{
|
|
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
|
|
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
|
|
+ !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
|
|
}
|
|
|
|
static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
|
|
@@ -333,6 +352,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
|
|
? BCH_MEMBER_DURABILITY(mi) - 1
|
|
: 1,
|
|
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
|
|
+ .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi),
|
|
.valid = bch2_member_alive(mi),
|
|
.btree_bitmap_shift = mi->btree_bitmap_shift,
|
|
.btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
|
|
@@ -363,5 +383,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
|
|
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
|
|
|
|
int bch2_sb_member_alloc(struct bch_fs *);
|
|
+void bch2_sb_members_clean_deleted(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_SB_MEMBERS_H */
|
|
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
|
|
index 2adf1221a440..fb72ad730518 100644
|
|
--- a/fs/bcachefs/sb-members_format.h
|
|
+++ b/fs/bcachefs/sb-members_format.h
|
|
@@ -13,6 +13,10 @@
|
|
*/
|
|
#define BCH_SB_MEMBER_INVALID 255
|
|
|
|
+#define BCH_SB_MEMBER_DELETED_UUID \
|
|
+ UUID_INIT(0xffffffff, 0xffff, 0xffff, \
|
|
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
|
|
+
|
|
#define BCH_MIN_NR_NBUCKETS (1 << 6)
|
|
|
|
#define BCH_IOPS_MEASUREMENTS() \
|
|
@@ -79,6 +83,7 @@ struct bch_member {
|
|
|
|
#define BCH_MEMBER_V1_BYTES 56
|
|
|
|
+LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
|
|
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
|
|
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
|
|
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
|
|
@@ -87,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
|
|
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
|
|
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
|
|
struct bch_member, flags, 30, 31)
|
|
+LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT,
|
|
+ struct bch_member, flags, 31, 32)
|
|
|
|
#if 0
|
|
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
|
|
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
|
|
index c0eda888fe39..d6443e186872 100644
|
|
--- a/fs/bcachefs/sb-members_types.h
|
|
+++ b/fs/bcachefs/sb-members_types.h
|
|
@@ -13,6 +13,7 @@ struct bch_member_cpu {
|
|
u8 data_allowed;
|
|
u8 durability;
|
|
u8 freespace_initialized;
|
|
+ u8 resize_on_mount;
|
|
u8 valid;
|
|
u8 btree_bitmap_shift;
|
|
u64 btree_allocated_bitmap;
|
|
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
|
|
index c54091a28909..00d62d1190ef 100644
|
|
--- a/fs/bcachefs/snapshot.c
|
|
+++ b/fs/bcachefs/snapshot.c
|
|
@@ -1,11 +1,13 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include "bcachefs.h"
|
|
+#include "bbpos.h"
|
|
#include "bkey_buf.h"
|
|
#include "btree_cache.h"
|
|
#include "btree_key_cache.h"
|
|
#include "btree_update.h"
|
|
#include "buckets.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "errcode.h"
|
|
#include "error.h"
|
|
#include "fs.h"
|
|
@@ -141,13 +143,14 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
|
|
rcu_read_lock();
|
|
struct snapshot_table *t = rcu_dereference(c->snapshots);
|
|
|
|
- if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
|
|
+ if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
|
|
ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
|
|
goto out;
|
|
}
|
|
|
|
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
|
|
- id = get_ancestor_below(t, id, ancestor);
|
|
+ if (likely(ancestor >= IS_ANCESTOR_BITMAP))
|
|
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
|
|
+ id = get_ancestor_below(t, id, ancestor);
|
|
|
|
ret = id && id < ancestor
|
|
? test_ancestor_bitmap(t, id, ancestor)
|
|
@@ -208,9 +211,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
|
|
{
|
|
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
|
|
|
|
- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
|
|
- BCH_SNAPSHOT_SUBVOL(s.v),
|
|
- BCH_SNAPSHOT_DELETED(s.v),
|
|
+ if (BCH_SNAPSHOT_SUBVOL(s.v))
|
|
+ prt_str(out, "subvol ");
|
|
+ if (BCH_SNAPSHOT_WILL_DELETE(s.v))
|
|
+ prt_str(out, "will_delete ");
|
|
+ if (BCH_SNAPSHOT_DELETED(s.v))
|
|
+ prt_str(out, "deleted ");
|
|
+
|
|
+ prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
|
|
le32_to_cpu(s.v->parent),
|
|
le32_to_cpu(s.v->children[0]),
|
|
le32_to_cpu(s.v->children[1]),
|
|
@@ -280,6 +288,16 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
return ret;
|
|
}
|
|
|
|
+static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
|
|
+{
|
|
+ mutex_lock(&c->snapshot_table_lock);
|
|
+ int ret = snapshot_t_mut(c, id)
|
|
+ ? 0
|
|
+ : -BCH_ERR_ENOMEM_mark_snapshot;
|
|
+ mutex_unlock(&c->snapshot_table_lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static int __bch2_mark_snapshot(struct btree_trans *trans,
|
|
enum btree_id btree, unsigned level,
|
|
struct bkey_s_c old, struct bkey_s_c new,
|
|
@@ -301,7 +319,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
|
|
if (new.k->type == KEY_TYPE_snapshot) {
|
|
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
|
|
|
|
- t->live = true;
|
|
+ t->state = !BCH_SNAPSHOT_DELETED(s.v)
|
|
+ ? SNAPSHOT_ID_live
|
|
+ : SNAPSHOT_ID_deleted;
|
|
t->parent = le32_to_cpu(s.v->parent);
|
|
t->children[0] = le32_to_cpu(s.v->children[0]);
|
|
t->children[1] = le32_to_cpu(s.v->children[1]);
|
|
@@ -326,9 +346,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
|
|
parent - id - 1 < IS_ANCESTOR_BITMAP)
|
|
__set_bit(parent - id - 1, t->is_ancestor);
|
|
|
|
- if (BCH_SNAPSHOT_DELETED(s.v)) {
|
|
+ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
|
|
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
|
|
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
|
|
+ if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
|
|
bch2_delete_dead_snapshots_async(c);
|
|
}
|
|
} else {
|
|
@@ -389,22 +409,31 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
|
|
return 0;
|
|
}
|
|
|
|
-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
|
|
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
|
|
+ snapshot_id_list *skip)
|
|
{
|
|
- u32 id = snapshot_root;
|
|
- u32 subvol = 0, s;
|
|
-
|
|
+ u32 id, subvol = 0, s;
|
|
+retry:
|
|
+ id = snapshot_root;
|
|
rcu_read_lock();
|
|
- while (id) {
|
|
- s = snapshot_t(c, id)->subvol;
|
|
-
|
|
- if (s && (!subvol || s < subvol))
|
|
- subvol = s;
|
|
+ while (id && bch2_snapshot_exists(c, id)) {
|
|
+ if (!(skip && snapshot_list_has_id(skip, id))) {
|
|
+ s = snapshot_t(c, id)->subvol;
|
|
|
|
+ if (s && (!subvol || s < subvol))
|
|
+ subvol = s;
|
|
+ }
|
|
id = bch2_snapshot_tree_next(c, id);
|
|
+ if (id == snapshot_root)
|
|
+ break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
+ if (!subvol && skip) {
|
|
+ skip = NULL;
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
return subvol;
|
|
}
|
|
|
|
@@ -436,7 +465,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
|
|
if (!ret && !found) {
|
|
struct bkey_i_subvolume *u;
|
|
|
|
- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
|
|
+ *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);
|
|
|
|
u = bch2_bkey_get_mut_typed(trans, &iter,
|
|
BTREE_ID_subvolumes, POS(0, *subvol_id),
|
|
@@ -484,7 +513,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
|
|
root_id != bch2_snapshot_root(c, root_id) ||
|
|
st.k->p.offset != le32_to_cpu(s.tree),
|
|
trans, snapshot_tree_to_missing_snapshot,
|
|
- "snapshot tree points to missing/incorrect snapshot:\n %s",
|
|
+ "snapshot tree points to missing/incorrect snapshot:\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, st.s_c),
|
|
prt_newline(&buf),
|
|
ret
|
|
@@ -504,19 +533,19 @@ static int check_snapshot_tree(struct btree_trans *trans,
|
|
|
|
if (fsck_err_on(ret,
|
|
trans, snapshot_tree_to_missing_subvol,
|
|
- "snapshot tree points to missing subvolume:\n %s",
|
|
+ "snapshot tree points to missing subvolume:\n%s",
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
|
|
fsck_err_on(!bch2_snapshot_is_ancestor(c,
|
|
le32_to_cpu(subvol.snapshot),
|
|
root_id),
|
|
trans, snapshot_tree_to_wrong_subvol,
|
|
- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
|
|
+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
|
|
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
|
|
trans, snapshot_tree_to_snapshot_subvol,
|
|
- "snapshot tree points to snapshot subvolume:\n %s",
|
|
+ "snapshot tree points to snapshot subvolume:\n%s",
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
|
|
struct bkey_i_snapshot_tree *u;
|
|
@@ -653,7 +682,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans,
|
|
u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
|
|
ret = PTR_ERR_OR_ZERO(u) ?:
|
|
bch2_snapshot_tree_create(trans, root_id,
|
|
- bch2_snapshot_tree_oldest_subvol(c, root_id),
|
|
+ bch2_snapshot_oldest_subvol(c, root_id, NULL),
|
|
&tree_id);
|
|
if (ret)
|
|
goto err;
|
|
@@ -698,6 +727,9 @@ static int check_snapshot(struct btree_trans *trans,
|
|
memset(&s, 0, sizeof(s));
|
|
memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
|
|
|
|
+ if (BCH_SNAPSHOT_DELETED(&s))
|
|
+ return 0;
|
|
+
|
|
id = le32_to_cpu(s.parent);
|
|
if (id) {
|
|
ret = bch2_snapshot_lookup(trans, id, &v);
|
|
@@ -735,7 +767,7 @@ static int check_snapshot(struct btree_trans *trans,
|
|
}
|
|
|
|
bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
|
|
- !BCH_SNAPSHOT_DELETED(&s);
|
|
+ !BCH_SNAPSHOT_WILL_DELETE(&s);
|
|
|
|
if (should_have_subvol) {
|
|
id = le32_to_cpu(s.subvol);
|
|
@@ -755,7 +787,7 @@ static int check_snapshot(struct btree_trans *trans,
|
|
} else {
|
|
if (fsck_err_on(s.subvol,
|
|
trans, snapshot_should_not_have_subvol,
|
|
- "snapshot should not point to subvol:\n %s",
|
|
+ "snapshot should not point to subvol:\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
|
|
ret = PTR_ERR_OR_ZERO(u);
|
|
@@ -773,7 +805,7 @@ static int check_snapshot(struct btree_trans *trans,
|
|
|
|
if (fsck_err_on(!ret,
|
|
trans, snapshot_to_bad_snapshot_tree,
|
|
- "snapshot points to missing/incorrect tree:\n %s",
|
|
+ "snapshot points to missing/incorrect tree:\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
|
|
if (ret)
|
|
@@ -785,7 +817,7 @@ static int check_snapshot(struct btree_trans *trans,
|
|
|
|
if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
|
|
trans, snapshot_bad_depth,
|
|
- "snapshot with incorrect depth field, should be %u:\n %s",
|
|
+ "snapshot with incorrect depth field, should be %u:\n%s",
|
|
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
|
|
ret = PTR_ERR_OR_ZERO(u);
|
|
@@ -802,7 +834,7 @@ static int check_snapshot(struct btree_trans *trans,
|
|
|
|
if (fsck_err_on(!ret,
|
|
trans, snapshot_bad_skiplist,
|
|
- "snapshot with bad skiplist field:\n %s",
|
|
+ "snapshot with bad skiplist field:\n%s",
|
|
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
|
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
|
|
ret = PTR_ERR_OR_ZERO(u);
|
|
@@ -842,9 +874,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
|
|
- if (bch2_snapshot_exists(c, id))
|
|
- return 0;
|
|
-
|
|
/* Do we need to reconstruct the snapshot_tree entry as well? */
|
|
struct btree_iter iter;
|
|
struct bkey_s_c k;
|
|
@@ -889,9 +918,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
|
|
}
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
- return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
|
|
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
|
|
- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
|
|
+ return bch2_snapshot_table_make_room(c, id) ?:
|
|
+ bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
|
|
}
|
|
|
|
/* Figure out which snapshot nodes belong in the same tree: */
|
|
@@ -989,7 +1017,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
|
|
snapshot_id_list_to_text(&buf, t);
|
|
|
|
darray_for_each(*t, id) {
|
|
- if (fsck_err_on(!bch2_snapshot_exists(c, *id),
|
|
+ if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
|
|
trans, snapshot_node_missing,
|
|
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
|
|
if (t->nr > 1) {
|
|
@@ -1014,22 +1042,38 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_check_key_has_snapshot(struct btree_trans *trans,
|
|
- struct btree_iter *iter,
|
|
- struct bkey_s_c k)
|
|
+int __bch2_check_key_has_snapshot(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_s_c k)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
struct printbuf buf = PRINTBUF;
|
|
int ret = 0;
|
|
+ enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
|
|
+
|
|
+ /* Snapshot was definitively deleted, this error is marked autofix */
|
|
+ if (fsck_err_on(state == SNAPSHOT_ID_deleted,
|
|
+ trans, bkey_in_deleted_snapshot,
|
|
+ "key in deleted snapshot %s, delete?",
|
|
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
|
|
+ prt_char(&buf, ' '),
|
|
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
|
|
+ ret = bch2_btree_delete_at(trans, iter,
|
|
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
|
|
|
|
- if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
|
|
+ /*
|
|
+ * Snapshot missing: we should have caught this with btree_lost_data and
|
|
+ * kicked off reconstruct_snapshots, so if we end up here we have no
|
|
+ * idea what happened:
|
|
+ */
|
|
+ if (fsck_err_on(state == SNAPSHOT_ID_empty,
|
|
trans, bkey_in_missing_snapshot,
|
|
"key in missing snapshot %s, delete?",
|
|
(bch2_btree_id_to_text(&buf, iter->btree_id),
|
|
prt_char(&buf, ' '),
|
|
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
|
|
ret = bch2_btree_delete_at(trans, iter,
|
|
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
|
|
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
|
|
fsck_err:
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
@@ -1053,10 +1097,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
|
|
}
|
|
|
|
/* already deleted? */
|
|
- if (BCH_SNAPSHOT_DELETED(&s->v))
|
|
+ if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
|
|
goto err;
|
|
|
|
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
|
|
+ SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
|
|
SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
|
|
s->v.subvol = 0;
|
|
err:
|
|
@@ -1073,27 +1117,28 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
|
|
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
|
|
- struct btree_iter c_iter = (struct btree_iter) { NULL };
|
|
- struct btree_iter tree_iter = (struct btree_iter) { NULL };
|
|
- struct bkey_s_c_snapshot s;
|
|
+ struct btree_iter iter, p_iter = {};
|
|
+ struct btree_iter c_iter = {};
|
|
+ struct btree_iter tree_iter = {};
|
|
u32 parent_id, child_id;
|
|
unsigned i;
|
|
int ret = 0;
|
|
|
|
- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
|
|
- BTREE_ITER_intent, snapshot);
|
|
- ret = bkey_err(s);
|
|
+ struct bkey_i_snapshot *s =
|
|
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
|
|
+ BTREE_ITER_intent, snapshot);
|
|
+ ret = PTR_ERR_OR_ZERO(s);
|
|
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
|
|
"missing snapshot %u", id);
|
|
|
|
if (ret)
|
|
goto err;
|
|
|
|
- BUG_ON(s.v->children[1]);
|
|
+ BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
|
|
+ BUG_ON(s->v.children[1]);
|
|
|
|
- parent_id = le32_to_cpu(s.v->parent);
|
|
- child_id = le32_to_cpu(s.v->children[0]);
|
|
+ parent_id = le32_to_cpu(s->v.parent);
|
|
+ child_id = le32_to_cpu(s->v.children[0]);
|
|
|
|
if (parent_id) {
|
|
struct bkey_i_snapshot *parent;
|
|
@@ -1151,24 +1196,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
|
|
*/
|
|
struct bkey_i_snapshot_tree *s_t;
|
|
|
|
- BUG_ON(s.v->children[1]);
|
|
+ BUG_ON(s->v.children[1]);
|
|
|
|
s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
|
|
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
|
|
+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
|
|
0, snapshot_tree);
|
|
ret = PTR_ERR_OR_ZERO(s_t);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (s.v->children[0]) {
|
|
- s_t->v.root_snapshot = s.v->children[0];
|
|
+ if (s->v.children[0]) {
|
|
+ s_t->v.root_snapshot = s->v.children[0];
|
|
} else {
|
|
s_t->k.type = KEY_TYPE_deleted;
|
|
set_bkey_val_u64s(&s_t->k, 0);
|
|
}
|
|
}
|
|
|
|
- ret = bch2_btree_delete_at(trans, &iter, 0);
|
|
+ if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
|
|
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
|
|
+ s->v.parent = 0;
|
|
+ s->v.children[0] = 0;
|
|
+ s->v.children[1] = 0;
|
|
+ s->v.subvol = 0;
|
|
+ s->v.tree = 0;
|
|
+ s->v.depth = 0;
|
|
+ s->v.skip[0] = 0;
|
|
+ s->v.skip[1] = 0;
|
|
+ s->v.skip[2] = 0;
|
|
+ } else {
|
|
+ s->k.type = KEY_TYPE_deleted;
|
|
+ set_bkey_val_u64s(&s->k, 0);
|
|
+ }
|
|
err:
|
|
bch2_trans_iter_exit(trans, &tree_iter);
|
|
bch2_trans_iter_exit(trans, &p_iter);
|
|
@@ -1192,13 +1251,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
|
|
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
|
|
POS_MIN, BTREE_ITER_intent);
|
|
- k = bch2_btree_iter_peek(&iter);
|
|
+ k = bch2_btree_iter_peek(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
|
|
for (i = 0; i < nr_snapids; i++) {
|
|
- k = bch2_btree_iter_prev_slot(&iter);
|
|
+ k = bch2_btree_iter_prev_slot(trans, &iter);
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1338,12 +1397,6 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
|
|
* that key to snapshot leaf nodes, where we can mutate it
|
|
*/
|
|
|
|
-struct snapshot_interior_delete {
|
|
- u32 id;
|
|
- u32 live_child;
|
|
-};
|
|
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
|
|
-
|
|
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
|
|
{
|
|
darray_for_each(*l, i)
|
|
@@ -1377,28 +1430,34 @@ static unsigned __live_child(struct snapshot_table *t, u32 id,
|
|
return 0;
|
|
}
|
|
|
|
-static unsigned live_child(struct bch_fs *c, u32 id,
|
|
- snapshot_id_list *delete_leaves,
|
|
- interior_delete_list *delete_interior)
|
|
+static unsigned live_child(struct bch_fs *c, u32 id)
|
|
{
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+
|
|
rcu_read_lock();
|
|
u32 ret = __live_child(rcu_dereference(c->snapshots), id,
|
|
- delete_leaves, delete_interior);
|
|
+ &d->delete_leaves, &d->delete_interior);
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
+static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
|
|
+{
|
|
+ return snapshot_list_has_id(&d->delete_leaves, id) ||
|
|
+ interior_delete_has_id(&d->delete_interior, id) != 0;
|
|
+}
|
|
+
|
|
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
|
|
struct btree_iter *iter,
|
|
- struct bkey_s_c k,
|
|
- snapshot_id_list *delete_leaves,
|
|
- interior_delete_list *delete_interior)
|
|
+ struct bkey_s_c k)
|
|
{
|
|
- if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
|
|
+ struct snapshot_delete *d = &trans->c->snapshot_delete;
|
|
+
|
|
+ if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
|
|
return bch2_btree_delete_at(trans, iter,
|
|
BTREE_UPDATE_internal_snapshot_node);
|
|
|
|
- u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
|
|
+ u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
|
|
if (live_child) {
|
|
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
|
|
int ret = PTR_ERR_OR_ZERO(new);
|
|
@@ -1429,49 +1488,208 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans,
|
|
return 0;
|
|
}
|
|
|
|
+static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+
|
|
+ u64 inum = iter->btree_id != BTREE_ID_inodes
|
|
+ ? iter->pos.inode
|
|
+ : iter->pos.offset;
|
|
+
|
|
+ if (*prev_inum == inum)
|
|
+ return false;
|
|
+
|
|
+ *prev_inum = inum;
|
|
+
|
|
+ bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
|
|
+ bch2_snapshot_tree(c, iter->pos.snapshot));
|
|
+ if (unlikely(ret)) {
|
|
+ struct bpos pos = iter->pos;
|
|
+ pos.snapshot = 0;
|
|
+ if (iter->btree_id != BTREE_ID_inodes)
|
|
+ pos.offset = U64_MAX;
|
|
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+
|
|
+ for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
|
|
+ struct disk_reservation res = { 0 };
|
|
+ u64 prev_inum = 0;
|
|
+
|
|
+ d->pos.pos = POS_MIN;
|
|
+
|
|
+ if (!btree_type_has_snapshots(d->pos.btree))
|
|
+ continue;
|
|
+
|
|
+ int ret = for_each_btree_key_commit(trans, iter,
|
|
+ d->pos.btree, POS_MIN,
|
|
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ d->pos.pos = iter.pos;
|
|
+
|
|
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
|
|
+ continue;
|
|
+
|
|
+ delete_dead_snapshots_process_key(trans, &iter, k);
|
|
+ }));
|
|
+
|
|
+ bch2_disk_reservation_put(c, &res);
|
|
+
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
|
|
+ struct bpos start, struct bpos end)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+ struct disk_reservation res = { 0 };
|
|
+
|
|
+ d->pos.btree = btree;
|
|
+ d->pos.pos = POS_MIN;
|
|
+
|
|
+ int ret = for_each_btree_key_max_commit(trans, iter,
|
|
+ btree, start, end,
|
|
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ d->pos.pos = iter.pos;
|
|
+ delete_dead_snapshots_process_key(trans, &iter, k);
|
|
+ }));
|
|
+
|
|
+ bch2_disk_reservation_put(c, &res);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+ struct disk_reservation res = { 0 };
|
|
+ u64 prev_inum = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ struct btree_iter iter;
|
|
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
|
|
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
|
|
+
|
|
+ while (1) {
|
|
+ struct bkey_s_c k;
|
|
+ ret = lockrestart_do(trans,
|
|
+ bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ if (!k.k)
|
|
+ break;
|
|
+
|
|
+ d->pos.btree = iter.btree_id;
|
|
+ d->pos.pos = iter.pos;
|
|
+
|
|
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
|
|
+ continue;
|
|
+
|
|
+ if (snapshot_id_dying(d, k.k->p.snapshot)) {
|
|
+ struct bpos start = POS(k.k->p.offset, 0);
|
|
+ struct bpos end = POS(k.k->p.offset, U64_MAX);
|
|
+
|
|
+ ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
|
|
+ delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
|
|
+ delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
|
|
+ } else {
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
+ }
|
|
+ }
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ prev_inum = 0;
|
|
+ ret = for_each_btree_key_commit(trans, iter,
|
|
+ BTREE_ID_inodes, POS_MIN,
|
|
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
+ d->pos.btree = iter.btree_id;
|
|
+ d->pos.pos = iter.pos;
|
|
+
|
|
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
|
|
+ continue;
|
|
+
|
|
+ delete_dead_snapshots_process_key(trans, &iter, k);
|
|
+ }));
|
|
+err:
|
|
+ bch2_disk_reservation_put(c, &res);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
/*
|
|
* For a given snapshot, if it doesn't have a subvolume that points to it, and
|
|
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
|
|
* as deleted.
|
|
*/
|
|
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
|
|
- snapshot_id_list *delete_leaves,
|
|
- interior_delete_list *delete_interior)
|
|
+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
|
|
{
|
|
if (k.k->type != KEY_TYPE_snapshot)
|
|
return 0;
|
|
|
|
struct bch_fs *c = trans->c;
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
|
|
unsigned live_children = 0;
|
|
+ int ret = 0;
|
|
|
|
if (BCH_SNAPSHOT_SUBVOL(s.v))
|
|
return 0;
|
|
|
|
+ if (BCH_SNAPSHOT_DELETED(s.v))
|
|
+ return 0;
|
|
+
|
|
+ mutex_lock(&d->progress_lock);
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
u32 child = le32_to_cpu(s.v->children[i]);
|
|
|
|
live_children += child &&
|
|
- !snapshot_list_has_id(delete_leaves, child);
|
|
+ !snapshot_list_has_id(&d->delete_leaves, child);
|
|
}
|
|
|
|
+ u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
|
|
+
|
|
if (live_children == 0) {
|
|
- return snapshot_list_add(c, delete_leaves, s.k->p.offset);
|
|
+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
|
|
+ snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
|
|
} else if (live_children == 1) {
|
|
- struct snapshot_interior_delete d = {
|
|
+ struct snapshot_interior_delete n = {
|
|
.id = s.k->p.offset,
|
|
- .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
|
|
+ .live_child = live_child(c, s.k->p.offset),
|
|
};
|
|
|
|
- if (!d.live_child) {
|
|
- bch_err(c, "error finding live child of snapshot %u", d.id);
|
|
- return -EINVAL;
|
|
+ if (!n.live_child) {
|
|
+ bch_err(c, "error finding live child of snapshot %u", n.id);
|
|
+ ret = -EINVAL;
|
|
+ } else {
|
|
+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
|
|
+ darray_push(&d->delete_interior, n);
|
|
}
|
|
-
|
|
- return darray_push(delete_interior, d);
|
|
- } else {
|
|
- return 0;
|
|
}
|
|
+ mutex_unlock(&d->progress_lock);
|
|
+
|
|
+ return ret;
|
|
}
|
|
|
|
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
|
|
@@ -1500,6 +1718,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
|
|
struct bkey_i_snapshot *s;
|
|
int ret;
|
|
|
|
+ if (!bch2_snapshot_exists(c, k.k->p.offset))
|
|
+ return 0;
|
|
+
|
|
if (k.k->type != KEY_TYPE_snapshot)
|
|
return 0;
|
|
|
|
@@ -1547,39 +1768,56 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
|
|
return bch2_trans_update(trans, iter, &s->k_i, 0);
|
|
}
|
|
|
|
-int bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
+static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
|
|
{
|
|
- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
|
|
+ prt_printf(out, "deleting from trees");
|
|
+ darray_for_each(d->deleting_from_trees, i)
|
|
+ prt_printf(out, " %u", *i);
|
|
+
|
|
+ prt_printf(out, "deleting leaves");
|
|
+ darray_for_each(d->delete_leaves, i)
|
|
+ prt_printf(out, " %u", *i);
|
|
+ prt_newline(out);
|
|
+
|
|
+ prt_printf(out, "interior");
|
|
+ darray_for_each(d->delete_interior, i)
|
|
+ prt_printf(out, " %u->%u", i->id, i->live_child);
|
|
+ prt_newline(out);
|
|
+}
|
|
+
|
|
+int __bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
+{
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (!mutex_trylock(&d->lock))
|
|
return 0;
|
|
|
|
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
|
|
+ goto out_unlock;
|
|
+
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
- snapshot_id_list delete_leaves = {};
|
|
- interior_delete_list delete_interior = {};
|
|
- int ret = 0;
|
|
|
|
/*
|
|
* For every snapshot node: If we have no live children and it's not
|
|
* pointed to by a subvolume, delete it:
|
|
*/
|
|
+ d->running = true;
|
|
+ d->pos = BBPOS_MIN;
|
|
+
|
|
ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
|
|
- check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
|
|
+ check_should_delete_snapshot(trans, k));
|
|
if (!bch2_err_matches(ret, EROFS))
|
|
bch_err_msg(c, ret, "walking snapshots");
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (!delete_leaves.nr && !delete_interior.nr)
|
|
+ if (!d->delete_leaves.nr && !d->delete_interior.nr)
|
|
goto err;
|
|
|
|
{
|
|
struct printbuf buf = PRINTBUF;
|
|
- prt_printf(&buf, "deleting leaves");
|
|
- darray_for_each(delete_leaves, i)
|
|
- prt_printf(&buf, " %u", *i);
|
|
-
|
|
- prt_printf(&buf, " interior");
|
|
- darray_for_each(delete_interior, i)
|
|
- prt_printf(&buf, " %u->%u", i->id, i->live_child);
|
|
+ bch2_snapshot_delete_nodes_to_text(&buf, d);
|
|
|
|
ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
|
|
printbuf_exit(&buf);
|
|
@@ -1587,29 +1825,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
goto err;
|
|
}
|
|
|
|
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
|
|
- struct disk_reservation res = { 0 };
|
|
-
|
|
- if (!btree_type_has_snapshots(btree))
|
|
- continue;
|
|
-
|
|
- ret = for_each_btree_key_commit(trans, iter,
|
|
- btree, POS_MIN,
|
|
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
- delete_dead_snapshots_process_key(trans, &iter, k,
|
|
- &delete_leaves,
|
|
- &delete_interior));
|
|
-
|
|
- bch2_disk_reservation_put(c, &res);
|
|
-
|
|
- if (!bch2_err_matches(ret, EROFS))
|
|
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
+ ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
|
|
+ ? delete_dead_snapshot_keys_v2(trans)
|
|
+ : delete_dead_snapshot_keys_v1(trans);
|
|
+ if (!bch2_err_matches(ret, EROFS))
|
|
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
- darray_for_each(delete_leaves, i) {
|
|
+ darray_for_each(d->delete_leaves, i) {
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
bch2_snapshot_node_delete(trans, *i));
|
|
if (!bch2_err_matches(ret, EROFS))
|
|
@@ -1626,11 +1850,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
|
|
BTREE_ITER_intent, k,
|
|
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
|
|
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
|
|
if (ret)
|
|
goto err;
|
|
|
|
- darray_for_each(delete_interior, i) {
|
|
+ darray_for_each(d->delete_interior, i) {
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
bch2_snapshot_node_delete(trans, i->id));
|
|
if (!bch2_err_matches(ret, EROFS))
|
|
@@ -1639,33 +1863,66 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
goto err;
|
|
}
|
|
err:
|
|
- darray_exit(&delete_interior);
|
|
- darray_exit(&delete_leaves);
|
|
+ mutex_lock(&d->progress_lock);
|
|
+ darray_exit(&d->deleting_from_trees);
|
|
+ darray_exit(&d->delete_interior);
|
|
+ darray_exit(&d->delete_leaves);
|
|
+ d->running = false;
|
|
+ mutex_unlock(&d->progress_lock);
|
|
bch2_trans_put(trans);
|
|
+out_unlock:
|
|
+ mutex_unlock(&d->lock);
|
|
if (!bch2_err_matches(ret, EROFS))
|
|
bch_err_fn(c, ret);
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
+{
|
|
+ if (!c->opts.auto_snapshot_deletion)
|
|
+ return 0;
|
|
+
|
|
+ return __bch2_delete_dead_snapshots(c);
|
|
+}
|
|
+
|
|
void bch2_delete_dead_snapshots_work(struct work_struct *work)
|
|
{
|
|
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
|
|
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
|
|
|
|
set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
|
|
|
|
bch2_delete_dead_snapshots(c);
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
|
|
}
|
|
|
|
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
|
|
{
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
|
|
+ if (!c->opts.auto_snapshot_deletion)
|
|
+ return;
|
|
+
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
|
|
return;
|
|
|
|
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
|
|
|
|
- if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
|
|
+ if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work))
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
|
|
+}
|
|
+
|
|
+void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ struct snapshot_delete *d = &c->snapshot_delete;
|
|
+
|
|
+ if (!d->running) {
|
|
+ prt_str(out, "(not running)");
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&d->progress_lock);
|
|
+ bch2_snapshot_delete_nodes_to_text(out, d);
|
|
+
|
|
+ bch2_bbpos_to_text(out, d->pos);
|
|
+ mutex_unlock(&d->progress_lock);
|
|
}
|
|
|
|
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
|
|
@@ -1706,7 +1963,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
|
|
return 0;
|
|
|
|
struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
|
|
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
|
|
+ if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
|
|
interior_snapshot_needs_delete(snap))
|
|
set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
|
|
|
|
@@ -1735,10 +1992,6 @@ int bch2_snapshots_read(struct bch_fs *c)
|
|
BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
|
|
test_bit(BCH_FS_may_go_rw, &c->flags));
|
|
|
|
- if (bch2_err_matches(ret, EIO) ||
|
|
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
|
|
- ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
|
|
-
|
|
return ret;
|
|
}
|
|
|
|
@@ -1746,3 +1999,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c)
|
|
{
|
|
kvfree(rcu_dereference_protected(c->snapshots, true));
|
|
}
|
|
+
|
|
+void bch2_fs_snapshots_init_early(struct bch_fs *c)
|
|
+{
|
|
+ INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
|
|
+ mutex_init(&c->snapshot_delete.lock);
|
|
+ mutex_init(&c->snapshot_delete.progress_lock);
|
|
+ mutex_init(&c->snapshots_unlinked_lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
|
|
index 00373cf32e7b..382a171f5413 100644
|
|
--- a/fs/bcachefs/snapshot.h
|
|
+++ b/fs/bcachefs/snapshot.h
|
|
@@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
|
|
return id;
|
|
}
|
|
|
|
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
|
|
u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
|
|
|
|
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
|
|
@@ -119,21 +120,26 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
|
|
return id;
|
|
}
|
|
|
|
-static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
|
|
+static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
|
|
{
|
|
const struct snapshot_t *s = snapshot_t(c, id);
|
|
- return s ? s->live : 0;
|
|
+ return s ? s->state : SNAPSHOT_ID_empty;
|
|
}
|
|
|
|
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
|
|
+static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
|
|
{
|
|
rcu_read_lock();
|
|
- bool ret = __bch2_snapshot_exists(c, id);
|
|
+ enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id);
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
|
|
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
|
|
+{
|
|
+ return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
|
|
+}
|
|
+
|
|
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
|
|
{
|
|
rcu_read_lock();
|
|
@@ -240,10 +246,19 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
|
|
int bch2_check_snapshot_trees(struct bch_fs *);
|
|
int bch2_check_snapshots(struct bch_fs *);
|
|
int bch2_reconstruct_snapshots(struct bch_fs *);
|
|
-int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
|
|
+
|
|
+int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
|
|
+
|
|
+static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
|
|
+ ? 0
|
|
+ : __bch2_check_key_has_snapshot(trans, iter, k);
|
|
+}
|
|
|
|
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
|
|
-void bch2_delete_dead_snapshots_work(struct work_struct *);
|
|
|
|
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
|
|
|
|
@@ -258,7 +273,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
|
|
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
|
|
}
|
|
|
|
+int __bch2_delete_dead_snapshots(struct bch_fs *);
|
|
+int bch2_delete_dead_snapshots(struct bch_fs *);
|
|
+void bch2_delete_dead_snapshots_work(struct work_struct *);
|
|
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
|
|
+void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
int bch2_snapshots_read(struct bch_fs *);
|
|
void bch2_fs_snapshots_exit(struct bch_fs *);
|
|
+void bch2_fs_snapshots_init_early(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_SNAPSHOT_H */
|
|
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
|
|
index aabcd3a74cd9..9bccae1f3590 100644
|
|
--- a/fs/bcachefs/snapshot_format.h
|
|
+++ b/fs/bcachefs/snapshot_format.h
|
|
@@ -15,10 +15,10 @@ struct bch_snapshot {
|
|
bch_le128 btime;
|
|
};
|
|
|
|
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
|
|
-
|
|
+LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
|
|
/* True if a subvolume points to this snapshot node: */
|
|
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
|
|
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
|
|
|
|
/*
|
|
* Snapshot trees:
|
|
diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h
|
|
new file mode 100644
|
|
index 000000000000..31f96d1cf5f4
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/snapshot_types.h
|
|
@@ -0,0 +1,58 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
|
|
+#define _BCACHEFS_SNAPSHOT_TYPES_H
|
|
+
|
|
+#include "bbpos_types.h"
|
|
+#include "subvolume_types.h"
|
|
+
|
|
+#include <linux/darray_types.h>
|
|
+
|
|
+typedef DARRAY(u32) snapshot_id_list;
|
|
+
|
|
+#define IS_ANCESTOR_BITMAP 128
|
|
+
|
|
+struct snapshot_t {
|
|
+ enum snapshot_id_state {
|
|
+ SNAPSHOT_ID_empty,
|
|
+ SNAPSHOT_ID_live,
|
|
+ SNAPSHOT_ID_deleted,
|
|
+ } state;
|
|
+ u32 parent;
|
|
+ u32 skip[3];
|
|
+ u32 depth;
|
|
+ u32 children[2];
|
|
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
|
|
+ u32 tree;
|
|
+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
|
|
+};
|
|
+
|
|
+struct snapshot_table {
|
|
+ struct rcu_head rcu;
|
|
+ size_t nr;
|
|
+#ifndef RUST_BINDGEN
|
|
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
|
|
+#else
|
|
+ struct snapshot_t s[0];
|
|
+#endif
|
|
+};
|
|
+
|
|
+struct snapshot_interior_delete {
|
|
+ u32 id;
|
|
+ u32 live_child;
|
|
+};
|
|
+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
|
|
+
|
|
+struct snapshot_delete {
|
|
+ struct mutex lock;
|
|
+ struct work_struct work;
|
|
+
|
|
+ struct mutex progress_lock;
|
|
+ snapshot_id_list deleting_from_trees;
|
|
+ snapshot_id_list delete_leaves;
|
|
+ interior_delete_list delete_interior;
|
|
+
|
|
+ bool running;
|
|
+ struct bbpos pos;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
|
|
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
|
|
index d78451c2a0c6..0cbf5508a32c 100644
|
|
--- a/fs/bcachefs/str_hash.c
|
|
+++ b/fs/bcachefs/str_hash.c
|
|
@@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
|
|
for (unsigned i = 0; i < 1000; i++) {
|
|
unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
|
|
old_name.len, old_name.name, i);
|
|
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
|
|
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
|
|
|
|
if (u64s > U8_MAX)
|
|
return -EINVAL;
|
|
@@ -101,17 +101,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans,
|
|
}
|
|
}
|
|
|
|
-static int repair_inode_hash_info(struct btree_trans *trans,
|
|
- struct bch_inode_unpacked *snapshot_root)
|
|
+/*
|
|
+ * str_hash lookups across snapshots break in wild ways if hash_info in
|
|
+ * different snapshot versions doesn't match - so if we find one mismatch, check
|
|
+ * them all
|
|
+ */
|
|
+int bch2_repair_inode_hash_info(struct btree_trans *trans,
|
|
+ struct bch_inode_unpacked *snapshot_root)
|
|
{
|
|
+ struct bch_fs *c = trans->c;
|
|
struct btree_iter iter;
|
|
struct bkey_s_c k;
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bool need_commit = false;
|
|
int ret = 0;
|
|
|
|
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
|
|
- SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
|
|
- BTREE_ITER_all_snapshots, k, ret) {
|
|
- if (k.k->p.offset != snapshot_root->bi_inum)
|
|
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes,
|
|
+ POS(0, snapshot_root->bi_inum),
|
|
+ BTREE_ITER_all_snapshots, k, ret) {
|
|
+ if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot)))
|
|
break;
|
|
if (!bkey_is_inode(k.k))
|
|
continue;
|
|
@@ -121,19 +129,72 @@ static int repair_inode_hash_info(struct btree_trans *trans,
|
|
if (ret)
|
|
break;
|
|
|
|
- if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
|
|
- INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root),
|
|
- trans, inode_snapshot_mismatch,
|
|
- "inode hash info in different snapshots don't match")) {
|
|
+ if (inode.bi_hash_seed == snapshot_root->bi_hash_seed &&
|
|
+ INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) {
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root);
|
|
+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
|
|
+
|
|
+ BUG_ON(hash1.type != hash2.type ||
|
|
+ memcmp(&hash1.siphash_key,
|
|
+ &hash2.siphash_key,
|
|
+ sizeof(hash1.siphash_key)));
|
|
+#endif
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ printbuf_reset(&buf);
|
|
+ prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n",
|
|
+ snapshot_root->bi_inum,
|
|
+ inode.bi_snapshot,
|
|
+ snapshot_root->bi_snapshot);
|
|
+
|
|
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode));
|
|
+ prt_printf(&buf, " %llx\n", inode.bi_hash_seed);
|
|
+
|
|
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
|
|
+ prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed);
|
|
+
|
|
+ if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) {
|
|
inode.bi_hash_seed = snapshot_root->bi_hash_seed;
|
|
SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
|
|
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
|
|
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
|
|
- -BCH_ERR_transaction_restart_nested;
|
|
- break;
|
|
+
|
|
+ ret = __bch2_fsck_write_inode(trans, &inode);
|
|
+ if (ret)
|
|
+ break;
|
|
+ need_commit = true;
|
|
}
|
|
}
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (!need_commit) {
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n",
|
|
+ snapshot_root->bi_inum);
|
|
+
|
|
+ prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot);
|
|
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
|
|
+ prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed);
|
|
+#if 0
|
|
+ prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot);
|
|
+ bch2_prt_str_hash_type(&buf, hash_info->type);
|
|
+ prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1);
|
|
+#endif
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ ret = -BCH_ERR_fsck_repair_unimplemented;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
|
|
+ -BCH_ERR_transaction_restart_nested;
|
|
+err:
|
|
fsck_err:
|
|
+ printbuf_exit(&buf);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
@@ -145,46 +206,18 @@ static int repair_inode_hash_info(struct btree_trans *trans,
|
|
static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
|
|
struct bch_hash_info *hash_info)
|
|
{
|
|
- struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter;
|
|
- struct bkey_s_c k;
|
|
- int ret = 0;
|
|
-
|
|
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
|
|
- BTREE_ITER_all_snapshots, k, ret) {
|
|
- if (k.k->p.offset != inum)
|
|
- break;
|
|
- if (bkey_is_inode(k.k))
|
|
- goto found;
|
|
- }
|
|
- bch_err(c, "%s(): inum %llu not found", __func__, inum);
|
|
- ret = -BCH_ERR_fsck_repair_unimplemented;
|
|
- goto err;
|
|
-found:;
|
|
- struct bch_inode_unpacked inode;
|
|
- ret = bch2_inode_unpack(k, &inode);
|
|
+ struct bch_inode_unpacked snapshot_root;
|
|
+ int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root);
|
|
if (ret)
|
|
- goto err;
|
|
+ return ret;
|
|
+
|
|
+ struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root);
|
|
+ if (hash_info->type != hash_root.type ||
|
|
+ memcmp(&hash_info->siphash_key,
|
|
+ &hash_root.siphash_key,
|
|
+ sizeof(hash_root.siphash_key)))
|
|
+ ret = bch2_repair_inode_hash_info(trans, &snapshot_root);
|
|
|
|
- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
|
|
- if (hash_info->type != hash2.type ||
|
|
- memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) {
|
|
- ret = repair_inode_hash_info(trans, &inode);
|
|
- if (!ret) {
|
|
- bch_err(c, "inode hash info mismatch with root, but mismatch not found\n"
|
|
- "%u %llx %llx\n"
|
|
- "%u %llx %llx",
|
|
- hash_info->type,
|
|
- hash_info->siphash_key.k0,
|
|
- hash_info->siphash_key.k1,
|
|
- hash2.type,
|
|
- hash2.siphash_key.k0,
|
|
- hash2.siphash_key.k1);
|
|
- ret = -BCH_ERR_fsck_repair_unimplemented;
|
|
- }
|
|
- }
|
|
-err:
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
|
|
@@ -195,7 +228,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
|
|
struct btree_iter *k_iter, struct bkey_s_c hash_k)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter iter = { NULL };
|
|
+ struct btree_iter iter = {};
|
|
struct printbuf buf = PRINTBUF;
|
|
struct bkey_s_c k;
|
|
int ret = 0;
|
|
@@ -232,7 +265,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
|
|
goto out;
|
|
|
|
if (fsck_err(trans, hash_table_key_wrong_offset,
|
|
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
|
|
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
|
|
bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
|
|
(printbuf_reset(&buf),
|
|
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
|
|
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
|
|
index 55a4ac7bf220..6762b3627e1b 100644
|
|
--- a/fs/bcachefs/str_hash.h
|
|
+++ b/fs/bcachefs/str_hash.h
|
|
@@ -12,7 +12,6 @@
|
|
#include "super.h"
|
|
|
|
#include <linux/crc32c.h>
|
|
-#include <crypto/hash.h>
|
|
#include <crypto/sha2.h>
|
|
|
|
static inline enum bch_str_hash_type
|
|
@@ -33,7 +32,9 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
|
|
}
|
|
|
|
struct bch_hash_info {
|
|
+ u32 inum_snapshot;
|
|
u8 type;
|
|
+ struct unicode_map *cf_encoding;
|
|
/*
|
|
* For crc32 or crc64 string hashes the first key value of
|
|
* the siphash_key (k0) is used as the key.
|
|
@@ -44,20 +45,20 @@ struct bch_hash_info {
|
|
static inline struct bch_hash_info
|
|
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
|
|
{
|
|
- /* XXX ick */
|
|
struct bch_hash_info info = {
|
|
- .type = INODE_STR_HASH(bi),
|
|
- .siphash_key = { .k0 = bi->bi_hash_seed }
|
|
+ .inum_snapshot = bi->bi_snapshot,
|
|
+ .type = INODE_STR_HASH(bi),
|
|
+#ifdef CONFIG_UNICODE
|
|
+ .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
|
|
+#endif
|
|
+ .siphash_key = { .k0 = bi->bi_hash_seed }
|
|
};
|
|
|
|
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
|
|
- SHASH_DESC_ON_STACK(desc, c->sha256);
|
|
u8 digest[SHA256_DIGEST_SIZE];
|
|
|
|
- desc->tfm = c->sha256;
|
|
-
|
|
- crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
|
|
- sizeof(bi->bi_hash_seed), digest);
|
|
+ sha256((const u8 *)&bi->bi_hash_seed,
|
|
+ sizeof(bi->bi_hash_seed), digest);
|
|
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
|
|
}
|
|
|
|
@@ -231,11 +232,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
|
|
struct bkey_s_c k;
|
|
int ret;
|
|
|
|
- bch2_trans_copy_iter(&iter, start);
|
|
+ bch2_trans_copy_iter(trans, &iter, start);
|
|
|
|
- bch2_btree_iter_advance(&iter);
|
|
+ bch2_btree_iter_advance(trans, &iter);
|
|
|
|
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
|
|
+ for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
|
|
if (k.k->type != desc.key_type &&
|
|
k.k->type != KEY_TYPE_hash_whiteout)
|
|
break;
|
|
@@ -280,7 +281,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
|
|
}
|
|
|
|
if (!slot.path && !(flags & STR_HASH_must_replace))
|
|
- bch2_trans_copy_iter(&slot, iter);
|
|
+ bch2_trans_copy_iter(trans, &slot, iter);
|
|
|
|
if (k.k->type != KEY_TYPE_hash_whiteout)
|
|
goto not_found;
|
|
@@ -393,6 +394,8 @@ int bch2_hash_delete(struct btree_trans *trans,
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
|
|
+
|
|
struct snapshots_seen;
|
|
int __bch2_str_hash_check_key(struct btree_trans *,
|
|
struct snapshots_seen *,
|
|
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
|
|
index b7b96283c316..35c9f86a73c1 100644
|
|
--- a/fs/bcachefs/subvolume.c
|
|
+++ b/fs/bcachefs/subvolume.c
|
|
@@ -3,9 +3,11 @@
|
|
#include "bcachefs.h"
|
|
#include "btree_key_cache.h"
|
|
#include "btree_update.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "errcode.h"
|
|
#include "error.h"
|
|
#include "fs.h"
|
|
+#include "recovery_passes.h"
|
|
#include "snapshot.h"
|
|
#include "subvolume.h"
|
|
|
|
@@ -13,6 +15,22 @@
|
|
|
|
static int bch2_subvolume_delete(struct btree_trans *, u32);
|
|
|
|
+static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
|
|
+{
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ bch2_log_msg_start(c, &buf);
|
|
+
|
|
+ prt_printf(&buf, "missing subvolume %u", subvolid);
|
|
+ bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
|
|
+
|
|
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
|
|
+ BCH_RECOVERY_PASS_check_inodes, 0);
|
|
+ if (print)
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static struct bpos subvolume_children_pos(struct bkey_s_c k)
|
|
{
|
|
if (k.k->type != KEY_TYPE_subvolume)
|
|
@@ -44,8 +62,8 @@ static int check_subvol(struct btree_trans *trans,
|
|
ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
|
|
|
|
if (bch2_err_matches(ret, ENOENT))
|
|
- bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
|
|
- k.k->p.offset, snapid);
|
|
+ return bch2_run_print_explicit_recovery_pass(c,
|
|
+ BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -275,7 +293,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
|
|
struct btree_iter iter;
|
|
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
|
|
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
return bkey_err(k) ?: k.k && k.k->p.inode == subvol
|
|
@@ -291,9 +309,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
|
|
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
|
|
BTREE_ITER_cached|
|
|
BTREE_ITER_with_updates, subvolume, s);
|
|
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
|
|
- inconsistent_if_not_found,
|
|
- trans->c, "missing subvolume %u", subvol);
|
|
+ if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
|
|
+ ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
|
|
return ret;
|
|
}
|
|
|
|
@@ -343,8 +360,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
|
|
subvolume);
|
|
ret = bkey_err(subvol);
|
|
|
|
- bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
|
|
- "missing subvolume %u", subvolid);
|
|
+ if (bch2_err_matches(ret, ENOENT))
|
|
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
|
|
|
|
if (likely(!ret))
|
|
*snapid = le32_to_cpu(subvol.v->snapshot);
|
|
@@ -417,8 +434,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
|
|
BTREE_ITER_cached|BTREE_ITER_intent,
|
|
subvolume);
|
|
int ret = bkey_err(subvol);
|
|
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
|
|
- "missing subvolume %u", subvolid);
|
|
+ if (bch2_err_matches(ret, ENOENT))
|
|
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
|
|
if (ret)
|
|
goto err;
|
|
|
|
@@ -478,13 +495,11 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
|
|
{
|
|
struct bch_fs *c = container_of(work, struct bch_fs,
|
|
snapshot_wait_for_pagecache_and_delete_work);
|
|
- snapshot_id_list s;
|
|
- u32 *id;
|
|
int ret = 0;
|
|
|
|
while (!ret) {
|
|
mutex_lock(&c->snapshots_unlinked_lock);
|
|
- s = c->snapshots_unlinked;
|
|
+ snapshot_id_list s = c->snapshots_unlinked;
|
|
darray_init(&c->snapshots_unlinked);
|
|
mutex_unlock(&c->snapshots_unlinked_lock);
|
|
|
|
@@ -493,7 +508,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
|
|
|
|
bch2_evict_subvolume_inodes(c, &s);
|
|
|
|
- for (id = s.data; id < s.data + s.nr; id++) {
|
|
+ darray_for_each(s, id) {
|
|
ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
|
|
bch_err_msg(c, ret, "deleting subvolume %u", *id);
|
|
if (ret)
|
|
@@ -503,7 +518,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
|
|
darray_exit(&s);
|
|
}
|
|
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
|
|
}
|
|
|
|
struct subvolume_unlink_hook {
|
|
@@ -526,11 +541,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans
|
|
if (ret)
|
|
return ret;
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
|
|
return -EROFS;
|
|
|
|
if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
|
|
return 0;
|
|
}
|
|
|
|
@@ -554,13 +569,13 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
|
|
BTREE_ID_subvolumes, POS(0, subvolid),
|
|
BTREE_ITER_cached, subvolume);
|
|
ret = PTR_ERR_OR_ZERO(n);
|
|
- if (unlikely(ret)) {
|
|
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
|
|
- "missing subvolume %u", subvolid);
|
|
+ if (bch2_err_matches(ret, ENOENT))
|
|
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
|
|
+ if (unlikely(ret))
|
|
return ret;
|
|
- }
|
|
|
|
SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
|
|
+ n->v.fs_path_parent = 0;
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
return ret;
|
|
}
|
|
@@ -573,7 +588,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
|
|
bool ro)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
|
|
+ struct btree_iter dst_iter, src_iter = {};
|
|
struct bkey_i_subvolume *new_subvol = NULL;
|
|
struct bkey_i_subvolume *src_subvol = NULL;
|
|
u32 parent = 0, new_nodes[2], snapshot_subvols[2];
|
|
@@ -596,11 +611,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
|
|
BTREE_ID_subvolumes, POS(0, src_subvolid),
|
|
BTREE_ITER_cached, subvolume);
|
|
ret = PTR_ERR_OR_ZERO(src_subvol);
|
|
- if (unlikely(ret)) {
|
|
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
|
|
- "subvolume %u not found", src_subvolid);
|
|
+ if (bch2_err_matches(ret, ENOENT))
|
|
+ ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
|
|
+ if (unlikely(ret))
|
|
goto err;
|
|
- }
|
|
|
|
parent = le32_to_cpu(src_subvol->v.snapshot);
|
|
}
|
|
@@ -714,11 +728,8 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
-int bch2_fs_subvolumes_init(struct bch_fs *c)
|
|
+void bch2_fs_subvolumes_init_early(struct bch_fs *c)
|
|
{
|
|
- INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
|
|
INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
|
|
bch2_subvolume_wait_for_pagecache_and_delete);
|
|
- mutex_init(&c->snapshots_unlinked_lock);
|
|
- return 0;
|
|
}
|
|
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
|
|
index 910f6196700e..771ade03a348 100644
|
|
--- a/fs/bcachefs/subvolume.h
|
|
+++ b/fs/bcachefs/subvolume.h
|
|
@@ -2,7 +2,6 @@
|
|
#ifndef _BCACHEFS_SUBVOLUME_H
|
|
#define _BCACHEFS_SUBVOLUME_H
|
|
|
|
-#include "darray.h"
|
|
#include "subvolume_types.h"
|
|
|
|
int bch2_check_subvols(struct bch_fs *);
|
|
@@ -33,16 +32,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
|
|
int bch2_subvol_is_ro(struct bch_fs *, u32);
|
|
|
|
static inline struct bkey_s_c
|
|
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
|
|
- u32 subvolid, unsigned flags)
|
|
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bpos end, u32 subvolid, unsigned flags)
|
|
{
|
|
u32 snapshot;
|
|
- int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
|
|
+ int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
|
|
if (ret)
|
|
return bkey_s_c_err(ret);
|
|
|
|
- bch2_btree_iter_set_snapshot(iter, snapshot);
|
|
- return bch2_btree_iter_peek_max_type(iter, end, flags);
|
|
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
|
|
+ return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
|
|
}
|
|
|
|
#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
|
|
@@ -53,14 +52,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos
|
|
\
|
|
do { \
|
|
_ret3 = lockrestart_do(_trans, ({ \
|
|
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
|
|
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
|
|
_end, _subvolid, (_flags)); \
|
|
if (!(_k).k) \
|
|
break; \
|
|
\
|
|
bkey_err(_k) ?: (_do); \
|
|
})); \
|
|
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
|
|
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
|
|
\
|
|
bch2_trans_iter_exit((_trans), &(_iter)); \
|
|
_ret3; \
|
|
@@ -77,15 +76,12 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos
|
|
_end, _subvolid, _flags, _k, _do); \
|
|
})
|
|
|
|
-int bch2_delete_dead_snapshots(struct bch_fs *);
|
|
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
|
|
-
|
|
int bch2_subvolume_unlink(struct btree_trans *, u32);
|
|
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
|
|
|
|
int bch2_initialize_subvolumes(struct bch_fs *);
|
|
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
|
|
|
|
-int bch2_fs_subvolumes_init(struct bch_fs *);
|
|
+void bch2_fs_subvolumes_init_early(struct bch_fs *);
|
|
|
|
#endif /* _BCACHEFS_SUBVOLUME_H */
|
|
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
|
|
index 1549d6daf7af..9d634b906dcd 100644
|
|
--- a/fs/bcachefs/subvolume_types.h
|
|
+++ b/fs/bcachefs/subvolume_types.h
|
|
@@ -2,33 +2,6 @@
|
|
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
|
|
#define _BCACHEFS_SUBVOLUME_TYPES_H
|
|
|
|
-#include "darray.h"
|
|
-
|
|
-typedef DARRAY(u32) snapshot_id_list;
|
|
-
|
|
-#define IS_ANCESTOR_BITMAP 128
|
|
-
|
|
-struct snapshot_t {
|
|
- bool live;
|
|
- u32 parent;
|
|
- u32 skip[3];
|
|
- u32 depth;
|
|
- u32 children[2];
|
|
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
|
|
- u32 tree;
|
|
- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
|
|
-};
|
|
-
|
|
-struct snapshot_table {
|
|
- struct rcu_head rcu;
|
|
- size_t nr;
|
|
-#ifndef RUST_BINDGEN
|
|
- DECLARE_FLEX_ARRAY(struct snapshot_t, s);
|
|
-#else
|
|
- struct snapshot_t s[0];
|
|
-#endif
|
|
-};
|
|
-
|
|
typedef struct {
|
|
/* we can't have padding in this struct: */
|
|
u64 subvol;
|
|
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
|
|
index a81a7b6c0989..6687b9235d3c 100644
|
|
--- a/fs/bcachefs/super-io.c
|
|
+++ b/fs/bcachefs/super-io.c
|
|
@@ -25,9 +25,6 @@
|
|
#include <linux/sort.h>
|
|
#include <linux/string_choices.h>
|
|
|
|
-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
|
|
-};
|
|
-
|
|
struct bch2_metadata_version {
|
|
u16 version;
|
|
const char *name;
|
|
@@ -69,19 +66,38 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
|
|
return v;
|
|
}
|
|
|
|
-bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
|
|
+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
|
|
{
|
|
- bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
|
|
- version <= c->sb.version_incompat_allowed;
|
|
+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
|
|
+ version <= c->sb.version_incompat_allowed)
|
|
+ ? 0
|
|
+ : -BCH_ERR_may_not_use_incompat_feature;
|
|
|
|
- if (ret) {
|
|
- mutex_lock(&c->sb_lock);
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ if (!ret) {
|
|
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
|
|
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
|
|
bch2_write_super(c);
|
|
- mutex_unlock(&c->sb_lock);
|
|
+ } else {
|
|
+ darray_for_each(c->incompat_versions_requested, i)
|
|
+ if (version == *i)
|
|
+ goto out;
|
|
+
|
|
+ darray_push(&c->incompat_versions_requested, version);
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ prt_str(&buf, "requested incompat feature ");
|
|
+ bch2_version_to_text(&buf, version);
|
|
+ prt_str(&buf, " currently not enabled, allowed up to ");
|
|
+ bch2_version_to_text(&buf, version);
|
|
+ prt_printf(&buf, "\n set version_upgrade=incompat to enable");
|
|
+
|
|
+ bch_notice(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
}
|
|
|
|
+out:
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
@@ -245,11 +261,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
|
|
|
|
/* XXX: we're not checking that offline device have enough space */
|
|
|
|
- for_each_online_member(c, ca) {
|
|
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
|
|
struct bch_sb_handle *dev_sb = &ca->disk_sb;
|
|
|
|
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
|
|
return NULL;
|
|
}
|
|
}
|
|
@@ -366,39 +382,40 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
|
|
return 0;
|
|
}
|
|
|
|
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
|
|
- enum bch_validate_flags flags, struct printbuf *out)
|
|
+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
|
|
+ enum bch_validate_flags flags, struct printbuf *out)
|
|
{
|
|
- struct bch_sb *sb = disk_sb->sb;
|
|
- struct bch_sb_field_members_v1 *mi;
|
|
enum bch_opt_id opt_id;
|
|
- u16 block_size;
|
|
int ret;
|
|
|
|
ret = bch2_sb_compatible(sb, out);
|
|
if (ret)
|
|
return ret;
|
|
|
|
- if (sb->features[1] ||
|
|
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
|
|
- prt_printf(out, "Filesystem has incompatible features");
|
|
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
|
|
+ unsigned incompat_bit = 0;
|
|
+ if (incompat)
|
|
+ incompat_bit = __ffs64(incompat);
|
|
+ else if (sb->features[1])
|
|
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
|
|
+
|
|
+ if (incompat_bit) {
|
|
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
|
|
+ incompat_bit,
|
|
+ bch2_sb_features[BCH_FEATURE_NR - 1],
|
|
+ BCH_FEATURE_NR - 1);
|
|
return -BCH_ERR_invalid_sb_features;
|
|
}
|
|
|
|
if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
|
|
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
|
|
- prt_printf(out, "Filesystem has incompatible version");
|
|
+ prt_str(out, "Filesystem has incompatible version ");
|
|
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
|
|
+ prt_str(out, ", current version ");
|
|
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
|
|
return -BCH_ERR_invalid_sb_features;
|
|
}
|
|
|
|
- block_size = le16_to_cpu(sb->block_size);
|
|
-
|
|
- if (block_size > PAGE_SECTORS) {
|
|
- prt_printf(out, "Block size too big (got %u, max %u)",
|
|
- block_size, PAGE_SECTORS);
|
|
- return -BCH_ERR_invalid_sb_block_size;
|
|
- }
|
|
-
|
|
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
|
|
prt_printf(out, "Bad user UUID (got zeroes)");
|
|
return -BCH_ERR_invalid_sb_uuid;
|
|
@@ -409,6 +426,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
|
|
return -BCH_ERR_invalid_sb_uuid;
|
|
}
|
|
|
|
+ if (!(flags & BCH_VALIDATE_write) &&
|
|
+ le64_to_cpu(sb->offset) != read_offset) {
|
|
+ prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
|
|
+ le64_to_cpu(sb->offset), read_offset);
|
|
+ return -BCH_ERR_invalid_sb_offset;
|
|
+ }
|
|
+
|
|
if (!sb->nr_devices ||
|
|
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
|
|
prt_printf(out, "Bad number of member devices %u (max %u)",
|
|
@@ -444,6 +468,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
|
|
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
|
|
}
|
|
|
|
+ if (sb->nr_devices > 1)
|
|
+ SET_BCH_SB_MULTI_DEVICE(sb, true);
|
|
+
|
|
if (!flags) {
|
|
/*
|
|
* Been seeing a bug where these are getting inexplicably
|
|
@@ -464,6 +491,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
|
|
|
|
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
|
|
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
|
|
+
|
|
+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
|
|
+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
|
|
+
|
|
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
|
|
+ !BCH_SB_CSUM_ERR_RETRY_NR(sb))
|
|
+ SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
|
|
}
|
|
|
|
#ifdef __KERNEL__
|
|
@@ -474,8 +508,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
|
|
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
|
|
const struct bch_option *opt = bch2_opt_table + opt_id;
|
|
|
|
- if (opt->get_sb != BCH2_NO_SB_OPT) {
|
|
- u64 v = bch2_opt_from_sb(sb, opt_id);
|
|
+ if (opt->get_sb) {
|
|
+ u64 v = bch2_opt_from_sb(sb, opt_id, -1);
|
|
|
|
prt_printf(out, "Invalid option ");
|
|
ret = bch2_opt_validate(opt, v, out);
|
|
@@ -505,14 +539,17 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
|
|
}
|
|
}
|
|
|
|
+ struct bch_sb_field *mi =
|
|
+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
|
|
+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
|
|
+
|
|
/* members must be validated first: */
|
|
- mi = bch2_sb_field_get(sb, members_v1);
|
|
if (!mi) {
|
|
prt_printf(out, "Invalid superblock: member info area missing");
|
|
return -BCH_ERR_invalid_sb_members_missing;
|
|
}
|
|
|
|
- ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
|
|
+ ret = bch2_sb_field_validate(sb, mi, flags, out);
|
|
if (ret)
|
|
return ret;
|
|
|
|
@@ -581,11 +618,15 @@ static void bch2_sb_update(struct bch_fs *c)
|
|
|
|
c->sb.features = le64_to_cpu(src->features[0]);
|
|
c->sb.compat = le64_to_cpu(src->compat[0]);
|
|
+ c->sb.multi_device = BCH_SB_MULTI_DEVICE(src);
|
|
|
|
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
|
|
|
|
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
|
|
if (ext) {
|
|
+ c->sb.recovery_passes_required =
|
|
+ bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
|
|
+
|
|
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
|
|
sizeof(c->sb.errors_silent) * 8);
|
|
c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
|
|
@@ -755,7 +796,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
|
|
memset(sb, 0, sizeof(*sb));
|
|
sb->mode = BLK_OPEN_READ;
|
|
sb->have_bio = true;
|
|
- sb->holder = kmalloc(1, GFP_KERNEL);
|
|
+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
|
|
if (!sb->holder)
|
|
return -ENOMEM;
|
|
|
|
@@ -881,7 +922,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
|
|
|
|
sb->have_layout = true;
|
|
|
|
- ret = bch2_sb_validate(sb, 0, &err);
|
|
+ ret = bch2_sb_validate(sb->sb, offset, 0, &err);
|
|
if (ret) {
|
|
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
|
|
path, err.buf);
|
|
@@ -918,19 +959,19 @@ static void write_super_endio(struct bio *bio)
|
|
{
|
|
struct bch_dev *ca = bio->bi_private;
|
|
|
|
+ bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
|
|
+
|
|
/* XXX: return errors directly */
|
|
|
|
- if (bch2_dev_io_err_on(bio->bi_status, ca,
|
|
- bio_data_dir(bio)
|
|
- ? BCH_MEMBER_ERROR_write
|
|
- : BCH_MEMBER_ERROR_read,
|
|
- "superblock %s error: %s",
|
|
+ if (bio->bi_status) {
|
|
+ bch_err_dev_ratelimited(ca, "superblock %s error: %s",
|
|
str_write_read(bio_data_dir(bio)),
|
|
- bch2_blk_status_to_str(bio->bi_status)))
|
|
+ bch2_blk_status_to_str(bio->bi_status));
|
|
ca->sb_write_error = 1;
|
|
+ }
|
|
|
|
closure_put(&ca->fs->sb_write);
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
|
|
}
|
|
|
|
static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
|
|
@@ -948,7 +989,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
|
|
|
|
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
|
|
|
|
- percpu_ref_get(&ca->io_ref);
|
|
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
|
|
closure_bio_submit(bio, &c->sb_write);
|
|
}
|
|
|
|
@@ -974,7 +1015,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
|
|
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
|
|
bio_sectors(bio));
|
|
|
|
- percpu_ref_get(&ca->io_ref);
|
|
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
|
|
closure_bio_submit(bio, &c->sb_write);
|
|
}
|
|
|
|
@@ -991,7 +1032,7 @@ int bch2_write_super(struct bch_fs *c)
|
|
|
|
trace_and_count(c, write_super, c, _RET_IP_);
|
|
|
|
- if (c->opts.very_degraded)
|
|
+ if (c->opts.degraded == BCH_DEGRADED_very)
|
|
degraded_flags |= BCH_FORCE_IF_LOST;
|
|
|
|
lockdep_assert_held(&c->sb_lock);
|
|
@@ -999,13 +1040,20 @@ int bch2_write_super(struct bch_fs *c)
|
|
closure_init_stack(cl);
|
|
memset(&sb_written, 0, sizeof(sb_written));
|
|
|
|
- for_each_online_member(c, ca) {
|
|
+ /*
|
|
+ * Note: we do writes to RO devices here, and we might want to change
|
|
+ * that in the future.
|
|
+ *
|
|
+ * For now, we expect to be able to call write_super() when we're not
|
|
+ * yet RW:
|
|
+ */
|
|
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
|
|
ret = darray_push(&online_devices, ca);
|
|
if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
|
|
- percpu_ref_put(&ca->io_ref);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
|
|
goto out;
|
|
}
|
|
- percpu_ref_get(&ca->io_ref);
|
|
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
|
|
}
|
|
|
|
/* Make sure we're using the new magic numbers: */
|
|
@@ -1038,7 +1086,7 @@ int bch2_write_super(struct bch_fs *c)
|
|
darray_for_each(online_devices, ca) {
|
|
printbuf_reset(&err);
|
|
|
|
- ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
|
|
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
|
|
if (ret) {
|
|
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
|
|
goto out;
|
|
@@ -1064,7 +1112,8 @@ int bch2_write_super(struct bch_fs *c)
|
|
prt_str(&buf, ")");
|
|
bch2_fs_fatal_error(c, ": %s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
- return -BCH_ERR_sb_not_downgraded;
|
|
+ ret = -BCH_ERR_sb_not_downgraded;
|
|
+ goto out;
|
|
}
|
|
|
|
darray_for_each(online_devices, ca) {
|
|
@@ -1166,12 +1215,12 @@ int bch2_write_super(struct bch_fs *c)
|
|
!can_mount_with_written), c,
|
|
": Unable to write superblock to sufficient devices (from %ps)",
|
|
(void *) _RET_IP_))
|
|
- ret = -1;
|
|
+ ret = -BCH_ERR_erofs_sb_err;
|
|
out:
|
|
/* Make new options visible after they're persistent: */
|
|
bch2_sb_update(c);
|
|
darray_for_each(online_devices, ca)
|
|
- percpu_ref_put(&(*ca)->io_ref);
|
|
+ enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
|
|
darray_exit(&online_devices);
|
|
printbuf_exit(&err);
|
|
return ret;
|
|
@@ -1223,15 +1272,39 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
|
|
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
|
|
|
|
c->disk_sb.sb->version = cpu_to_le16(new_version);
|
|
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
|
|
|
|
if (incompat) {
|
|
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
|
|
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
|
|
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
|
|
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
|
|
}
|
|
}
|
|
|
|
+void bch2_sb_upgrade_incompat(struct bch_fs *c)
|
|
+{
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ if (c->sb.version == c->sb.version_incompat_allowed)
|
|
+ goto unlock;
|
|
+
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+
|
|
+ prt_str(&buf, "Now allowing incompatible features up to ");
|
|
+ bch2_version_to_text(&buf, c->sb.version);
|
|
+ prt_str(&buf, ", previously allowed up to ");
|
|
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
|
|
+ prt_newline(&buf);
|
|
+
|
|
+ bch_notice(c, "%s", buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+
|
|
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
|
|
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
|
|
+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
|
|
+ bch2_write_super(c);
|
|
+unlock:
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+}
|
|
+
|
|
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
|
|
enum bch_validate_flags flags, struct printbuf *err)
|
|
{
|
|
@@ -1459,8 +1532,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
|
|
for (id = 0; id < bch2_opts_nr; id++) {
|
|
const struct bch_option *opt = bch2_opt_table + id;
|
|
|
|
- if (opt->get_sb != BCH2_NO_SB_OPT) {
|
|
- u64 v = bch2_opt_from_sb(sb, id);
|
|
+ if (opt->get_sb) {
|
|
+ u64 v = bch2_opt_from_sb(sb, id, -1);
|
|
|
|
prt_printf(out, "%s:\t", opt->attr.name);
|
|
bch2_opt_to_text(out, NULL, sb, opt, v,
|
|
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
|
|
index b4cff9ebdebb..a3b7a90f2533 100644
|
|
--- a/fs/bcachefs/super-io.h
|
|
+++ b/fs/bcachefs/super-io.h
|
|
@@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version)
|
|
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
|
|
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
|
|
|
|
-bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
|
|
+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
|
|
|
|
-static inline bool bch2_request_incompat_feature(struct bch_fs *c,
|
|
- enum bcachefs_metadata_version version)
|
|
+static inline int bch2_request_incompat_feature(struct bch_fs *c,
|
|
+ enum bcachefs_metadata_version version)
|
|
{
|
|
return likely(version <= c->sb.version_incompat)
|
|
- ? true
|
|
+ ? 0
|
|
: bch2_set_version_incompat(c, version);
|
|
}
|
|
|
|
@@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
|
|
void bch2_free_super(struct bch_sb_handle *);
|
|
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
|
|
|
|
+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
|
|
+
|
|
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
|
|
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
|
|
int bch2_write_super(struct bch_fs *);
|
|
@@ -105,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
|
|
|
|
bool bch2_check_version_downgrade(struct bch_fs *);
|
|
void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
|
|
+void bch2_sb_upgrade_incompat(struct bch_fs *);
|
|
|
|
void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
|
|
struct bch_sb_field *);
|
|
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
|
|
index 0459c875e189..24658bf450ab 100644
|
|
--- a/fs/bcachefs/super.c
|
|
+++ b/fs/bcachefs/super.c
|
|
@@ -10,6 +10,8 @@
|
|
#include "bcachefs.h"
|
|
#include "alloc_background.h"
|
|
#include "alloc_foreground.h"
|
|
+#include "async_objs.h"
|
|
+#include "backpointers.h"
|
|
#include "bkey_sort.h"
|
|
#include "btree_cache.h"
|
|
#include "btree_gc.h"
|
|
@@ -28,6 +30,7 @@
|
|
#include "disk_accounting.h"
|
|
#include "disk_groups.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "errcode.h"
|
|
#include "error.h"
|
|
#include "fs.h"
|
|
@@ -48,6 +51,7 @@
|
|
#include "quota.h"
|
|
#include "rebalance.h"
|
|
#include "recovery.h"
|
|
+#include "recovery_passes.h"
|
|
#include "replicas.h"
|
|
#include "sb-clean.h"
|
|
#include "sb-counters.h"
|
|
@@ -70,26 +74,37 @@
|
|
#include <linux/percpu.h>
|
|
#include <linux/random.h>
|
|
#include <linux/sysfs.h>
|
|
-#include <crypto/hash.h>
|
|
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
|
|
MODULE_DESCRIPTION("bcachefs filesystem");
|
|
-MODULE_SOFTDEP("pre: crc32c");
|
|
-MODULE_SOFTDEP("pre: crc64");
|
|
-MODULE_SOFTDEP("pre: sha256");
|
|
-MODULE_SOFTDEP("pre: chacha20");
|
|
-MODULE_SOFTDEP("pre: poly1305");
|
|
-MODULE_SOFTDEP("pre: xxhash");
|
|
|
|
-const char * const bch2_fs_flag_strs[] = {
|
|
+typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
|
|
+
|
|
#define x(n) #n,
|
|
+const char * const bch2_fs_flag_strs[] = {
|
|
BCH_FS_FLAGS()
|
|
-#undef x
|
|
NULL
|
|
};
|
|
|
|
-void bch2_print_str(struct bch_fs *c, const char *str)
|
|
+const char * const bch2_write_refs[] = {
|
|
+ BCH_WRITE_REFS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_dev_read_refs[] = {
|
|
+ BCH_DEV_READ_REFS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_dev_write_refs[] = {
|
|
+ BCH_DEV_WRITE_REFS()
|
|
+ NULL
|
|
+};
|
|
+#undef x
|
|
+
|
|
+static void __bch2_print_str(struct bch_fs *c, const char *prefix,
|
|
+ const char *str, bool nonblocking)
|
|
{
|
|
#ifdef __KERNEL__
|
|
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
|
|
@@ -99,7 +114,17 @@ void bch2_print_str(struct bch_fs *c, const char *str)
|
|
return;
|
|
}
|
|
#endif
|
|
- bch2_print_string_as_lines(KERN_ERR, str);
|
|
+ bch2_print_string_as_lines(KERN_ERR, str, nonblocking);
|
|
+}
|
|
+
|
|
+void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
|
|
+{
|
|
+ __bch2_print_str(c, prefix, str, false);
|
|
+}
|
|
+
|
|
+void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str)
|
|
+{
|
|
+ __bch2_print_str(c, prefix, str, true);
|
|
}
|
|
|
|
__printf(2, 0)
|
|
@@ -188,7 +213,9 @@ static void bch2_dev_unlink(struct bch_dev *);
|
|
static void bch2_dev_free(struct bch_dev *);
|
|
static int bch2_dev_alloc(struct bch_fs *, unsigned);
|
|
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
|
|
+static void bch2_dev_io_ref_stop(struct bch_dev *, int);
|
|
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
|
|
+static int bch2_fs_init_rw(struct bch_fs *);
|
|
|
|
struct bch_fs *bch2_dev_to_fs(dev_t dev)
|
|
{
|
|
@@ -297,19 +324,19 @@ static void __bch2_fs_read_only(struct bch_fs *c)
|
|
/*
|
|
* After stopping journal:
|
|
*/
|
|
- for_each_member_device(c, ca)
|
|
+ for_each_member_device(c, ca) {
|
|
+ bch2_dev_io_ref_stop(ca, WRITE);
|
|
bch2_dev_allocator_remove(c, ca);
|
|
+ }
|
|
}
|
|
|
|
-#ifndef BCH_WRITE_REF_DEBUG
|
|
-static void bch2_writes_disabled(struct percpu_ref *writes)
|
|
+static void bch2_writes_disabled(struct enumerated_ref *writes)
|
|
{
|
|
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
|
|
|
|
set_bit(BCH_FS_write_disable_complete, &c->flags);
|
|
wake_up(&bch2_read_only_wait);
|
|
}
|
|
-#endif
|
|
|
|
void bch2_fs_read_only(struct bch_fs *c)
|
|
{
|
|
@@ -327,12 +354,7 @@ void bch2_fs_read_only(struct bch_fs *c)
|
|
* writes will return -EROFS:
|
|
*/
|
|
set_bit(BCH_FS_going_ro, &c->flags);
|
|
-#ifndef BCH_WRITE_REF_DEBUG
|
|
- percpu_ref_kill(&c->writes);
|
|
-#else
|
|
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
|
|
- bch2_write_ref_put(c, i);
|
|
-#endif
|
|
+ enumerated_ref_stop_async(&c->writes);
|
|
|
|
/*
|
|
* If we're not doing an emergency shutdown, we want to wait on
|
|
@@ -370,7 +392,7 @@ void bch2_fs_read_only(struct bch_fs *c)
|
|
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
|
|
test_bit(BCH_FS_started, &c->flags) &&
|
|
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
|
|
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
|
|
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
|
|
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
|
|
BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
|
|
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
|
|
@@ -381,6 +403,11 @@ void bch2_fs_read_only(struct bch_fs *c)
|
|
bch_verbose(c, "marking filesystem clean");
|
|
bch2_fs_mark_clean(c);
|
|
} else {
|
|
+ /* Make sure error counts/counters are persisted */
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
bch_verbose(c, "done going read-only, filesystem not clean");
|
|
}
|
|
}
|
|
@@ -411,41 +438,39 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
|
|
+static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
|
|
+ bool locked)
|
|
{
|
|
bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
|
|
|
|
- bch2_journal_halt_locked(&c->journal);
|
|
+ if (!locked)
|
|
+ bch2_journal_halt(&c->journal);
|
|
+ else
|
|
+ bch2_journal_halt_locked(&c->journal);
|
|
bch2_fs_read_only_async(c);
|
|
-
|
|
wake_up(&bch2_read_only_wait);
|
|
+
|
|
+ if (ret)
|
|
+ prt_printf(out, "emergency read only at seq %llu\n",
|
|
+ journal_cur_seq(&c->journal));
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
-static int bch2_fs_read_write_late(struct bch_fs *c)
|
|
+bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
|
|
{
|
|
- int ret;
|
|
+ return __bch2_fs_emergency_read_only2(c, out, false);
|
|
+}
|
|
|
|
- /*
|
|
- * Data move operations can't run until after check_snapshots has
|
|
- * completed, and bch2_snapshot_is_ancestor() is available.
|
|
- *
|
|
- * Ideally we'd start copygc/rebalance earlier instead of waiting for
|
|
- * all of recovery/fsck to complete:
|
|
- */
|
|
- ret = bch2_copygc_start(c);
|
|
- if (ret) {
|
|
- bch_err(c, "error starting copygc thread");
|
|
- return ret;
|
|
- }
|
|
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
|
|
+{
|
|
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
|
|
|
|
- ret = bch2_rebalance_start(c);
|
|
- if (ret) {
|
|
- bch_err(c, "error starting rebalance thread");
|
|
- return ret;
|
|
- }
|
|
+ bch2_journal_halt_locked(&c->journal);
|
|
+ bch2_fs_read_only_async(c);
|
|
|
|
- return 0;
|
|
+ wake_up(&bch2_read_only_wait);
|
|
+ return ret;
|
|
}
|
|
|
|
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
|
|
@@ -454,59 +479,79 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
|
|
|
|
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
|
|
|
|
+ if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
|
|
+ return -BCH_ERR_erofs_no_alloc_info;
|
|
+
|
|
if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
|
|
bch_err(c, "cannot go rw, unfixed btree errors");
|
|
return -BCH_ERR_erofs_unfixed_errors;
|
|
}
|
|
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
|
|
+ bch_err(c, "cannot go rw, filesystem is an unresized image file");
|
|
+ return -BCH_ERR_erofs_filesystem_full;
|
|
+ }
|
|
+
|
|
if (test_bit(BCH_FS_rw, &c->flags))
|
|
return 0;
|
|
|
|
bch_info(c, "going read-write");
|
|
|
|
- ret = bch2_sb_members_v2_init(c);
|
|
+ ret = bch2_fs_init_rw(c);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- ret = bch2_fs_mark_dirty(c);
|
|
+ ret = bch2_sb_members_v2_init(c);
|
|
if (ret)
|
|
goto err;
|
|
|
|
clear_bit(BCH_FS_clean_shutdown, &c->flags);
|
|
|
|
+ rcu_read_lock();
|
|
+ for_each_online_member_rcu(c, ca)
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
|
|
+ bch2_dev_allocator_add(c, ca);
|
|
+ enumerated_ref_start(&ca->io_ref[WRITE]);
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ bch2_recalc_capacity(c);
|
|
+
|
|
/*
|
|
* First journal write must be a flush write: after a clean shutdown we
|
|
* don't read the journal, so the first journal write may end up
|
|
* overwriting whatever was there previously, and there must always be
|
|
* at least one non-flush write in the journal or recovery will fail:
|
|
*/
|
|
+ spin_lock(&c->journal.lock);
|
|
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
|
|
set_bit(JOURNAL_running, &c->journal.flags);
|
|
+ bch2_journal_space_available(&c->journal);
|
|
+ spin_unlock(&c->journal.lock);
|
|
|
|
- for_each_rw_member(c, ca)
|
|
- bch2_dev_allocator_add(c, ca);
|
|
- bch2_recalc_capacity(c);
|
|
+ ret = bch2_fs_mark_dirty(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = bch2_journal_reclaim_start(&c->journal);
|
|
+ if (ret)
|
|
+ goto err;
|
|
|
|
set_bit(BCH_FS_rw, &c->flags);
|
|
set_bit(BCH_FS_was_rw, &c->flags);
|
|
|
|
-#ifndef BCH_WRITE_REF_DEBUG
|
|
- percpu_ref_reinit(&c->writes);
|
|
-#else
|
|
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
|
|
- BUG_ON(atomic_long_read(&c->writes[i]));
|
|
- atomic_long_inc(&c->writes[i]);
|
|
- }
|
|
-#endif
|
|
+ enumerated_ref_start(&c->writes);
|
|
|
|
- ret = bch2_journal_reclaim_start(&c->journal);
|
|
- if (ret)
|
|
+ ret = bch2_copygc_start(c);
|
|
+ if (ret) {
|
|
+ bch_err_msg(c, ret, "error starting copygc thread");
|
|
goto err;
|
|
+ }
|
|
|
|
- if (!early) {
|
|
- ret = bch2_fs_read_write_late(c);
|
|
- if (ret)
|
|
- goto err;
|
|
+ ret = bch2_rebalance_start(c);
|
|
+ if (ret) {
|
|
+ bch_err_msg(c, ret, "error starting rebalance thread");
|
|
+ goto err;
|
|
}
|
|
|
|
bch2_do_discards(c);
|
|
@@ -531,14 +576,19 @@ int bch2_fs_read_write(struct bch_fs *c)
|
|
if (c->opts.nochanges)
|
|
return -BCH_ERR_erofs_nochanges;
|
|
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
|
|
+ return -BCH_ERR_erofs_no_alloc_info;
|
|
+
|
|
return __bch2_fs_read_write(c, false);
|
|
}
|
|
|
|
int bch2_fs_read_write_early(struct bch_fs *c)
|
|
{
|
|
- lockdep_assert_held(&c->state_lock);
|
|
+ down_write(&c->state_lock);
|
|
+ int ret = __bch2_fs_read_write(c, true);
|
|
+ up_write(&c->state_lock);
|
|
|
|
- return __bch2_fs_read_write(c, true);
|
|
+ return ret;
|
|
}
|
|
|
|
/* Filesystem startup/shutdown: */
|
|
@@ -548,37 +598,44 @@ static void __bch2_fs_free(struct bch_fs *c)
|
|
for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
|
|
bch2_time_stats_exit(&c->times[i]);
|
|
|
|
+#ifdef CONFIG_UNICODE
|
|
+ utf8_unload(c->cf_encoding);
|
|
+#endif
|
|
+
|
|
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
|
|
bch2_free_pending_node_rewrites(c);
|
|
- bch2_fs_accounting_exit(c);
|
|
- bch2_fs_sb_errors_exit(c);
|
|
- bch2_fs_counters_exit(c);
|
|
+ bch2_free_fsck_errs(c);
|
|
+ bch2_fs_vfs_exit(c);
|
|
bch2_fs_snapshots_exit(c);
|
|
+ bch2_fs_sb_errors_exit(c);
|
|
+ bch2_fs_replicas_exit(c);
|
|
+ bch2_fs_rebalance_exit(c);
|
|
bch2_fs_quota_exit(c);
|
|
+ bch2_fs_nocow_locking_exit(c);
|
|
+ bch2_fs_journal_exit(&c->journal);
|
|
bch2_fs_fs_io_direct_exit(c);
|
|
bch2_fs_fs_io_buffered_exit(c);
|
|
bch2_fs_fsio_exit(c);
|
|
- bch2_fs_vfs_exit(c);
|
|
- bch2_fs_ec_exit(c);
|
|
- bch2_fs_encryption_exit(c);
|
|
- bch2_fs_nocow_locking_exit(c);
|
|
bch2_fs_io_write_exit(c);
|
|
bch2_fs_io_read_exit(c);
|
|
+ bch2_fs_encryption_exit(c);
|
|
+ bch2_fs_ec_exit(c);
|
|
+ bch2_fs_counters_exit(c);
|
|
+ bch2_fs_compress_exit(c);
|
|
+ bch2_io_clock_exit(&c->io_clock[WRITE]);
|
|
+ bch2_io_clock_exit(&c->io_clock[READ]);
|
|
bch2_fs_buckets_waiting_for_journal_exit(c);
|
|
- bch2_fs_btree_interior_update_exit(c);
|
|
+ bch2_fs_btree_write_buffer_exit(c);
|
|
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
|
|
- bch2_fs_btree_cache_exit(c);
|
|
bch2_fs_btree_iter_exit(c);
|
|
- bch2_fs_replicas_exit(c);
|
|
- bch2_fs_journal_exit(&c->journal);
|
|
- bch2_io_clock_exit(&c->io_clock[WRITE]);
|
|
- bch2_io_clock_exit(&c->io_clock[READ]);
|
|
- bch2_fs_compress_exit(c);
|
|
- bch2_fs_btree_gc_exit(c);
|
|
+ bch2_fs_btree_interior_update_exit(c);
|
|
+ bch2_fs_btree_cache_exit(c);
|
|
+ bch2_fs_accounting_exit(c);
|
|
+ bch2_fs_async_obj_exit(c);
|
|
bch2_journal_keys_put_initial(c);
|
|
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
|
|
+
|
|
BUG_ON(atomic_read(&c->journal_keys.ref));
|
|
- bch2_fs_btree_write_buffer_exit(c);
|
|
percpu_free_rwsem(&c->mark_lock);
|
|
if (c->online_reserved) {
|
|
u64 v = percpu_u64_get(c->online_reserved);
|
|
@@ -586,6 +643,7 @@ static void __bch2_fs_free(struct bch_fs *c)
|
|
free_percpu(c->online_reserved);
|
|
}
|
|
|
|
+ darray_exit(&c->incompat_versions_requested);
|
|
darray_exit(&c->btree_roots_extra);
|
|
free_percpu(c->pcpu);
|
|
free_percpu(c->usage);
|
|
@@ -593,9 +651,7 @@ static void __bch2_fs_free(struct bch_fs *c)
|
|
mempool_exit(&c->btree_bounce_pool);
|
|
bioset_exit(&c->btree_bio);
|
|
mempool_exit(&c->fill_iter);
|
|
-#ifndef BCH_WRITE_REF_DEBUG
|
|
- percpu_ref_exit(&c->writes);
|
|
-#endif
|
|
+ enumerated_ref_exit(&c->writes);
|
|
kfree(rcu_dereference_protected(c->disk_groups, 1));
|
|
kfree(c->journal_seq_blacklist_table);
|
|
|
|
@@ -607,8 +663,8 @@ static void __bch2_fs_free(struct bch_fs *c)
|
|
destroy_workqueue(c->btree_read_complete_wq);
|
|
if (c->copygc_wq)
|
|
destroy_workqueue(c->copygc_wq);
|
|
- if (c->btree_io_complete_wq)
|
|
- destroy_workqueue(c->btree_io_complete_wq);
|
|
+ if (c->btree_write_complete_wq)
|
|
+ destroy_workqueue(c->btree_write_complete_wq);
|
|
if (c->btree_update_wq)
|
|
destroy_workqueue(c->btree_update_wq);
|
|
|
|
@@ -634,6 +690,12 @@ void __bch2_fs_stop(struct bch_fs *c)
|
|
bch2_fs_read_only(c);
|
|
up_write(&c->state_lock);
|
|
|
|
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
|
|
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
|
|
+ if (ca)
|
|
+ bch2_dev_io_ref_stop(ca, READ);
|
|
+ }
|
|
+
|
|
for_each_member_device(c, ca)
|
|
bch2_dev_unlink(ca);
|
|
|
|
@@ -662,8 +724,6 @@ void __bch2_fs_stop(struct bch_fs *c)
|
|
|
|
void bch2_fs_free(struct bch_fs *c)
|
|
{
|
|
- unsigned i;
|
|
-
|
|
mutex_lock(&bch_fs_list_lock);
|
|
list_del(&c->list);
|
|
mutex_unlock(&bch_fs_list_lock);
|
|
@@ -671,11 +731,12 @@ void bch2_fs_free(struct bch_fs *c)
|
|
closure_sync(&c->cl);
|
|
closure_debug_destroy(&c->cl);
|
|
|
|
- for (i = 0; i < c->sb.nr_devices; i++) {
|
|
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
|
|
struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
|
|
|
|
if (ca) {
|
|
EBUG_ON(atomic_long_read(&ca->ref) != 1);
|
|
+ bch2_dev_io_ref_stop(ca, READ);
|
|
bch2_free_super(&ca->disk_sb);
|
|
bch2_dev_free(ca);
|
|
}
|
|
@@ -698,9 +759,10 @@ static int bch2_fs_online(struct bch_fs *c)
|
|
|
|
lockdep_assert_held(&bch_fs_list_lock);
|
|
|
|
- if (__bch2_uuid_to_fs(c->sb.uuid)) {
|
|
+ if (c->sb.multi_device &&
|
|
+ __bch2_uuid_to_fs(c->sb.uuid)) {
|
|
bch_err(c, "filesystem UUID already open");
|
|
- return -EINVAL;
|
|
+ return -BCH_ERR_filesystem_uuid_already_open;
|
|
}
|
|
|
|
ret = bch2_fs_chardev_init(c);
|
|
@@ -711,14 +773,16 @@ static int bch2_fs_online(struct bch_fs *c)
|
|
|
|
bch2_fs_debug_init(c);
|
|
|
|
- ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
|
|
+ ret = (c->sb.multi_device
|
|
+ ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
|
|
+ : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
|
|
kobject_add(&c->internal, &c->kobj, "internal") ?:
|
|
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
|
|
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
|
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
|
|
#endif
|
|
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
|
|
- bch2_opts_create_sysfs_files(&c->opts_dir);
|
|
+ bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
|
|
if (ret) {
|
|
bch_err(c, "error creating sysfs objects");
|
|
return ret;
|
|
@@ -742,7 +806,37 @@ static int bch2_fs_online(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
+static int bch2_fs_init_rw(struct bch_fs *c)
|
|
+{
|
|
+ if (test_bit(BCH_FS_rw_init_done, &c->flags))
|
|
+ return 0;
|
|
+
|
|
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
|
|
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
|
|
+ !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
|
|
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
|
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
|
|
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
|
|
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
|
|
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
|
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
|
|
+ WQ_FREEZABLE, 0)))
|
|
+ return -BCH_ERR_ENOMEM_fs_other_alloc;
|
|
+
|
|
+ int ret = bch2_fs_btree_interior_update_init(c) ?:
|
|
+ bch2_fs_btree_write_buffer_init(c) ?:
|
|
+ bch2_fs_fs_io_buffered_init(c) ?:
|
|
+ bch2_fs_io_write_init(c) ?:
|
|
+ bch2_fs_journal_init(&c->journal);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ set_bit(BCH_FS_rw_init_done, &c->flags);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
|
|
+ bch_sb_handles *sbs)
|
|
{
|
|
struct bch_fs *c;
|
|
struct printbuf name = PRINTBUF;
|
|
@@ -755,7 +849,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
goto out;
|
|
}
|
|
|
|
- c->stdio = (void *)(unsigned long) opts.stdio;
|
|
+ c->stdio = (void *)(unsigned long) opts->stdio;
|
|
|
|
__module_get(THIS_MODULE);
|
|
|
|
@@ -779,24 +873,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
|
|
refcount_set(&c->ro_ref, 1);
|
|
init_waitqueue_head(&c->ro_ref_wait);
|
|
- spin_lock_init(&c->recovery_pass_lock);
|
|
- sema_init(&c->online_fsck_mutex, 1);
|
|
|
|
for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
|
bch2_time_stats_init(&c->times[i]);
|
|
|
|
- bch2_fs_copygc_init(c);
|
|
- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
|
|
- bch2_fs_btree_iter_init_early(c);
|
|
- bch2_fs_btree_interior_update_init_early(c);
|
|
- bch2_fs_journal_keys_init(c);
|
|
bch2_fs_allocator_background_init(c);
|
|
bch2_fs_allocator_foreground_init(c);
|
|
- bch2_fs_rebalance_init(c);
|
|
- bch2_fs_quota_init(c);
|
|
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
|
|
+ bch2_fs_btree_gc_init_early(c);
|
|
+ bch2_fs_btree_interior_update_init_early(c);
|
|
+ bch2_fs_btree_iter_init_early(c);
|
|
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
|
|
+ bch2_fs_btree_write_buffer_init_early(c);
|
|
+ bch2_fs_copygc_init(c);
|
|
bch2_fs_ec_init_early(c);
|
|
+ bch2_fs_journal_init_early(&c->journal);
|
|
+ bch2_fs_journal_keys_init(c);
|
|
bch2_fs_move_init(c);
|
|
+ bch2_fs_nocow_locking_init_early(c);
|
|
+ bch2_fs_quota_init(c);
|
|
+ bch2_fs_recovery_passes_init(c);
|
|
bch2_fs_sb_errors_init_early(c);
|
|
+ bch2_fs_snapshots_init_early(c);
|
|
+ bch2_fs_subvolumes_init_early(c);
|
|
|
|
INIT_LIST_HEAD(&c->list);
|
|
|
|
@@ -822,8 +921,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
|
|
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
|
|
|
|
- bch2_fs_btree_cache_init_early(&c->btree_cache);
|
|
-
|
|
mutex_init(&c->sectors_available_lock);
|
|
|
|
ret = percpu_init_rwsem(&c->mark_lock);
|
|
@@ -837,14 +934,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
if (ret)
|
|
goto err;
|
|
|
|
- pr_uuid(&name, c->sb.user_uuid.b);
|
|
- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
|
|
- if (ret)
|
|
- goto err;
|
|
-
|
|
- strscpy(c->name, name.buf, sizeof(c->name));
|
|
- printbuf_exit(&name);
|
|
-
|
|
/* Compat: */
|
|
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
|
|
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
|
|
@@ -859,7 +948,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
if (ret)
|
|
goto err;
|
|
|
|
- bch2_opts_apply(&c->opts, opts);
|
|
+ bch2_opts_apply(&c->opts, *opts);
|
|
|
|
c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
|
|
if (c->opts.inodes_use_key_cache)
|
|
@@ -875,26 +964,26 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
goto err;
|
|
}
|
|
|
|
+ if (c->sb.multi_device)
|
|
+ pr_uuid(&name, c->sb.user_uuid.b);
|
|
+ else
|
|
+ prt_bdevname(&name, sbs->data[0].bdev);
|
|
+
|
|
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ strscpy(c->name, name.buf, sizeof(c->name));
|
|
+ printbuf_exit(&name);
|
|
+
|
|
iter_size = sizeof(struct sort_iter) +
|
|
(btree_blocks(c) + 1) * 2 *
|
|
sizeof(struct sort_iter_set);
|
|
|
|
- if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
|
|
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
|
|
- !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
|
|
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
|
- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
|
|
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
|
|
- !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
|
|
+ if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
|
|
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
|
|
- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
|
|
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
|
- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
|
|
- WQ_FREEZABLE, 0)) ||
|
|
-#ifndef BCH_WRITE_REF_DEBUG
|
|
- percpu_ref_init(&c->writes, bch2_writes_disabled,
|
|
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
|
|
-#endif
|
|
+ enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
|
|
+ bch2_writes_disabled) ||
|
|
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
|
|
bioset_init(&c->btree_bio, 1,
|
|
max(offsetof(struct btree_read_bio, bio),
|
|
@@ -910,32 +999,50 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
goto err;
|
|
}
|
|
|
|
- ret = bch2_fs_counters_init(c) ?:
|
|
- bch2_fs_sb_errors_init(c) ?:
|
|
- bch2_io_clock_init(&c->io_clock[READ]) ?:
|
|
- bch2_io_clock_init(&c->io_clock[WRITE]) ?:
|
|
- bch2_fs_journal_init(&c->journal) ?:
|
|
- bch2_fs_btree_iter_init(c) ?:
|
|
+ ret =
|
|
+ bch2_fs_async_obj_init(c) ?:
|
|
bch2_fs_btree_cache_init(c) ?:
|
|
+ bch2_fs_btree_iter_init(c) ?:
|
|
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
|
|
- bch2_fs_btree_interior_update_init(c) ?:
|
|
- bch2_fs_btree_gc_init(c) ?:
|
|
bch2_fs_buckets_waiting_for_journal_init(c) ?:
|
|
- bch2_fs_btree_write_buffer_init(c) ?:
|
|
- bch2_fs_subvolumes_init(c) ?:
|
|
- bch2_fs_io_read_init(c) ?:
|
|
- bch2_fs_io_write_init(c) ?:
|
|
- bch2_fs_nocow_locking_init(c) ?:
|
|
- bch2_fs_encryption_init(c) ?:
|
|
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
|
|
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
|
|
bch2_fs_compress_init(c) ?:
|
|
+ bch2_fs_counters_init(c) ?:
|
|
bch2_fs_ec_init(c) ?:
|
|
- bch2_fs_vfs_init(c) ?:
|
|
+ bch2_fs_encryption_init(c) ?:
|
|
bch2_fs_fsio_init(c) ?:
|
|
- bch2_fs_fs_io_buffered_init(c) ?:
|
|
- bch2_fs_fs_io_direct_init(c);
|
|
+ bch2_fs_fs_io_direct_init(c) ?:
|
|
+ bch2_fs_io_read_init(c) ?:
|
|
+ bch2_fs_rebalance_init(c) ?:
|
|
+ bch2_fs_sb_errors_init(c) ?:
|
|
+ bch2_fs_vfs_init(c);
|
|
if (ret)
|
|
goto err;
|
|
|
|
+#ifdef CONFIG_UNICODE
|
|
+ /* Default encoding until we can potentially have more as an option. */
|
|
+ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
|
|
+ if (IS_ERR(c->cf_encoding)) {
|
|
+ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
|
|
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
|
|
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
|
|
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
|
|
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
|
|
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
|
|
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
|
|
+#else
|
|
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
|
|
+ printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+#endif
|
|
+
|
|
for (i = 0; i < c->sb.nr_devices; i++) {
|
|
if (!bch2_member_exists(c->disk_sb.sb, i))
|
|
continue;
|
|
@@ -975,12 +1082,6 @@ static void print_mount_opts(struct bch_fs *c)
|
|
prt_str(&p, "starting version ");
|
|
bch2_version_to_text(&p, c->sb.version);
|
|
|
|
- if (c->opts.read_only) {
|
|
- prt_str(&p, " opts=");
|
|
- first = false;
|
|
- prt_printf(&p, "ro");
|
|
- }
|
|
-
|
|
for (i = 0; i < bch2_opts_nr; i++) {
|
|
const struct bch_option *opt = &bch2_opt_table[i];
|
|
u64 v = bch2_opt_get_by_id(&c->opts, i);
|
|
@@ -996,45 +1097,102 @@ static void print_mount_opts(struct bch_fs *c)
|
|
bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
|
|
}
|
|
|
|
+ if (c->sb.version_incompat_allowed != c->sb.version) {
|
|
+ prt_printf(&p, "\n allowing incompatible features above ");
|
|
+ bch2_version_to_text(&p, c->sb.version_incompat_allowed);
|
|
+ }
|
|
+
|
|
+ if (c->opts.verbose) {
|
|
+ prt_printf(&p, "\n features: ");
|
|
+ prt_bitflags(&p, bch2_sb_features, c->sb.features);
|
|
+ }
|
|
+
|
|
bch_info(c, "%s", p.buf);
|
|
printbuf_exit(&p);
|
|
}
|
|
|
|
+static bool bch2_fs_may_start(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned flags = 0;
|
|
+
|
|
+ switch (c->opts.degraded) {
|
|
+ case BCH_DEGRADED_very:
|
|
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
|
|
+ break;
|
|
+ case BCH_DEGRADED_yes:
|
|
+ flags |= BCH_FORCE_IF_DEGRADED;
|
|
+ break;
|
|
+ default:
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
|
|
+ if (!bch2_member_exists(c->disk_sb.sb, i))
|
|
+ continue;
|
|
+
|
|
+ ca = bch2_dev_locked(c, i);
|
|
+
|
|
+ if (!bch2_dev_is_online(ca) &&
|
|
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return bch2_have_enough_devs(c, c->online_devs, flags, true);
|
|
+}
|
|
+
|
|
int bch2_fs_start(struct bch_fs *c)
|
|
{
|
|
time64_t now = ktime_get_real_seconds();
|
|
- int ret;
|
|
+ int ret = 0;
|
|
|
|
print_mount_opts(c);
|
|
|
|
+ if (!bch2_fs_may_start(c))
|
|
+ return -BCH_ERR_insufficient_devices_to_start;
|
|
+
|
|
down_write(&c->state_lock);
|
|
+ mutex_lock(&c->sb_lock);
|
|
|
|
BUG_ON(test_bit(BCH_FS_started, &c->flags));
|
|
|
|
- mutex_lock(&c->sb_lock);
|
|
+ if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
|
|
+ sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ up_write(&c->state_lock);
|
|
+ ret = -BCH_ERR_ENOSPC_sb;
|
|
+ goto err;
|
|
+ }
|
|
|
|
ret = bch2_sb_members_v2_init(c);
|
|
if (ret) {
|
|
mutex_unlock(&c->sb_lock);
|
|
+ up_write(&c->state_lock);
|
|
goto err;
|
|
}
|
|
|
|
- for_each_online_member(c, ca)
|
|
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
|
|
+ rcu_read_lock();
|
|
+ for_each_online_member_rcu(c, ca)
|
|
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
|
|
+ cpu_to_le64(now);
|
|
+ rcu_read_unlock();
|
|
|
|
- struct bch_sb_field_ext *ext =
|
|
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
|
|
+ /*
|
|
+ * Dno't write superblock yet: recovery might have to downgrade
|
|
+ */
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
- if (!ext) {
|
|
- bch_err(c, "insufficient space in superblock for sb_field_ext");
|
|
- ret = -BCH_ERR_ENOSPC_sb;
|
|
- goto err;
|
|
- }
|
|
-
|
|
- for_each_rw_member(c, ca)
|
|
- bch2_dev_allocator_add(c, ca);
|
|
+ rcu_read_lock();
|
|
+ for_each_online_member_rcu(c, ca)
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
|
|
+ bch2_dev_allocator_add(c, ca);
|
|
+ rcu_read_unlock();
|
|
bch2_recalc_capacity(c);
|
|
+ up_write(&c->state_lock);
|
|
|
|
c->recovery_task = current;
|
|
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
|
|
@@ -1045,35 +1203,30 @@ int bch2_fs_start(struct bch_fs *c)
|
|
if (ret)
|
|
goto err;
|
|
|
|
- ret = bch2_opts_check_may_set(c);
|
|
+ ret = bch2_opts_hooks_pre_set(c);
|
|
if (ret)
|
|
goto err;
|
|
|
|
if (bch2_fs_init_fault("fs_start")) {
|
|
- bch_err(c, "fs_start fault injected");
|
|
- ret = -EINVAL;
|
|
+ ret = -BCH_ERR_injected_fs_start;
|
|
goto err;
|
|
}
|
|
|
|
set_bit(BCH_FS_started, &c->flags);
|
|
+ wake_up(&c->ro_ref_wait);
|
|
|
|
- if (c->opts.read_only) {
|
|
+ down_write(&c->state_lock);
|
|
+ if (c->opts.read_only)
|
|
bch2_fs_read_only(c);
|
|
- } else {
|
|
- ret = !test_bit(BCH_FS_rw, &c->flags)
|
|
- ? bch2_fs_read_write(c)
|
|
- : bch2_fs_read_write_late(c);
|
|
- if (ret)
|
|
- goto err;
|
|
- }
|
|
+ else if (!test_bit(BCH_FS_rw, &c->flags))
|
|
+ ret = bch2_fs_read_write(c);
|
|
+ up_write(&c->state_lock);
|
|
|
|
- ret = 0;
|
|
err:
|
|
if (ret)
|
|
bch_err_msg(c, ret, "starting filesystem");
|
|
else
|
|
bch_verbose(c, "done starting filesystem");
|
|
- up_write(&c->state_lock);
|
|
return ret;
|
|
}
|
|
|
|
@@ -1182,6 +1335,18 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
|
|
|
|
/* Device startup/shutdown: */
|
|
|
|
+static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
|
|
+{
|
|
+ if (rw == READ)
|
|
+ clear_bit(ca->dev_idx, ca->fs->online_devs.d);
|
|
+
|
|
+ if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
|
|
+ enumerated_ref_stop(&ca->io_ref[rw],
|
|
+ rw == READ
|
|
+ ? bch2_dev_read_refs
|
|
+ : bch2_dev_write_refs);
|
|
+}
|
|
+
|
|
static void bch2_dev_release(struct kobject *kobj)
|
|
{
|
|
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
|
|
@@ -1191,6 +1356,9 @@ static void bch2_dev_release(struct kobject *kobj)
|
|
|
|
static void bch2_dev_free(struct bch_dev *ca)
|
|
{
|
|
+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
|
|
+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
|
|
+
|
|
cancel_work_sync(&ca->io_error_work);
|
|
|
|
bch2_dev_unlink(ca);
|
|
@@ -1198,6 +1366,9 @@ static void bch2_dev_free(struct bch_dev *ca)
|
|
if (ca->kobj.state_in_sysfs)
|
|
kobject_del(&ca->kobj);
|
|
|
|
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
|
|
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
|
|
+
|
|
bch2_free_super(&ca->disk_sb);
|
|
bch2_dev_allocator_background_exit(ca);
|
|
bch2_dev_journal_exit(ca);
|
|
@@ -1209,7 +1380,8 @@ static void bch2_dev_free(struct bch_dev *ca)
|
|
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
|
|
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
|
|
|
|
- percpu_ref_exit(&ca->io_ref);
|
|
+ enumerated_ref_exit(&ca->io_ref[WRITE]);
|
|
+ enumerated_ref_exit(&ca->io_ref[READ]);
|
|
#ifndef CONFIG_BCACHEFS_DEBUG
|
|
percpu_ref_exit(&ca->ref);
|
|
#endif
|
|
@@ -1221,14 +1393,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
|
|
|
|
lockdep_assert_held(&c->state_lock);
|
|
|
|
- if (percpu_ref_is_zero(&ca->io_ref))
|
|
+ if (enumerated_ref_is_zero(&ca->io_ref[READ]))
|
|
return;
|
|
|
|
__bch2_dev_read_only(c, ca);
|
|
|
|
- reinit_completion(&ca->io_ref_completion);
|
|
- percpu_ref_kill(&ca->io_ref);
|
|
- wait_for_completion(&ca->io_ref_completion);
|
|
+ bch2_dev_io_ref_stop(ca, READ);
|
|
|
|
bch2_dev_unlink(ca);
|
|
|
|
@@ -1245,13 +1415,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref)
|
|
}
|
|
#endif
|
|
|
|
-static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
|
|
-{
|
|
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
|
|
-
|
|
- complete(&ca->io_ref_completion);
|
|
-}
|
|
-
|
|
static void bch2_dev_unlink(struct bch_dev *ca)
|
|
{
|
|
struct kobject *b;
|
|
@@ -1280,8 +1443,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
|
|
return 0;
|
|
|
|
if (!ca->kobj.state_in_sysfs) {
|
|
- ret = kobject_add(&ca->kobj, &c->kobj,
|
|
- "dev-%u", ca->dev_idx);
|
|
+ ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
|
|
+ bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -1313,7 +1476,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
|
|
|
|
kobject_init(&ca->kobj, &bch2_dev_ktype);
|
|
init_completion(&ca->ref_completion);
|
|
- init_completion(&ca->io_ref_completion);
|
|
|
|
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
|
|
|
|
@@ -1337,10 +1499,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
|
|
atomic_long_set(&ca->ref, 1);
|
|
#endif
|
|
|
|
+ mutex_init(&ca->bucket_backpointer_mismatch.lock);
|
|
+ mutex_init(&ca->bucket_backpointer_empty.lock);
|
|
+
|
|
bch2_dev_allocator_background_init(ca);
|
|
|
|
- if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
|
|
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
|
|
+ if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
|
|
+ enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
|
|
!(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
|
|
bch2_dev_buckets_alloc(c, ca) ||
|
|
!(ca->io_done = alloc_percpu(*ca->io_done)))
|
|
@@ -1357,7 +1522,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
|
|
{
|
|
ca->dev_idx = dev_idx;
|
|
__set_bit(ca->dev_idx, ca->self.d);
|
|
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
|
|
+
|
|
+ if (!ca->name[0])
|
|
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
|
|
|
|
ca->fs = c;
|
|
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
|
|
@@ -1402,19 +1569,32 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
|
|
return -BCH_ERR_device_size_too_small;
|
|
}
|
|
|
|
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
|
|
+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
|
|
+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
|
|
|
|
ret = bch2_dev_journal_init(ca, sb->sb);
|
|
if (ret)
|
|
return ret;
|
|
|
|
+ struct printbuf name = PRINTBUF;
|
|
+ prt_bdevname(&name, sb->bdev);
|
|
+ strscpy(ca->name, name.buf, sizeof(ca->name));
|
|
+ printbuf_exit(&name);
|
|
+
|
|
/* Commit: */
|
|
ca->disk_sb = *sb;
|
|
memset(sb, 0, sizeof(*sb));
|
|
|
|
+ /*
|
|
+ * Stash pointer to the filesystem for blk_holder_ops - note that once
|
|
+ * attached to a filesystem, we will always close the block device
|
|
+ * before tearing down the filesystem object.
|
|
+ */
|
|
+ ca->disk_sb.holder->c = ca->fs;
|
|
+
|
|
ca->dev = ca->disk_sb.bdev->bd_dev;
|
|
|
|
- percpu_ref_reinit(&ca->io_ref);
|
|
+ enumerated_ref_start(&ca->io_ref[READ]);
|
|
|
|
return 0;
|
|
}
|
|
@@ -1438,18 +1618,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
|
|
if (ret)
|
|
return ret;
|
|
|
|
- bch2_dev_sysfs_online(c, ca);
|
|
-
|
|
- struct printbuf name = PRINTBUF;
|
|
- prt_bdevname(&name, ca->disk_sb.bdev);
|
|
-
|
|
- if (c->sb.nr_devices == 1)
|
|
- strscpy(c->name, name.buf, sizeof(c->name));
|
|
- strscpy(ca->name, name.buf, sizeof(ca->name));
|
|
+ set_bit(ca->dev_idx, c->online_devs.d);
|
|
|
|
- printbuf_exit(&name);
|
|
+ bch2_dev_sysfs_online(c, ca);
|
|
|
|
- rebalance_wakeup(c);
|
|
+ bch2_rebalance_wakeup(c);
|
|
return 0;
|
|
}
|
|
|
|
@@ -1499,7 +1672,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
|
|
return true;
|
|
|
|
/* do we have enough devices to read from? */
|
|
- new_online_devs = bch2_online_devs(c);
|
|
+ new_online_devs = c->online_devs;
|
|
__clear_bit(ca->dev_idx, new_online_devs.d);
|
|
|
|
return bch2_have_enough_devs(c, new_online_devs, flags, false);
|
|
@@ -1508,42 +1681,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
|
|
}
|
|
}
|
|
|
|
-static bool bch2_fs_may_start(struct bch_fs *c)
|
|
-{
|
|
- struct bch_dev *ca;
|
|
- unsigned i, flags = 0;
|
|
-
|
|
- if (c->opts.very_degraded)
|
|
- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
|
|
-
|
|
- if (c->opts.degraded)
|
|
- flags |= BCH_FORCE_IF_DEGRADED;
|
|
-
|
|
- if (!c->opts.degraded &&
|
|
- !c->opts.very_degraded) {
|
|
- mutex_lock(&c->sb_lock);
|
|
-
|
|
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
|
|
- if (!bch2_member_exists(c->disk_sb.sb, i))
|
|
- continue;
|
|
-
|
|
- ca = bch2_dev_locked(c, i);
|
|
-
|
|
- if (!bch2_dev_is_online(ca) &&
|
|
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
- ca->mi.state == BCH_MEMBER_STATE_ro)) {
|
|
- mutex_unlock(&c->sb_lock);
|
|
- return false;
|
|
- }
|
|
- }
|
|
- mutex_unlock(&c->sb_lock);
|
|
- }
|
|
-
|
|
- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
|
|
-}
|
|
-
|
|
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
|
|
{
|
|
+ bch2_dev_io_ref_stop(ca, WRITE);
|
|
+
|
|
/*
|
|
* The allocator thread itself allocates btree nodes, so stop it first:
|
|
*/
|
|
@@ -1560,6 +1701,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
|
|
|
|
bch2_dev_allocator_add(c, ca);
|
|
bch2_recalc_capacity(c);
|
|
+
|
|
+ if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
|
|
+ enumerated_ref_start(&ca->io_ref[WRITE]);
|
|
+
|
|
bch2_dev_do_discards(ca);
|
|
}
|
|
|
|
@@ -1589,7 +1734,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
|
|
if (new_state == BCH_MEMBER_STATE_rw)
|
|
__bch2_dev_read_write(c, ca);
|
|
|
|
- rebalance_wakeup(c);
|
|
+ bch2_rebalance_wakeup(c);
|
|
|
|
return ret;
|
|
}
|
|
@@ -1612,6 +1757,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
{
|
|
struct bch_member *m;
|
|
unsigned dev_idx = ca->dev_idx, data;
|
|
+ bool fast_device_removal = !bch2_request_incompat_feature(c,
|
|
+ bcachefs_metadata_version_fast_device_removal);
|
|
int ret;
|
|
|
|
down_write(&c->state_lock);
|
|
@@ -1630,11 +1777,25 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
|
|
__bch2_dev_read_only(c, ca);
|
|
|
|
- ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
|
|
- bch_err_msg(ca, ret, "bch2_dev_data_drop()");
|
|
+ ret = fast_device_removal
|
|
+ ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
|
|
+ : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
|
|
+ bch2_dev_remove_stripes(c, ca->dev_idx, flags));
|
|
if (ret)
|
|
goto err;
|
|
|
|
+ /* Check if device still has data before blowing away alloc info */
|
|
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
|
|
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
|
|
+ if (!data_type_is_empty(i) &&
|
|
+ !data_type_is_hidden(i) &&
|
|
+ usage.buckets[i]) {
|
|
+ bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
|
|
+ __bch2_data_types[i], usage.buckets[i]);
|
|
+ ret = -EBUSY;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
ret = bch2_dev_remove_alloc(c, ca);
|
|
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
|
|
if (ret)
|
|
@@ -1698,7 +1859,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
*/
|
|
mutex_lock(&c->sb_lock);
|
|
m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
|
|
- memset(&m->uuid, 0, sizeof(m->uuid));
|
|
+
|
|
+ if (fast_device_removal)
|
|
+ m->uuid = BCH_SB_MEMBER_DELETED_UUID;
|
|
+ else
|
|
+ memset(&m->uuid, 0, sizeof(m->uuid));
|
|
|
|
bch2_write_super(c);
|
|
|
|
@@ -1706,8 +1871,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
up_write(&c->state_lock);
|
|
return 0;
|
|
err:
|
|
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
|
|
- !percpu_ref_is_zero(&ca->io_ref))
|
|
+ if (test_bit(BCH_FS_rw, &c->flags) &&
|
|
+ ca->mi.state == BCH_MEMBER_STATE_rw &&
|
|
+ !enumerated_ref_is_zero(&ca->io_ref[READ]))
|
|
__bch2_dev_read_write(c, ca);
|
|
up_write(&c->state_lock);
|
|
return ret;
|
|
@@ -1717,11 +1883,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
int bch2_dev_add(struct bch_fs *c, const char *path)
|
|
{
|
|
struct bch_opts opts = bch2_opts_empty();
|
|
- struct bch_sb_handle sb;
|
|
+ struct bch_sb_handle sb = {};
|
|
struct bch_dev *ca = NULL;
|
|
struct printbuf errbuf = PRINTBUF;
|
|
struct printbuf label = PRINTBUF;
|
|
- int ret;
|
|
+ int ret = 0;
|
|
|
|
ret = bch2_read_super(path, &opts, &sb);
|
|
bch_err_msg(c, ret, "reading super");
|
|
@@ -1738,6 +1904,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
|
|
}
|
|
}
|
|
|
|
+ if (list_empty(&c->list)) {
|
|
+ mutex_lock(&bch_fs_list_lock);
|
|
+ if (__bch2_uuid_to_fs(c->sb.uuid))
|
|
+ ret = -BCH_ERR_filesystem_uuid_already_open;
|
|
+ else
|
|
+ list_add(&c->list, &bch_fs_list);
|
|
+ mutex_unlock(&bch_fs_list_lock);
|
|
+
|
|
+ if (ret) {
|
|
+ bch_err(c, "filesystem UUID already open");
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
ret = bch2_dev_may_add(sb.sb, c);
|
|
if (ret)
|
|
goto err;
|
|
@@ -1754,6 +1934,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
|
|
|
|
down_write(&c->state_lock);
|
|
mutex_lock(&c->sb_lock);
|
|
+ SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
|
|
|
|
ret = bch2_sb_from_fs(c, ca);
|
|
bch_err_msg(c, ret, "setting up new superblock");
|
|
@@ -1769,6 +1950,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
|
|
goto err_unlock;
|
|
}
|
|
unsigned dev_idx = ret;
|
|
+ ret = 0;
|
|
|
|
/* success: */
|
|
|
|
@@ -1788,27 +1970,29 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
|
|
bch2_write_super(c);
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
- ret = bch2_dev_usage_init(ca, false);
|
|
- if (ret)
|
|
- goto err_late;
|
|
+ if (test_bit(BCH_FS_started, &c->flags)) {
|
|
+ ret = bch2_dev_usage_init(ca, false);
|
|
+ if (ret)
|
|
+ goto err_late;
|
|
|
|
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
|
|
- bch_err_msg(ca, ret, "marking new superblock");
|
|
- if (ret)
|
|
- goto err_late;
|
|
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
|
|
+ bch_err_msg(ca, ret, "marking new superblock");
|
|
+ if (ret)
|
|
+ goto err_late;
|
|
|
|
- ret = bch2_fs_freespace_init(c);
|
|
- bch_err_msg(ca, ret, "initializing free space");
|
|
- if (ret)
|
|
- goto err_late;
|
|
+ ret = bch2_fs_freespace_init(c);
|
|
+ bch_err_msg(ca, ret, "initializing free space");
|
|
+ if (ret)
|
|
+ goto err_late;
|
|
|
|
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
|
|
- __bch2_dev_read_write(c, ca);
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
|
|
+ __bch2_dev_read_write(c, ca);
|
|
|
|
- ret = bch2_dev_journal_alloc(ca, false);
|
|
- bch_err_msg(c, ret, "allocating journal");
|
|
- if (ret)
|
|
- goto err_late;
|
|
+ ret = bch2_dev_journal_alloc(ca, false);
|
|
+ bch_err_msg(c, ret, "allocating journal");
|
|
+ if (ret)
|
|
+ goto err_late;
|
|
+ }
|
|
|
|
up_write(&c->state_lock);
|
|
out:
|
|
@@ -1919,6 +2103,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
return 0;
|
|
}
|
|
|
|
+static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
|
|
+{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
|
|
+
|
|
+ return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
|
|
+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
|
|
+ .dev = ca->dev_idx,
|
|
+ .data_type = BCH_DATA_free)) ?:
|
|
+ bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
|
|
+}
|
|
+
|
|
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
|
{
|
|
struct bch_member *m;
|
|
@@ -1966,16 +2162,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
if (ca->mi.freespace_initialized) {
|
|
- struct disk_accounting_pos acc = {
|
|
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
|
|
- .dev_data_type.dev = ca->dev_idx,
|
|
- .dev_data_type.data_type = BCH_DATA_free,
|
|
- };
|
|
- u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
|
|
-
|
|
- ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
|
|
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
|
|
- bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
|
|
+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
|
|
if (ret)
|
|
goto err;
|
|
}
|
|
@@ -1986,6 +2173,49 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
|
return ret;
|
|
}
|
|
|
|
+int bch2_fs_resize_on_mount(struct bch_fs *c)
|
|
+{
|
|
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
|
|
+ u64 old_nbuckets = ca->mi.nbuckets;
|
|
+ u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
|
|
+ ca->mi.bucket_size);
|
|
+
|
|
+ if (ca->mi.resize_on_mount &&
|
|
+ new_nbuckets > ca->mi.nbuckets) {
|
|
+ bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
|
|
+ int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
|
|
+ bch_err_fn(ca, ret);
|
|
+ if (ret) {
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_fs_resize_on_mount);
|
|
+ up_write(&c->state_lock);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ struct bch_member *m =
|
|
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
|
|
+ m->nbuckets = cpu_to_le64(new_nbuckets);
|
|
+ SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);
|
|
+
|
|
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ if (ca->mi.freespace_initialized) {
|
|
+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
|
|
+ if (ret) {
|
|
+ enumerated_ref_put(&ca->io_ref[READ],
|
|
+ BCH_DEV_READ_REF_fs_resize_on_mount);
|
|
+ up_write(&c->state_lock);
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/* return with ref on ca->ref: */
|
|
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
|
|
{
|
|
@@ -1998,6 +2228,114 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
|
|
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
|
|
}
|
|
|
|
+/* blk_holder_ops: */
|
|
+
|
|
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
|
|
+ __releases(&bdev->bd_holder_lock)
|
|
+{
|
|
+ struct bch_sb_handle_holder *holder = bdev->bd_holder;
|
|
+ struct bch_fs *c = holder->c;
|
|
+
|
|
+ if (c && !bch2_ro_ref_tryget(c))
|
|
+ c = NULL;
|
|
+
|
|
+ mutex_unlock(&bdev->bd_holder_lock);
|
|
+
|
|
+ if (c)
|
|
+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
|
|
+ return c;
|
|
+}
|
|
+
|
|
+/* returns with ref on ca->ref */
|
|
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
|
|
+{
|
|
+ for_each_member_device(c, ca)
|
|
+ if (ca->disk_sb.bdev == bdev)
|
|
+ return ca;
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
|
|
+{
|
|
+ struct bch_fs *c = bdev_get_fs(bdev);
|
|
+ if (!c)
|
|
+ return;
|
|
+
|
|
+ struct super_block *sb = c->vfs_sb;
|
|
+ if (sb) {
|
|
+ /*
|
|
+ * Not necessary, c->ro_ref guards against the filesystem being
|
|
+ * unmounted - we only take this to avoid a warning in
|
|
+ * sync_filesystem:
|
|
+ */
|
|
+ down_read(&sb->s_umount);
|
|
+ }
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
|
|
+ if (!ca)
|
|
+ goto unlock;
|
|
+
|
|
+ bool dev = bch2_dev_state_allowed(c, ca,
|
|
+ BCH_MEMBER_STATE_failed,
|
|
+ BCH_FORCE_IF_DEGRADED);
|
|
+
|
|
+ if (!dev && sb) {
|
|
+ if (!surprise)
|
|
+ sync_filesystem(sb);
|
|
+ shrink_dcache_sb(sb);
|
|
+ evict_inodes(sb);
|
|
+ }
|
|
+
|
|
+ struct printbuf buf = PRINTBUF;
|
|
+ __bch2_log_msg_start(ca->name, &buf);
|
|
+
|
|
+ prt_printf(&buf, "offline from block layer");
|
|
+
|
|
+ if (dev) {
|
|
+ __bch2_dev_offline(c, ca);
|
|
+ } else {
|
|
+ bch2_journal_flush(&c->journal);
|
|
+ bch2_fs_emergency_read_only2(c, &buf);
|
|
+ }
|
|
+
|
|
+ bch2_print_str(c, KERN_ERR, buf.buf);
|
|
+ printbuf_exit(&buf);
|
|
+
|
|
+ bch2_dev_put(ca);
|
|
+unlock:
|
|
+ if (sb)
|
|
+ up_read(&sb->s_umount);
|
|
+ up_write(&c->state_lock);
|
|
+ bch2_ro_ref_put(c);
|
|
+}
|
|
+
|
|
+static void bch2_fs_bdev_sync(struct block_device *bdev)
|
|
+{
|
|
+ struct bch_fs *c = bdev_get_fs(bdev);
|
|
+ if (!c)
|
|
+ return;
|
|
+
|
|
+ struct super_block *sb = c->vfs_sb;
|
|
+ if (sb) {
|
|
+ /*
|
|
+ * Not necessary, c->ro_ref guards against the filesystem being
|
|
+ * unmounted - we only take this to avoid a warning in
|
|
+ * sync_filesystem:
|
|
+ */
|
|
+ down_read(&sb->s_umount);
|
|
+ sync_filesystem(sb);
|
|
+ up_read(&sb->s_umount);
|
|
+ }
|
|
+
|
|
+ bch2_ro_ref_put(c);
|
|
+}
|
|
+
|
|
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
|
|
+ .mark_dead = bch2_fs_bdev_mark_dead,
|
|
+ .sync = bch2_fs_bdev_sync,
|
|
+};
|
|
+
|
|
/* Filesystem open: */
|
|
|
|
static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
|
|
@@ -2006,10 +2344,10 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
|
|
cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
|
|
}
|
|
|
|
-struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
- struct bch_opts opts)
|
|
+struct bch_fs *bch2_fs_open(darray_const_str *devices,
|
|
+ struct bch_opts *opts)
|
|
{
|
|
- DARRAY(struct bch_sb_handle) sbs = { 0 };
|
|
+ bch_sb_handles sbs = {};
|
|
struct bch_fs *c = NULL;
|
|
struct bch_sb_handle *best = NULL;
|
|
struct printbuf errbuf = PRINTBUF;
|
|
@@ -2018,26 +2356,26 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
if (!try_module_get(THIS_MODULE))
|
|
return ERR_PTR(-ENODEV);
|
|
|
|
- if (!nr_devices) {
|
|
+ if (!devices->nr) {
|
|
ret = -EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
- ret = darray_make_room(&sbs, nr_devices);
|
|
+ ret = darray_make_room(&sbs, devices->nr);
|
|
if (ret)
|
|
goto err;
|
|
|
|
- for (unsigned i = 0; i < nr_devices; i++) {
|
|
+ darray_for_each(*devices, i) {
|
|
struct bch_sb_handle sb = { NULL };
|
|
|
|
- ret = bch2_read_super(devices[i], &opts, &sb);
|
|
+ ret = bch2_read_super(*i, opts, &sb);
|
|
if (ret)
|
|
goto err;
|
|
|
|
BUG_ON(darray_push(&sbs, sb));
|
|
}
|
|
|
|
- if (opts.nochanges && !opts.read_only) {
|
|
+ if (opts->nochanges && !opts->read_only) {
|
|
ret = -BCH_ERR_erofs_nochanges;
|
|
goto err_print;
|
|
}
|
|
@@ -2047,7 +2385,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
best = sb;
|
|
|
|
darray_for_each_reverse(sbs, sb) {
|
|
- ret = bch2_dev_in_fs(best, sb, &opts);
|
|
+ ret = bch2_dev_in_fs(best, sb, opts);
|
|
|
|
if (ret == -BCH_ERR_device_has_been_removed ||
|
|
ret == -BCH_ERR_device_splitbrain) {
|
|
@@ -2062,7 +2400,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
goto err_print;
|
|
}
|
|
|
|
- c = bch2_fs_alloc(best->sb, opts);
|
|
+ c = bch2_fs_alloc(best->sb, opts, &sbs);
|
|
ret = PTR_ERR_OR_ZERO(c);
|
|
if (ret)
|
|
goto err;
|
|
@@ -2077,11 +2415,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
}
|
|
up_write(&c->state_lock);
|
|
|
|
- if (!bch2_fs_may_start(c)) {
|
|
- ret = -BCH_ERR_insufficient_devices_to_start;
|
|
- goto err_print;
|
|
- }
|
|
-
|
|
if (!c->opts.nostart) {
|
|
ret = bch2_fs_start(c);
|
|
if (ret)
|
|
@@ -2096,7 +2429,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
return c;
|
|
err_print:
|
|
pr_err("bch_fs_open err opening %s: %s",
|
|
- devices[0], bch2_err_str(ret));
|
|
+ devices->data[0], bch2_err_str(ret));
|
|
err:
|
|
if (!IS_ERR_OR_NULL(c))
|
|
bch2_fs_stop(c);
|
|
@@ -2133,16 +2466,52 @@ static int __init bcachefs_init(void)
|
|
return -ENOMEM;
|
|
}
|
|
|
|
-#define BCH_DEBUG_PARAM(name, description) \
|
|
- bool bch2_##name; \
|
|
- module_param_named(name, bch2_##name, bool, 0644); \
|
|
+#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
|
|
+BCH_DEBUG_PARAMS_ALL()
|
|
+#undef BCH_DEBUG_PARAM
|
|
+
|
|
+static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
|
|
+{
|
|
+ /* Match bool exactly, by re-using it. */
|
|
+ struct static_key *key = kp->arg;
|
|
+ struct kernel_param boolkp = *kp;
|
|
+ bool v;
|
|
+ int ret;
|
|
+
|
|
+ boolkp.arg = &v;
|
|
+
|
|
+ ret = param_set_bool(val, &boolkp);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ if (v)
|
|
+ static_key_enable(key);
|
|
+ else
|
|
+ static_key_disable(key);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
|
|
+{
|
|
+ struct static_key *key = kp->arg;
|
|
+ return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'N' : 'Y');
|
|
+}
|
|
+
|
|
+static const struct kernel_param_ops bch2_param_ops_static_key_t = {
|
|
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
|
|
+ .set = bch2_param_set_static_key_t,
|
|
+ .get = bch2_param_get_static_key_t,
|
|
+};
|
|
+
|
|
+#define BCH_DEBUG_PARAM(name, description) \
|
|
+ module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
|
|
+ __MODULE_PARM_TYPE(name, "static_key_t"); \
|
|
MODULE_PARM_DESC(name, description);
|
|
BCH_DEBUG_PARAMS()
|
|
#undef BCH_DEBUG_PARAM
|
|
|
|
__maybe_unused
|
|
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
|
|
-module_param_named(version, bch2_metadata_version, uint, 0400);
|
|
+module_param_named(version, bch2_metadata_version, uint, 0444);
|
|
|
|
module_exit(bcachefs_exit);
|
|
module_init(bcachefs_init);
|
|
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
|
|
index 04f8287eff5c..dc52f06cb2b9 100644
|
|
--- a/fs/bcachefs/super.h
|
|
+++ b/fs/bcachefs/super.h
|
|
@@ -9,6 +9,9 @@
|
|
#include <linux/math64.h>
|
|
|
|
extern const char * const bch2_fs_flag_strs[];
|
|
+extern const char * const bch2_write_refs[];
|
|
+extern const char * const bch2_dev_read_refs[];
|
|
+extern const char * const bch2_dev_write_refs[];
|
|
|
|
struct bch_fs *bch2_dev_to_fs(dev_t);
|
|
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
|
|
@@ -29,17 +32,23 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
|
|
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
|
|
|
|
bool bch2_fs_emergency_read_only(struct bch_fs *);
|
|
+bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
|
|
+
|
|
bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
|
|
void bch2_fs_read_only(struct bch_fs *);
|
|
|
|
int bch2_fs_read_write(struct bch_fs *);
|
|
int bch2_fs_read_write_early(struct bch_fs *);
|
|
|
|
+int bch2_fs_resize_on_mount(struct bch_fs *);
|
|
+
|
|
void __bch2_fs_stop(struct bch_fs *);
|
|
void bch2_fs_free(struct bch_fs *);
|
|
void bch2_fs_stop(struct bch_fs *);
|
|
|
|
int bch2_fs_start(struct bch_fs *);
|
|
-struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
|
|
+struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
|
|
+
|
|
+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
|
|
|
|
#endif /* _BCACHEFS_SUPER_H */
|
|
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
|
|
index 368a63d938cf..3a899f799d1d 100644
|
|
--- a/fs/bcachefs/super_types.h
|
|
+++ b/fs/bcachefs/super_types.h
|
|
@@ -2,13 +2,19 @@
|
|
#ifndef _BCACHEFS_SUPER_TYPES_H
|
|
#define _BCACHEFS_SUPER_TYPES_H
|
|
|
|
+struct bch_fs;
|
|
+
|
|
+struct bch_sb_handle_holder {
|
|
+ struct bch_fs *c;
|
|
+};
|
|
+
|
|
struct bch_sb_handle {
|
|
struct bch_sb *sb;
|
|
struct file *s_bdev_file;
|
|
struct block_device *bdev;
|
|
char *sb_name;
|
|
struct bio *bio;
|
|
- void *holder;
|
|
+ struct bch_sb_handle_holder *holder;
|
|
size_t buffer_size;
|
|
blk_mode_t mode;
|
|
unsigned have_layout:1;
|
|
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
|
|
index a7eb1f511484..0101eb025117 100644
|
|
--- a/fs/bcachefs/sysfs.c
|
|
+++ b/fs/bcachefs/sysfs.c
|
|
@@ -25,6 +25,7 @@
|
|
#include "disk_accounting.h"
|
|
#include "disk_groups.h"
|
|
#include "ec.h"
|
|
+#include "enumerated_ref.h"
|
|
#include "inode.h"
|
|
#include "journal.h"
|
|
#include "journal_reclaim.h"
|
|
@@ -34,12 +35,14 @@
|
|
#include "nocow_locking.h"
|
|
#include "opts.h"
|
|
#include "rebalance.h"
|
|
+#include "recovery_passes.h"
|
|
#include "replicas.h"
|
|
#include "super-io.h"
|
|
#include "tests.h"
|
|
|
|
#include <linux/blkdev.h>
|
|
#include <linux/sort.h>
|
|
+#include <linux/string_choices.h>
|
|
#include <linux/sched/clock.h>
|
|
|
|
#include "util.h"
|
|
@@ -145,16 +148,18 @@ write_attribute(trigger_journal_flush);
|
|
write_attribute(trigger_journal_writes);
|
|
write_attribute(trigger_btree_cache_shrink);
|
|
write_attribute(trigger_btree_key_cache_shrink);
|
|
+write_attribute(trigger_btree_updates);
|
|
write_attribute(trigger_freelist_wakeup);
|
|
+write_attribute(trigger_recalc_capacity);
|
|
+write_attribute(trigger_delete_dead_snapshots);
|
|
read_attribute(gc_gens_pos);
|
|
+__sysfs_attribute(read_fua_test, 0400);
|
|
|
|
read_attribute(uuid);
|
|
read_attribute(minor);
|
|
read_attribute(flags);
|
|
-read_attribute(bucket_size);
|
|
read_attribute(first_bucket);
|
|
read_attribute(nbuckets);
|
|
-rw_attribute(durability);
|
|
read_attribute(io_done);
|
|
read_attribute(io_errors);
|
|
write_attribute(io_errors_reset);
|
|
@@ -173,31 +178,13 @@ read_attribute(journal_debug);
|
|
read_attribute(btree_cache);
|
|
read_attribute(btree_key_cache);
|
|
read_attribute(btree_reserve_cache);
|
|
-read_attribute(stripes_heap);
|
|
read_attribute(open_buckets);
|
|
read_attribute(open_buckets_partial);
|
|
-read_attribute(write_points);
|
|
read_attribute(nocow_lock_table);
|
|
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
+read_attribute(read_refs);
|
|
read_attribute(write_refs);
|
|
|
|
-static const char * const bch2_write_refs[] = {
|
|
-#define x(n) #n,
|
|
- BCH_WRITE_REFS()
|
|
-#undef x
|
|
- NULL
|
|
-};
|
|
-
|
|
-static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
|
|
-{
|
|
- bch2_printbuf_tabstop_push(out, 24);
|
|
-
|
|
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
|
|
- prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
|
|
-}
|
|
-#endif
|
|
-
|
|
read_attribute(internal_uuid);
|
|
read_attribute(disk_groups);
|
|
|
|
@@ -209,14 +196,14 @@ read_attribute(usage_base);
|
|
BCH_PERSISTENT_COUNTERS()
|
|
#undef x
|
|
|
|
-rw_attribute(discard);
|
|
-read_attribute(state);
|
|
rw_attribute(label);
|
|
|
|
read_attribute(copy_gc_wait);
|
|
|
|
sysfs_pd_controller_attribute(rebalance);
|
|
read_attribute(rebalance_status);
|
|
+read_attribute(snapshot_delete_status);
|
|
+read_attribute(recovery_status);
|
|
|
|
read_attribute(new_stripes);
|
|
|
|
@@ -262,10 +249,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
|
|
prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
|
|
|
|
for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
|
|
- struct disk_accounting_pos a = {
|
|
- .type = BCH_DISK_ACCOUNTING_compression,
|
|
- .compression.type = i,
|
|
- };
|
|
+ struct disk_accounting_pos a;
|
|
+ disk_accounting_key_init(a, compression, .type = i);
|
|
struct bpos p = disk_accounting_pos_to_bpos(&a);
|
|
u64 v[3];
|
|
bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
|
|
@@ -315,6 +300,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
|
|
prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
|
|
}
|
|
|
|
+static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ struct bio *bio = NULL;
|
|
+ void *buf = NULL;
|
|
+ unsigned bs = c->opts.block_size, iters;
|
|
+ u64 end, test_duration = NSEC_PER_SEC * 2;
|
|
+ struct bch2_time_stats stats_nofua, stats_fua, stats_random;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_time_stats_init_no_pcpu(&stats_nofua);
|
|
+ bch2_time_stats_init_no_pcpu(&stats_fua);
|
|
+ bch2_time_stats_init_no_pcpu(&stats_random);
|
|
+
|
|
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) {
|
|
+ prt_str(out, "offline\n");
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ struct block_device *bdev = ca->disk_sb.bdev;
|
|
+
|
|
+ bio = bio_kmalloc(1, GFP_KERNEL);
|
|
+ if (!bio) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ buf = kmalloc(bs, GFP_KERNEL);
|
|
+ if (!buf)
|
|
+ goto err;
|
|
+
|
|
+ end = ktime_get_ns() + test_duration;
|
|
+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
|
|
+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
|
|
+ bch2_bio_map(bio, buf, bs);
|
|
+
|
|
+ u64 submit_time = ktime_get_ns();
|
|
+ ret = submit_bio_wait(bio);
|
|
+ bch2_time_stats_update(&stats_nofua, submit_time);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ end = ktime_get_ns() + test_duration;
|
|
+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
|
|
+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
|
|
+ bch2_bio_map(bio, buf, bs);
|
|
+
|
|
+ u64 submit_time = ktime_get_ns();
|
|
+ ret = submit_bio_wait(bio);
|
|
+ bch2_time_stats_update(&stats_fua, submit_time);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
|
|
+
|
|
+ end = ktime_get_ns() + test_duration;
|
|
+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
|
|
+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
|
|
+ bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
|
|
+ bch2_bio_map(bio, buf, bs);
|
|
+
|
|
+ u64 submit_time = ktime_get_ns();
|
|
+ ret = submit_bio_wait(bio);
|
|
+ bch2_time_stats_update(&stats_random, submit_time);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats);
|
|
+ u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats);
|
|
+ u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats);
|
|
+
|
|
+ u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats);
|
|
+ u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats);
|
|
+ u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats);
|
|
+
|
|
+ printbuf_tabstop_push(out, 8);
|
|
+ printbuf_tabstop_push(out, 12);
|
|
+ printbuf_tabstop_push(out, 12);
|
|
+ prt_printf(out, "This test must be run on an idle drive for accurate results\n");
|
|
+ prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device));
|
|
+ prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev)));
|
|
+ prt_newline(out);
|
|
+ prt_printf(out, "ns:\tlatency\rstddev\r\n");
|
|
+ prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua);
|
|
+ prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua);
|
|
+ prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand);
|
|
+
|
|
+ bool read_cache = ns_nofua * 2 < ns_rand;
|
|
+ bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;
|
|
+
|
|
+ if (!read_cache)
|
|
+ prt_str(out, "reads don't appear to be cached - safe\n");
|
|
+ else if (!fua_cached)
|
|
+ prt_str(out, "fua reads don't appear to be cached - safe\n");
|
|
+ else
|
|
+ prt_str(out, "fua reads appear to be cached - unsafe\n");
|
|
+err:
|
|
+ kfree(buf);
|
|
+ kfree(bio);
|
|
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test);
|
|
+ bch_err_fn(c, ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
SHOW(bch2_fs)
|
|
{
|
|
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
|
|
@@ -341,6 +436,12 @@ SHOW(bch2_fs)
|
|
if (attr == &sysfs_rebalance_status)
|
|
bch2_rebalance_status_to_text(out, c);
|
|
|
|
+ if (attr == &sysfs_snapshot_delete_status)
|
|
+ bch2_snapshot_delete_status_to_text(out, c);
|
|
+
|
|
+ if (attr == &sysfs_recovery_status)
|
|
+ bch2_recovery_pass_status_to_text(out, c);
|
|
+
|
|
/* Debugging: */
|
|
|
|
if (attr == &sysfs_journal_debug)
|
|
@@ -355,18 +456,12 @@ SHOW(bch2_fs)
|
|
if (attr == &sysfs_btree_reserve_cache)
|
|
bch2_btree_reserve_cache_to_text(out, c);
|
|
|
|
- if (attr == &sysfs_stripes_heap)
|
|
- bch2_stripes_heap_to_text(out, c);
|
|
-
|
|
if (attr == &sysfs_open_buckets)
|
|
bch2_open_buckets_to_text(out, c, NULL);
|
|
|
|
if (attr == &sysfs_open_buckets_partial)
|
|
bch2_open_buckets_partial_to_text(out, c);
|
|
|
|
- if (attr == &sysfs_write_points)
|
|
- bch2_write_points_to_text(out, c);
|
|
-
|
|
if (attr == &sysfs_compression_stats)
|
|
bch2_compression_stats_to_text(out, c);
|
|
|
|
@@ -382,10 +477,8 @@ SHOW(bch2_fs)
|
|
if (attr == &sysfs_moving_ctxts)
|
|
bch2_fs_moving_ctxts_to_text(out, c);
|
|
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
if (attr == &sysfs_write_refs)
|
|
- bch2_write_refs_to_text(out, c);
|
|
-#endif
|
|
+ enumerated_ref_to_text(out, &c->writes, bch2_write_refs);
|
|
|
|
if (attr == &sysfs_nocow_lock_table)
|
|
bch2_nocow_locks_to_text(out, &c->nocow_locks);
|
|
@@ -415,7 +508,10 @@ STORE(bch2_fs)
|
|
|
|
/* Debugging: */
|
|
|
|
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
|
|
+ if (attr == &sysfs_trigger_btree_updates)
|
|
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
|
|
+
|
|
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))
|
|
return -EROFS;
|
|
|
|
if (attr == &sysfs_trigger_btree_cache_shrink) {
|
|
@@ -455,6 +551,15 @@ STORE(bch2_fs)
|
|
if (attr == &sysfs_trigger_freelist_wakeup)
|
|
closure_wake_up(&c->freelist_wait);
|
|
|
|
+ if (attr == &sysfs_trigger_recalc_capacity) {
|
|
+ down_read(&c->state_lock);
|
|
+ bch2_recalc_capacity(c);
|
|
+ up_read(&c->state_lock);
|
|
+ }
|
|
+
|
|
+ if (attr == &sysfs_trigger_delete_dead_snapshots)
|
|
+ __bch2_delete_dead_snapshots(c);
|
|
+
|
|
#ifdef CONFIG_BCACHEFS_TESTS
|
|
if (attr == &sysfs_perf_test) {
|
|
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
|
|
@@ -475,7 +580,7 @@ STORE(bch2_fs)
|
|
size = ret;
|
|
}
|
|
#endif
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
|
|
return size;
|
|
}
|
|
SYSFS_OPS(bch2_fs);
|
|
@@ -486,6 +591,8 @@ struct attribute *bch2_fs_files[] = {
|
|
&sysfs_btree_write_stats,
|
|
|
|
&sysfs_rebalance_status,
|
|
+ &sysfs_snapshot_delete_status,
|
|
+ &sysfs_recovery_status,
|
|
|
|
&sysfs_compression_stats,
|
|
|
|
@@ -566,13 +673,9 @@ struct attribute *bch2_fs_internal_files[] = {
|
|
&sysfs_btree_key_cache,
|
|
&sysfs_btree_reserve_cache,
|
|
&sysfs_new_stripes,
|
|
- &sysfs_stripes_heap,
|
|
&sysfs_open_buckets,
|
|
&sysfs_open_buckets_partial,
|
|
- &sysfs_write_points,
|
|
-#ifdef BCH_WRITE_REF_DEBUG
|
|
&sysfs_write_refs,
|
|
-#endif
|
|
&sysfs_nocow_lock_table,
|
|
&sysfs_io_timers_read,
|
|
&sysfs_io_timers_write,
|
|
@@ -584,7 +687,10 @@ struct attribute *bch2_fs_internal_files[] = {
|
|
&sysfs_trigger_journal_writes,
|
|
&sysfs_trigger_btree_cache_shrink,
|
|
&sysfs_trigger_btree_key_cache_shrink,
|
|
+ &sysfs_trigger_btree_updates,
|
|
&sysfs_trigger_freelist_wakeup,
|
|
+ &sysfs_trigger_recalc_capacity,
|
|
+ &sysfs_trigger_delete_dead_snapshots,
|
|
|
|
&sysfs_gc_gens_pos,
|
|
|
|
@@ -604,87 +710,115 @@ struct attribute *bch2_fs_internal_files[] = {
|
|
|
|
/* options */
|
|
|
|
-SHOW(bch2_fs_opts_dir)
|
|
+static ssize_t sysfs_opt_show(struct bch_fs *c,
|
|
+ struct bch_dev *ca,
|
|
+ enum bch_opt_id id,
|
|
+ struct printbuf *out)
|
|
{
|
|
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
|
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
|
|
- int id = opt - bch2_opt_table;
|
|
- u64 v = bch2_opt_get_by_id(&c->opts, id);
|
|
+ const struct bch_option *opt = bch2_opt_table + id;
|
|
+ u64 v;
|
|
+
|
|
+ if (opt->flags & OPT_FS) {
|
|
+ v = bch2_opt_get_by_id(&c->opts, id);
|
|
+ } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
|
|
+ v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
|
|
+ } else {
|
|
+ return -EINVAL;
|
|
+ }
|
|
|
|
bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
|
|
prt_char(out, '\n');
|
|
-
|
|
return 0;
|
|
}
|
|
|
|
-STORE(bch2_fs_opts_dir)
|
|
+static ssize_t sysfs_opt_store(struct bch_fs *c,
|
|
+ struct bch_dev *ca,
|
|
+ enum bch_opt_id id,
|
|
+ const char *buf, size_t size)
|
|
{
|
|
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
|
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
|
|
- int ret, id = opt - bch2_opt_table;
|
|
- char *tmp;
|
|
- u64 v;
|
|
+ const struct bch_option *opt = bch2_opt_table + id;
|
|
+ int ret = 0;
|
|
|
|
/*
|
|
* We don't need to take c->writes for correctness, but it eliminates an
|
|
* unsightly error message in the dmesg log when we're RO:
|
|
*/
|
|
- if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
|
|
+ if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)))
|
|
return -EROFS;
|
|
|
|
- tmp = kstrdup(buf, GFP_KERNEL);
|
|
+ char *tmp = kstrdup(buf, GFP_KERNEL);
|
|
if (!tmp) {
|
|
ret = -ENOMEM;
|
|
goto err;
|
|
}
|
|
|
|
- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
|
|
+ u64 v;
|
|
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
|
|
+ bch2_opt_hook_pre_set(c, ca, id, v);
|
|
kfree(tmp);
|
|
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
- ret = bch2_opt_check_may_set(c, id, v);
|
|
- if (ret < 0)
|
|
- goto err;
|
|
-
|
|
- bch2_opt_set_sb(c, NULL, opt, v);
|
|
- bch2_opt_set_by_id(&c->opts, id, v);
|
|
-
|
|
- if (v &&
|
|
- (id == Opt_background_target ||
|
|
- id == Opt_background_compression ||
|
|
- (id == Opt_compression && !c->opts.background_compression)))
|
|
- bch2_set_rebalance_needs_scan(c, 0);
|
|
+ bool is_sb = opt->get_sb || opt->get_member;
|
|
+ bool changed = false;
|
|
+
|
|
+ if (is_sb) {
|
|
+ changed = bch2_opt_set_sb(c, ca, opt, v);
|
|
+ } else if (!ca) {
|
|
+ changed = bch2_opt_get_by_id(&c->opts, id) != v;
|
|
+ } else {
|
|
+ /* device options that aren't superblock options aren't
|
|
+ * supported */
|
|
+ BUG();
|
|
+ }
|
|
|
|
- if (v && id == Opt_rebalance_enabled)
|
|
- rebalance_wakeup(c);
|
|
+ if (!ca)
|
|
+ bch2_opt_set_by_id(&c->opts, id, v);
|
|
|
|
- if (v && id == Opt_copygc_enabled &&
|
|
- c->copygc_thread)
|
|
- wake_up_process(c->copygc_thread);
|
|
+ if (changed)
|
|
+ bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
|
|
|
|
ret = size;
|
|
err:
|
|
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
|
|
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
|
|
return ret;
|
|
}
|
|
+
|
|
+SHOW(bch2_fs_opts_dir)
|
|
+{
|
|
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
|
+ int id = bch2_opt_lookup(attr->name);
|
|
+ if (id < 0)
|
|
+ return 0;
|
|
+
|
|
+ return sysfs_opt_show(c, NULL, id, out);
|
|
+}
|
|
+
|
|
+STORE(bch2_fs_opts_dir)
|
|
+{
|
|
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
|
+ int id = bch2_opt_lookup(attr->name);
|
|
+ if (id < 0)
|
|
+ return 0;
|
|
+
|
|
+ return sysfs_opt_store(c, NULL, id, buf, size);
|
|
+}
|
|
SYSFS_OPS(bch2_fs_opts_dir);
|
|
|
|
struct attribute *bch2_fs_opts_dir_files[] = { NULL };
|
|
|
|
-int bch2_opts_create_sysfs_files(struct kobject *kobj)
|
|
+int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
|
|
{
|
|
- const struct bch_option *i;
|
|
- int ret;
|
|
-
|
|
- for (i = bch2_opt_table;
|
|
+ for (const struct bch_option *i = bch2_opt_table;
|
|
i < bch2_opt_table + bch2_opts_nr;
|
|
i++) {
|
|
- if (!(i->flags & OPT_FS))
|
|
+ if (i->flags & OPT_HIDDEN)
|
|
+ continue;
|
|
+ if (!(i->flags & type))
|
|
continue;
|
|
|
|
- ret = sysfs_create_file(kobj, &i->attr);
|
|
+ int ret = sysfs_create_file(kobj, &i->attr);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
@@ -755,11 +889,8 @@ SHOW(bch2_dev)
|
|
|
|
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
|
|
|
|
- sysfs_print(bucket_size, bucket_bytes(ca));
|
|
sysfs_print(first_bucket, ca->mi.first_bucket);
|
|
sysfs_print(nbuckets, ca->mi.nbuckets);
|
|
- sysfs_print(durability, ca->mi.durability);
|
|
- sysfs_print(discard, ca->mi.discard);
|
|
|
|
if (attr == &sysfs_label) {
|
|
if (ca->mi.group)
|
|
@@ -772,11 +903,6 @@ SHOW(bch2_dev)
|
|
prt_char(out, '\n');
|
|
}
|
|
|
|
- if (attr == &sysfs_state) {
|
|
- prt_string_option(out, bch2_member_states, ca->mi.state);
|
|
- prt_char(out, '\n');
|
|
- }
|
|
-
|
|
if (attr == &sysfs_io_done)
|
|
dev_io_done_to_text(out, ca);
|
|
|
|
@@ -802,6 +928,19 @@ SHOW(bch2_dev)
|
|
if (attr == &sysfs_open_buckets)
|
|
bch2_open_buckets_to_text(out, c, ca);
|
|
|
|
+ if (attr == &sysfs_read_fua_test)
|
|
+ return bch2_read_fua_test(out, ca);
|
|
+
|
|
+ int opt_id = bch2_opt_lookup(attr->name);
|
|
+ if (opt_id >= 0)
|
|
+ return sysfs_opt_show(c, ca, opt_id, out);
|
|
+
|
|
+ if (attr == &sysfs_read_refs)
|
|
+ enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs);
|
|
+
|
|
+ if (attr == &sysfs_write_refs)
|
|
+ enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs);
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
@@ -810,18 +949,6 @@ STORE(bch2_dev)
|
|
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
|
|
struct bch_fs *c = ca->fs;
|
|
|
|
- if (attr == &sysfs_discard) {
|
|
- bool v = strtoul_or_return(buf);
|
|
-
|
|
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
|
|
- }
|
|
-
|
|
- if (attr == &sysfs_durability) {
|
|
- u64 v = strtoul_or_return(buf);
|
|
-
|
|
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
|
|
- }
|
|
-
|
|
if (attr == &sysfs_label) {
|
|
char *tmp;
|
|
int ret;
|
|
@@ -839,20 +966,20 @@ STORE(bch2_dev)
|
|
if (attr == &sysfs_io_errors_reset)
|
|
bch2_dev_errors_reset(ca);
|
|
|
|
+ int opt_id = bch2_opt_lookup(attr->name);
|
|
+ if (opt_id >= 0)
|
|
+ return sysfs_opt_store(c, ca, opt_id, buf, size);
|
|
+
|
|
return size;
|
|
}
|
|
SYSFS_OPS(bch2_dev);
|
|
|
|
struct attribute *bch2_dev_files[] = {
|
|
&sysfs_uuid,
|
|
- &sysfs_bucket_size,
|
|
&sysfs_first_bucket,
|
|
&sysfs_nbuckets,
|
|
- &sysfs_durability,
|
|
|
|
/* settings: */
|
|
- &sysfs_discard,
|
|
- &sysfs_state,
|
|
&sysfs_label,
|
|
|
|
&sysfs_has_data,
|
|
@@ -866,9 +993,14 @@ struct attribute *bch2_dev_files[] = {
|
|
&sysfs_io_latency_stats_write,
|
|
&sysfs_congested,
|
|
|
|
+ &sysfs_read_fua_test,
|
|
+
|
|
/* debug: */
|
|
&sysfs_alloc_debug,
|
|
&sysfs_open_buckets,
|
|
+
|
|
+ &sysfs_read_refs,
|
|
+ &sysfs_write_refs,
|
|
NULL
|
|
};
|
|
|
|
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
|
|
index 222cd5062702..303e0433c702 100644
|
|
--- a/fs/bcachefs/sysfs.h
|
|
+++ b/fs/bcachefs/sysfs.h
|
|
@@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
|
|
extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
|
|
extern const struct sysfs_ops bch2_dev_sysfs_ops;
|
|
|
|
-int bch2_opts_create_sysfs_files(struct kobject *);
|
|
+int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
|
|
|
|
#else
|
|
|
|
@@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
|
|
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
|
|
static const struct sysfs_ops bch2_dev_sysfs_ops;
|
|
|
|
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
|
|
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
|
|
+{ return 0; }
|
|
|
|
#endif /* NO_BCACHEFS_SYSFS */
|
|
|
|
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
|
|
index 6c6469814637..782a05fe7656 100644
|
|
--- a/fs/bcachefs/tests.c
|
|
+++ b/fs/bcachefs/tests.c
|
|
@@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
|
|
BTREE_ITER_intent);
|
|
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
- bch2_btree_iter_traverse(&iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_trans_update(trans, &iter, &k.k_i, 0));
|
|
bch_err_msg(c, ret, "update error");
|
|
if (ret)
|
|
@@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
|
|
|
|
pr_info("deleting once");
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
- bch2_btree_iter_traverse(&iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_btree_delete_at(trans, &iter, 0));
|
|
bch_err_msg(c, ret, "delete error (first)");
|
|
if (ret)
|
|
@@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
|
|
|
|
pr_info("deleting twice");
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
- bch2_btree_iter_traverse(&iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_btree_delete_at(trans, &iter, 0));
|
|
bch_err_msg(c, ret, "delete error (second)");
|
|
if (ret)
|
|
@@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
|
|
BTREE_ITER_intent);
|
|
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
- bch2_btree_iter_traverse(&iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_trans_update(trans, &iter, &k.k_i, 0));
|
|
bch_err_msg(c, ret, "update error");
|
|
if (ret)
|
|
@@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
|
|
bch2_journal_flush_all_pins(&c->journal);
|
|
|
|
ret = commit_do(trans, NULL, NULL, 0,
|
|
- bch2_btree_iter_traverse(&iter) ?:
|
|
+ bch2_btree_iter_traverse(trans, &iter) ?:
|
|
bch2_btree_delete_at(trans, &iter, 0));
|
|
bch_err_msg(c, ret, "delete error");
|
|
if (ret)
|
|
@@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
|
|
*/
|
|
static int test_peek_end(struct bch_fs *c, u64 nr)
|
|
{
|
|
+ delete_test_keys(c);
|
|
+
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
struct btree_iter iter;
|
|
struct bkey_s_c k;
|
|
@@ -349,10 +351,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
|
|
SPOS(0, 0, U32_MAX), 0);
|
|
|
|
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
|
|
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
|
|
BUG_ON(k.k);
|
|
|
|
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
|
|
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
|
|
BUG_ON(k.k);
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
@@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
|
|
|
|
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
|
|
{
|
|
+ delete_test_keys(c);
|
|
+
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
struct btree_iter iter;
|
|
struct bkey_s_c k;
|
|
@@ -369,10 +373,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
|
|
SPOS(0, 0, U32_MAX), 0);
|
|
|
|
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
|
|
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
|
|
BUG_ON(k.k);
|
|
|
|
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
|
|
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
|
|
BUG_ON(k.k);
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
@@ -488,7 +492,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
|
|
trans = bch2_trans_get(c);
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
|
|
SPOS(0, 0, snapid_lo), 0);
|
|
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
|
|
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
|
|
|
|
BUG_ON(k.k->p.snapshot != U32_MAX);
|
|
|
|
@@ -602,9 +606,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
|
|
SPOS(0, 0, U32_MAX), 0);
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
- bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
|
|
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
|
|
|
|
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
|
|
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
break;
|
|
@@ -623,9 +627,9 @@ static int rand_mixed_trans(struct btree_trans *trans,
|
|
struct bkey_s_c k;
|
|
int ret;
|
|
|
|
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
|
|
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
|
|
|
|
- k = bch2_btree_iter_peek(iter);
|
|
+ k = bch2_btree_iter_peek(trans, iter);
|
|
ret = bkey_err(k);
|
|
bch_err_msg(trans->c, ret, "lookup error");
|
|
if (ret)
|
|
@@ -672,7 +676,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
|
|
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
|
|
BTREE_ITER_intent);
|
|
- k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
|
|
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
|
|
ret = bkey_err(k);
|
|
if (ret)
|
|
goto err;
|
|
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
|
|
index dea73bc1cb51..314a24d15d4e 100644
|
|
--- a/fs/bcachefs/thread_with_file.c
|
|
+++ b/fs/bcachefs/thread_with_file.c
|
|
@@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki
|
|
struct stdio_buf *buf = &stdio->output;
|
|
unsigned long flags;
|
|
ssize_t ret;
|
|
-
|
|
again:
|
|
+ if (stdio->done)
|
|
+ return -EPIPE;
|
|
+
|
|
spin_lock_irqsave(&buf->lock, flags);
|
|
ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
|
|
spin_unlock_irqrestore(&buf->lock, flags);
|
|
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
|
|
index f4d484d44f63..254b8493ec4b 100644
|
|
--- a/fs/bcachefs/thread_with_file_types.h
|
|
+++ b/fs/bcachefs/thread_with_file_types.h
|
|
@@ -2,7 +2,7 @@
|
|
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
|
|
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
|
|
|
|
-#include "darray.h"
|
|
+#include <linux/darray_types.h>
|
|
|
|
struct stdio_buf {
|
|
spinlock_t lock;
|
|
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
|
|
index 3fe82757f93a..2c34fe4be912 100644
|
|
--- a/fs/bcachefs/time_stats.c
|
|
+++ b/fs/bcachefs/time_stats.c
|
|
@@ -10,6 +10,9 @@
|
|
#include "eytzinger.h"
|
|
#include "time_stats.h"
|
|
|
|
+/* disable automatic switching to percpu mode */
|
|
+#define TIME_STATS_NONPCPU ((unsigned long) 1)
|
|
+
|
|
static const struct time_unit time_units[] = {
|
|
{ "ns", 1 },
|
|
{ "us", NSEC_PER_USEC },
|
|
@@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
|
|
{
|
|
unsigned long flags;
|
|
|
|
- if (!stats->buffer) {
|
|
+ if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) {
|
|
spin_lock_irqsave(&stats->lock, flags);
|
|
time_stats_update_one(stats, start, end);
|
|
|
|
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
|
|
+ if (!stats->buffer &&
|
|
+ mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
|
|
stats->duration_stats.n > 1024)
|
|
stats->buffer =
|
|
alloc_percpu_gfp(struct time_stat_buffer,
|
|
@@ -157,7 +161,7 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
|
|
unsigned offset = offsetof(struct bch2_time_stats, min_duration);
|
|
memset((void *) stats + offset, 0, sizeof(*stats) - offset);
|
|
|
|
- if (stats->buffer) {
|
|
+ if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) {
|
|
int cpu;
|
|
for_each_possible_cpu(cpu)
|
|
per_cpu_ptr(stats->buffer, cpu)->nr = 0;
|
|
@@ -167,7 +171,9 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
|
|
|
|
void bch2_time_stats_exit(struct bch2_time_stats *stats)
|
|
{
|
|
- free_percpu(stats->buffer);
|
|
+ if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU)
|
|
+ free_percpu(stats->buffer);
|
|
+ stats->buffer = NULL;
|
|
}
|
|
|
|
void bch2_time_stats_init(struct bch2_time_stats *stats)
|
|
@@ -177,3 +183,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
|
|
stats->min_freq = U64_MAX;
|
|
spin_lock_init(&stats->lock);
|
|
}
|
|
+
|
|
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
|
|
+{
|
|
+ bch2_time_stats_init(stats);
|
|
+ stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU;
|
|
+}
|
|
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
|
|
index dc6493f7bbab..eddb0985bab4 100644
|
|
--- a/fs/bcachefs/time_stats.h
|
|
+++ b/fs/bcachefs/time_stats.h
|
|
@@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
|
|
void bch2_time_stats_reset(struct bch2_time_stats *);
|
|
void bch2_time_stats_exit(struct bch2_time_stats *);
|
|
void bch2_time_stats_init(struct bch2_time_stats *);
|
|
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
|
|
|
|
static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
|
|
{
|
|
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
|
|
index c1b51009edf6..8cb5b40704fd 100644
|
|
--- a/fs/bcachefs/trace.h
|
|
+++ b/fs/bcachefs/trace.h
|
|
@@ -295,12 +295,12 @@ TRACE_EVENT(write_super,
|
|
|
|
/* io.c: */
|
|
|
|
-DEFINE_EVENT(bio, read_promote,
|
|
+DEFINE_EVENT(bio, io_read_promote,
|
|
TP_PROTO(struct bio *bio),
|
|
TP_ARGS(bio)
|
|
);
|
|
|
|
-TRACE_EVENT(read_nopromote,
|
|
+TRACE_EVENT(io_read_nopromote,
|
|
TP_PROTO(struct bch_fs *c, int ret),
|
|
TP_ARGS(c, ret),
|
|
|
|
@@ -319,26 +319,55 @@ TRACE_EVENT(read_nopromote,
|
|
__entry->ret)
|
|
);
|
|
|
|
-DEFINE_EVENT(bio, read_bounce,
|
|
+DEFINE_EVENT(bio, io_read_bounce,
|
|
TP_PROTO(struct bio *bio),
|
|
TP_ARGS(bio)
|
|
);
|
|
|
|
-DEFINE_EVENT(bio, read_split,
|
|
+DEFINE_EVENT(bio, io_read_split,
|
|
TP_PROTO(struct bio *bio),
|
|
TP_ARGS(bio)
|
|
);
|
|
|
|
-DEFINE_EVENT(bio, read_retry,
|
|
+DEFINE_EVENT(bio, io_read_retry,
|
|
TP_PROTO(struct bio *bio),
|
|
TP_ARGS(bio)
|
|
);
|
|
|
|
-DEFINE_EVENT(bio, read_reuse_race,
|
|
+DEFINE_EVENT(bio, io_read_reuse_race,
|
|
TP_PROTO(struct bio *bio),
|
|
TP_ARGS(bio)
|
|
);
|
|
|
|
+DEFINE_EVENT(bio, io_read_fail_and_poison,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio)
|
|
+);
|
|
+
|
|
+/* ec.c */
|
|
+
|
|
+TRACE_EVENT(stripe_create,
|
|
+ TP_PROTO(struct bch_fs *c, u64 idx, int ret),
|
|
+ TP_ARGS(c, idx, ret),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(dev_t, dev )
|
|
+ __field(u64, idx )
|
|
+ __field(int, ret )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = c->dev;
|
|
+ __entry->idx = idx;
|
|
+ __entry->ret = ret;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%d,%d idx %llu ret %i",
|
|
+ MAJOR(__entry->dev), MINOR(__entry->dev),
|
|
+ __entry->idx,
|
|
+ __entry->ret)
|
|
+);
|
|
+
|
|
/* Journal */
|
|
|
|
DEFINE_EVENT(bch_fs, journal_full,
|
|
@@ -797,53 +826,37 @@ TRACE_EVENT(bucket_invalidate,
|
|
|
|
/* Moving IO */
|
|
|
|
-TRACE_EVENT(bucket_evacuate,
|
|
- TP_PROTO(struct bch_fs *c, struct bpos *bucket),
|
|
- TP_ARGS(c, bucket),
|
|
-
|
|
- TP_STRUCT__entry(
|
|
- __field(dev_t, dev )
|
|
- __field(u32, dev_idx )
|
|
- __field(u64, bucket )
|
|
- ),
|
|
-
|
|
- TP_fast_assign(
|
|
- __entry->dev = c->dev;
|
|
- __entry->dev_idx = bucket->inode;
|
|
- __entry->bucket = bucket->offset;
|
|
- ),
|
|
-
|
|
- TP_printk("%d:%d %u:%llu",
|
|
- MAJOR(__entry->dev), MINOR(__entry->dev),
|
|
- __entry->dev_idx, __entry->bucket)
|
|
+DEFINE_EVENT(fs_str, io_move,
|
|
+ TP_PROTO(struct bch_fs *c, const char *str),
|
|
+ TP_ARGS(c, str)
|
|
);
|
|
|
|
-DEFINE_EVENT(fs_str, move_extent,
|
|
+DEFINE_EVENT(fs_str, io_move_read,
|
|
TP_PROTO(struct bch_fs *c, const char *str),
|
|
TP_ARGS(c, str)
|
|
);
|
|
|
|
-DEFINE_EVENT(fs_str, move_extent_read,
|
|
+DEFINE_EVENT(fs_str, io_move_write,
|
|
TP_PROTO(struct bch_fs *c, const char *str),
|
|
TP_ARGS(c, str)
|
|
);
|
|
|
|
-DEFINE_EVENT(fs_str, move_extent_write,
|
|
+DEFINE_EVENT(fs_str, io_move_finish,
|
|
TP_PROTO(struct bch_fs *c, const char *str),
|
|
TP_ARGS(c, str)
|
|
);
|
|
|
|
-DEFINE_EVENT(fs_str, move_extent_finish,
|
|
+DEFINE_EVENT(fs_str, io_move_fail,
|
|
TP_PROTO(struct bch_fs *c, const char *str),
|
|
TP_ARGS(c, str)
|
|
);
|
|
|
|
-DEFINE_EVENT(fs_str, move_extent_fail,
|
|
+DEFINE_EVENT(fs_str, io_move_write_fail,
|
|
TP_PROTO(struct bch_fs *c, const char *str),
|
|
TP_ARGS(c, str)
|
|
);
|
|
|
|
-DEFINE_EVENT(fs_str, move_extent_start_fail,
|
|
+DEFINE_EVENT(fs_str, io_move_start_fail,
|
|
TP_PROTO(struct bch_fs *c, const char *str),
|
|
TP_ARGS(c, str)
|
|
);
|
|
@@ -881,37 +894,6 @@ TRACE_EVENT(move_data,
|
|
__entry->sectors_raced)
|
|
);
|
|
|
|
-TRACE_EVENT(evacuate_bucket,
|
|
- TP_PROTO(struct bch_fs *c, struct bpos *bucket,
|
|
- unsigned sectors, unsigned bucket_size,
|
|
- int ret),
|
|
- TP_ARGS(c, bucket, sectors, bucket_size, ret),
|
|
-
|
|
- TP_STRUCT__entry(
|
|
- __field(dev_t, dev )
|
|
- __field(u64, member )
|
|
- __field(u64, bucket )
|
|
- __field(u32, sectors )
|
|
- __field(u32, bucket_size )
|
|
- __field(int, ret )
|
|
- ),
|
|
-
|
|
- TP_fast_assign(
|
|
- __entry->dev = c->dev;
|
|
- __entry->member = bucket->inode;
|
|
- __entry->bucket = bucket->offset;
|
|
- __entry->sectors = sectors;
|
|
- __entry->bucket_size = bucket_size;
|
|
- __entry->ret = ret;
|
|
- ),
|
|
-
|
|
- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
|
|
- MAJOR(__entry->dev), MINOR(__entry->dev),
|
|
- __entry->member, __entry->bucket,
|
|
- __entry->sectors, __entry->bucket_size,
|
|
- __entry->ret)
|
|
-);
|
|
-
|
|
TRACE_EVENT(copygc,
|
|
TP_PROTO(struct bch_fs *c,
|
|
u64 buckets,
|
|
@@ -1145,51 +1127,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
|
|
TP_ARGS(trans, caller_ip, path)
|
|
);
|
|
|
|
-TRACE_EVENT(trans_restart_upgrade,
|
|
- TP_PROTO(struct btree_trans *trans,
|
|
- unsigned long caller_ip,
|
|
- struct btree_path *path,
|
|
- unsigned old_locks_want,
|
|
- unsigned new_locks_want,
|
|
- struct get_locks_fail *f),
|
|
- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
|
|
-
|
|
- TP_STRUCT__entry(
|
|
- __array(char, trans_fn, 32 )
|
|
- __field(unsigned long, caller_ip )
|
|
- __field(u8, btree_id )
|
|
- __field(u8, old_locks_want )
|
|
- __field(u8, new_locks_want )
|
|
- __field(u8, level )
|
|
- __field(u32, path_seq )
|
|
- __field(u32, node_seq )
|
|
- TRACE_BPOS_entries(pos)
|
|
- ),
|
|
-
|
|
- TP_fast_assign(
|
|
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
|
|
- __entry->caller_ip = caller_ip;
|
|
- __entry->btree_id = path->btree_id;
|
|
- __entry->old_locks_want = old_locks_want;
|
|
- __entry->new_locks_want = new_locks_want;
|
|
- __entry->level = f->l;
|
|
- __entry->path_seq = path->l[f->l].lock_seq;
|
|
- __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
|
|
- TRACE_BPOS_assign(pos, path->pos)
|
|
- ),
|
|
-
|
|
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
|
|
- __entry->trans_fn,
|
|
- (void *) __entry->caller_ip,
|
|
- bch2_btree_id_str(__entry->btree_id),
|
|
- __entry->pos_inode,
|
|
- __entry->pos_offset,
|
|
- __entry->pos_snapshot,
|
|
- __entry->old_locks_want,
|
|
- __entry->new_locks_want,
|
|
- __entry->level,
|
|
- __entry->path_seq,
|
|
- __entry->node_seq)
|
|
+DEFINE_EVENT(fs_str, trans_restart_upgrade,
|
|
+ TP_PROTO(struct bch_fs *c, const char *str),
|
|
+ TP_ARGS(c, str)
|
|
);
|
|
|
|
DEFINE_EVENT(trans_str, trans_restart_relock,
|
|
@@ -1491,6 +1431,11 @@ DEFINE_EVENT(fs_str, data_update,
|
|
TP_ARGS(c, str)
|
|
);
|
|
|
|
+DEFINE_EVENT(fs_str, io_move_created_rebalance,
|
|
+ TP_PROTO(struct bch_fs *c, const char *str),
|
|
+ TP_ARGS(c, str)
|
|
+);
|
|
+
|
|
TRACE_EVENT(error_downcast,
|
|
TP_PROTO(int bch_err, int std_err, unsigned long ip),
|
|
TP_ARGS(bch_err, std_err, ip),
|
|
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
|
|
index da2cd11b3025..dc3817f545fa 100644
|
|
--- a/fs/bcachefs/util.c
|
|
+++ b/fs/bcachefs/util.c
|
|
@@ -252,8 +252,18 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
|
|
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
|
|
}
|
|
|
|
-static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
|
|
- bool nonblocking)
|
|
+static bool string_is_spaces(const char *str)
|
|
+{
|
|
+ while (*str) {
|
|
+ if (*str != ' ')
|
|
+ return false;
|
|
+ str++;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+void bch2_print_string_as_lines(const char *prefix, const char *lines,
|
|
+ bool nonblocking)
|
|
{
|
|
bool locked = false;
|
|
const char *p;
|
|
@@ -270,8 +280,11 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
|
|
locked = console_trylock();
|
|
}
|
|
|
|
- while (1) {
|
|
+ while (*lines) {
|
|
p = strchrnul(lines, '\n');
|
|
+ if (!*p && string_is_spaces(lines))
|
|
+ break;
|
|
+
|
|
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
|
|
if (!*p)
|
|
break;
|
|
@@ -281,16 +294,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
|
|
console_unlock();
|
|
}
|
|
|
|
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
|
|
-{
|
|
- return __bch2_print_string_as_lines(prefix, lines, false);
|
|
-}
|
|
-
|
|
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
|
|
-{
|
|
- return __bch2_print_string_as_lines(prefix, lines, true);
|
|
-}
|
|
-
|
|
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
|
|
gfp_t gfp)
|
|
{
|
|
@@ -473,10 +476,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
|
|
u64 last_q = 0;
|
|
|
|
prt_printf(out, "quantiles (%s):\t", u->name);
|
|
- eytzinger0_for_each(i, NR_QUANTILES) {
|
|
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
|
|
+ eytzinger0_for_each(j, NR_QUANTILES) {
|
|
+ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
|
|
|
|
- u64 q = max(quantiles->entries[i].m, last_q);
|
|
+ u64 q = max(quantiles->entries[j].m, last_q);
|
|
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
|
|
if (is_last)
|
|
prt_newline(out);
|
|
@@ -704,12 +707,43 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
|
|
}
|
|
}
|
|
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_corrupt_bio(struct bio *bio)
|
|
+{
|
|
+ struct bvec_iter iter;
|
|
+ struct bio_vec bv;
|
|
+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
|
|
+
|
|
+ bio_for_each_segment(bv, bio, iter) {
|
|
+ unsigned u64s = bv.bv_len / sizeof(u64);
|
|
+
|
|
+ if (offset < u64s) {
|
|
+ u64 *segment = bvec_kmap_local(&bv);
|
|
+ segment[offset] = get_random_u64();
|
|
+ kunmap_local(segment);
|
|
+ return;
|
|
+ }
|
|
+ offset -= u64s;
|
|
+ }
|
|
+}
|
|
+#endif
|
|
+
|
|
+void bch2_bio_to_text(struct printbuf *out, struct bio *bio)
|
|
+{
|
|
+ prt_printf(out, "bi_remaining:\t%u\n",
|
|
+ atomic_read(&bio->__bi_remaining));
|
|
+ prt_printf(out, "bi_end_io:\t%ps\n",
|
|
+ bio->bi_end_io);
|
|
+ prt_printf(out, "bi_status:\t%u\n",
|
|
+ bio->bi_status);
|
|
+}
|
|
+
|
|
#if 0
|
|
void eytzinger1_test(void)
|
|
{
|
|
- unsigned inorder, eytz, size;
|
|
+ unsigned inorder, size;
|
|
|
|
- pr_info("1 based eytzinger test:");
|
|
+ pr_info("1 based eytzinger test:\n");
|
|
|
|
for (size = 2;
|
|
size < 65536;
|
|
@@ -717,13 +751,7 @@ void eytzinger1_test(void)
|
|
unsigned extra = eytzinger1_extra(size);
|
|
|
|
if (!(size % 4096))
|
|
- pr_info("tree size %u", size);
|
|
-
|
|
- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
|
|
- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
|
|
-
|
|
- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
|
|
- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
|
|
+ pr_info("tree size %u\n", size);
|
|
|
|
inorder = 1;
|
|
eytzinger1_for_each(eytz, size) {
|
|
@@ -734,15 +762,16 @@ void eytzinger1_test(void)
|
|
|
|
inorder++;
|
|
}
|
|
+ BUG_ON(inorder - 1 != size);
|
|
}
|
|
}
|
|
|
|
void eytzinger0_test(void)
|
|
{
|
|
|
|
- unsigned inorder, eytz, size;
|
|
+ unsigned inorder, size;
|
|
|
|
- pr_info("0 based eytzinger test:");
|
|
+ pr_info("0 based eytzinger test:\n");
|
|
|
|
for (size = 1;
|
|
size < 65536;
|
|
@@ -750,13 +779,7 @@ void eytzinger0_test(void)
|
|
unsigned extra = eytzinger0_extra(size);
|
|
|
|
if (!(size % 4096))
|
|
- pr_info("tree size %u", size);
|
|
-
|
|
- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
|
|
- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
|
|
-
|
|
- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
|
|
- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
|
|
+ pr_info("tree size %u\n", size);
|
|
|
|
inorder = 0;
|
|
eytzinger0_for_each(eytz, size) {
|
|
@@ -767,54 +790,191 @@ void eytzinger0_test(void)
|
|
|
|
inorder++;
|
|
}
|
|
+ BUG_ON(inorder != size);
|
|
+
|
|
+ inorder = size - 1;
|
|
+ eytzinger0_for_each_prev(eytz, size) {
|
|
+ BUG_ON(eytz != eytzinger0_first(size) &&
|
|
+ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
|
|
+
|
|
+ inorder--;
|
|
+ }
|
|
+ BUG_ON(inorder != -1);
|
|
}
|
|
}
|
|
|
|
-static inline int cmp_u16(const void *_l, const void *_r, size_t size)
|
|
+static inline int cmp_u16(const void *_l, const void *_r)
|
|
{
|
|
const u16 *l = _l, *r = _r;
|
|
|
|
- return (*l > *r) - (*r - *l);
|
|
+ return (*l > *r) - (*r > *l);
|
|
}
|
|
|
|
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
|
|
+static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
|
|
{
|
|
- int i, c1 = -1, c2 = -1;
|
|
- ssize_t r;
|
|
+ int r, s;
|
|
+ bool bad;
|
|
|
|
r = eytzinger0_find_le(test_array, nr,
|
|
sizeof(test_array[0]),
|
|
cmp_u16, &search);
|
|
- if (r >= 0)
|
|
- c1 = test_array[r];
|
|
-
|
|
- for (i = 0; i < nr; i++)
|
|
- if (test_array[i] <= search && test_array[i] > c2)
|
|
- c2 = test_array[i];
|
|
-
|
|
- if (c1 != c2) {
|
|
- eytzinger0_for_each(i, nr)
|
|
- pr_info("[%3u] = %12u", i, test_array[i]);
|
|
- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
|
|
- i, r, c1, c2);
|
|
+ if (r >= 0) {
|
|
+ if (test_array[r] > search) {
|
|
+ bad = true;
|
|
+ } else {
|
|
+ s = eytzinger0_next(r, nr);
|
|
+ bad = s >= 0 && test_array[s] <= search;
|
|
+ }
|
|
+ } else {
|
|
+ s = eytzinger0_last(nr);
|
|
+ bad = s >= 0 && test_array[s] <= search;
|
|
+ }
|
|
+
|
|
+ if (bad) {
|
|
+ s = -1;
|
|
+ eytzinger0_for_each_prev(j, nr) {
|
|
+ if (test_array[j] <= search) {
|
|
+ s = j;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ eytzinger0_for_each(j, nr)
|
|
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
|
|
+ pr_info("find_le(%12u) = %3i should be %3i\n",
|
|
+ search, r, s);
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
|
|
+{
|
|
+ int r, s;
|
|
+ bool bad;
|
|
+
|
|
+ r = eytzinger0_find_gt(test_array, nr,
|
|
+ sizeof(test_array[0]),
|
|
+ cmp_u16, &search);
|
|
+ if (r >= 0) {
|
|
+ if (test_array[r] <= search) {
|
|
+ bad = true;
|
|
+ } else {
|
|
+ s = eytzinger0_prev(r, nr);
|
|
+ bad = s >= 0 && test_array[s] > search;
|
|
+ }
|
|
+ } else {
|
|
+ s = eytzinger0_first(nr);
|
|
+ bad = s >= 0 && test_array[s] > search;
|
|
+ }
|
|
+
|
|
+ if (bad) {
|
|
+ s = -1;
|
|
+ eytzinger0_for_each(j, nr) {
|
|
+ if (test_array[j] > search) {
|
|
+ s = j;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ eytzinger0_for_each(j, nr)
|
|
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
|
|
+ pr_info("find_gt(%12u) = %3i should be %3i\n",
|
|
+ search, r, s);
|
|
+ BUG();
|
|
}
|
|
}
|
|
|
|
+static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
|
|
+{
|
|
+ int r, s;
|
|
+ bool bad;
|
|
+
|
|
+ r = eytzinger0_find_ge(test_array, nr,
|
|
+ sizeof(test_array[0]),
|
|
+ cmp_u16, &search);
|
|
+ if (r >= 0) {
|
|
+ if (test_array[r] < search) {
|
|
+ bad = true;
|
|
+ } else {
|
|
+ s = eytzinger0_prev(r, nr);
|
|
+ bad = s >= 0 && test_array[s] >= search;
|
|
+ }
|
|
+ } else {
|
|
+ s = eytzinger0_first(nr);
|
|
+ bad = s >= 0 && test_array[s] >= search;
|
|
+ }
|
|
+
|
|
+ if (bad) {
|
|
+ s = -1;
|
|
+ eytzinger0_for_each(j, nr) {
|
|
+ if (test_array[j] >= search) {
|
|
+ s = j;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ eytzinger0_for_each(j, nr)
|
|
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
|
|
+ pr_info("find_ge(%12u) = %3i should be %3i\n",
|
|
+ search, r, s);
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
|
|
+{
|
|
+ unsigned r;
|
|
+ int s;
|
|
+ bool bad;
|
|
+
|
|
+ r = eytzinger0_find(test_array, nr,
|
|
+ sizeof(test_array[0]),
|
|
+ cmp_u16, &search);
|
|
+
|
|
+ if (r < nr) {
|
|
+ bad = test_array[r] != search;
|
|
+ } else {
|
|
+ s = eytzinger0_find_le(test_array, nr,
|
|
+ sizeof(test_array[0]),
|
|
+ cmp_u16, &search);
|
|
+ bad = s >= 0 && test_array[s] == search;
|
|
+ }
|
|
+
|
|
+ if (bad) {
|
|
+ eytzinger0_for_each(j, nr)
|
|
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
|
|
+ pr_info("find(%12u) = %3i is incorrect\n",
|
|
+ search, r);
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
|
|
+{
|
|
+ eytzinger0_find_test_le(test_array, nr, search);
|
|
+ eytzinger0_find_test_gt(test_array, nr, search);
|
|
+ eytzinger0_find_test_ge(test_array, nr, search);
|
|
+ eytzinger0_find_test_eq(test_array, nr, search);
|
|
+}
|
|
+
|
|
void eytzinger0_find_test(void)
|
|
{
|
|
unsigned i, nr, allocated = 1 << 12;
|
|
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
|
|
|
|
for (nr = 1; nr < allocated; nr++) {
|
|
- pr_info("testing %u elems", nr);
|
|
+ u16 prev = 0;
|
|
+
|
|
+ pr_info("testing %u elems\n", nr);
|
|
|
|
get_random_bytes(test_array, nr * sizeof(test_array[0]));
|
|
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
|
|
|
|
/* verify array is sorted correctly: */
|
|
- eytzinger0_for_each(i, nr)
|
|
- BUG_ON(i != eytzinger0_last(nr) &&
|
|
- test_array[i] > test_array[eytzinger0_next(i, nr)]);
|
|
+ eytzinger0_for_each(j, nr) {
|
|
+ BUG_ON(test_array[j] < prev);
|
|
+ prev = test_array[j];
|
|
+ }
|
|
|
|
for (i = 0; i < U16_MAX; i += 1 << 12)
|
|
eytzinger0_find_test_val(test_array, nr, i);
|
|
@@ -856,14 +1016,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
|
|
return ret;
|
|
}
|
|
|
|
-void bch2_darray_str_exit(darray_str *d)
|
|
+void bch2_darray_str_exit(darray_const_str *d)
|
|
{
|
|
darray_for_each(*d, i)
|
|
kfree(*i);
|
|
darray_exit(d);
|
|
}
|
|
|
|
-int bch2_split_devs(const char *_dev_name, darray_str *ret)
|
|
+int bch2_split_devs(const char *_dev_name, darray_const_str *ret)
|
|
{
|
|
darray_init(ret);
|
|
|
|
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
|
|
index f4a4783219d9..ed5bee5e63de 100644
|
|
--- a/fs/bcachefs/util.h
|
|
+++ b/fs/bcachefs/util.h
|
|
@@ -5,23 +5,24 @@
|
|
#include <linux/bio.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/closure.h>
|
|
+#include <linux/darray.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/freezer.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/min_heap.h>
|
|
-#include <linux/sched/clock.h>
|
|
#include <linux/llist.h>
|
|
#include <linux/log2.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/preempt.h>
|
|
+#include <linux/random.h>
|
|
#include <linux/ratelimit.h>
|
|
+#include <linux/sched/clock.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/workqueue.h>
|
|
|
|
#include "mean_and_variance.h"
|
|
|
|
-#include "darray.h"
|
|
#include "time_stats.h"
|
|
|
|
struct closure;
|
|
@@ -55,15 +56,16 @@ static inline size_t buf_pages(void *p, size_t len)
|
|
PAGE_SIZE);
|
|
}
|
|
|
|
-static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
|
|
+static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
|
|
{
|
|
void *p = unlikely(n >= INT_MAX)
|
|
- ? vmalloc(n)
|
|
- : kvmalloc(n, flags & ~__GFP_ZERO);
|
|
+ ? vmalloc_noprof(n)
|
|
+ : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
|
|
if (p && (flags & __GFP_ZERO))
|
|
memset(p, 0, n);
|
|
return p;
|
|
}
|
|
+#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
|
|
|
|
#define init_heap(heap, _size, gfp) \
|
|
({ \
|
|
@@ -94,6 +96,7 @@ do { \
|
|
#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
|
|
|
|
#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
|
|
+#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n)
|
|
#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
|
|
|
|
#define prt_newline(_out) bch2_prt_newline(_out)
|
|
@@ -210,8 +213,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]);
|
|
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
|
|
void bch2_prt_u64_base2(struct printbuf *, u64);
|
|
|
|
-void bch2_print_string_as_lines(const char *prefix, const char *lines);
|
|
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
|
|
+void bch2_print_string_as_lines(const char *, const char *, bool);
|
|
|
|
typedef DARRAY(unsigned long) bch_stacktrace;
|
|
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
|
|
@@ -406,6 +408,20 @@ u64 bch2_get_random_u64_below(u64);
|
|
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
|
|
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
|
|
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_corrupt_bio(struct bio *);
|
|
+
|
|
+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
|
|
+{
|
|
+ if (ratio && !get_random_u32_below(ratio))
|
|
+ bch2_corrupt_bio(bio);
|
|
+}
|
|
+#else
|
|
+#define bch2_maybe_corrupt_bio(...) do {} while (0)
|
|
+#endif
|
|
+
|
|
+void bch2_bio_to_text(struct printbuf *, struct bio *);
|
|
+
|
|
static inline void memcpy_u64s_small(void *dst, const void *src,
|
|
unsigned u64s)
|
|
{
|
|
@@ -419,7 +435,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src,
|
|
static inline void __memcpy_u64s(void *dst, const void *src,
|
|
unsigned u64s)
|
|
{
|
|
-#ifdef CONFIG_X86_64
|
|
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
|
|
long d0, d1, d2;
|
|
|
|
asm volatile("rep ; movsq"
|
|
@@ -496,7 +512,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
|
|
u64 *dst = (u64 *) _dst + u64s - 1;
|
|
u64 *src = (u64 *) _src + u64s - 1;
|
|
|
|
-#ifdef CONFIG_X86_64
|
|
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
|
|
long d0, d1, d2;
|
|
|
|
asm volatile("std ;\n"
|
|
@@ -536,30 +552,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
|
|
memset(s + bytes, c, rem);
|
|
}
|
|
|
|
-/* just the memmove, doesn't update @_nr */
|
|
-#define __array_insert_item(_array, _nr, _pos) \
|
|
- memmove(&(_array)[(_pos) + 1], \
|
|
- &(_array)[(_pos)], \
|
|
- sizeof((_array)[0]) * ((_nr) - (_pos)))
|
|
-
|
|
-#define array_insert_item(_array, _nr, _pos, _new_item) \
|
|
-do { \
|
|
- __array_insert_item(_array, _nr, _pos); \
|
|
- (_nr)++; \
|
|
- (_array)[(_pos)] = (_new_item); \
|
|
-} while (0)
|
|
-
|
|
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
|
|
-do { \
|
|
- (_nr) -= (_nr_to_remove); \
|
|
- memmove(&(_array)[(_pos)], \
|
|
- &(_array)[(_pos) + (_nr_to_remove)], \
|
|
- sizeof((_array)[0]) * ((_nr) - (_pos))); \
|
|
-} while (0)
|
|
-
|
|
-#define array_remove_item(_array, _nr, _pos) \
|
|
- array_remove_items(_array, _nr, _pos, 1)
|
|
-
|
|
static inline void __move_gap(void *array, size_t element_size,
|
|
size_t nr, size_t size,
|
|
size_t old_gap, size_t new_gap)
|
|
@@ -675,8 +667,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
|
|
return l.len == r.len && !memcmp(l.name, r.name, l.len);
|
|
}
|
|
|
|
-void bch2_darray_str_exit(darray_str *);
|
|
-int bch2_split_devs(const char *, darray_str *);
|
|
+void bch2_darray_str_exit(darray_const_str *);
|
|
+int bch2_split_devs(const char *, darray_const_str *);
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
@@ -726,4 +718,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len)
|
|
*--dst = *src++;
|
|
}
|
|
|
|
+#define set_flags(_map, _in, _out) \
|
|
+do { \
|
|
+ unsigned _i; \
|
|
+ \
|
|
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
|
|
+ if ((_in) & (1 << _i)) \
|
|
+ (_out) |= _map[_i]; \
|
|
+ else \
|
|
+ (_out) &= ~_map[_i]; \
|
|
+} while (0)
|
|
+
|
|
+#define map_flags(_map, _in) \
|
|
+({ \
|
|
+ unsigned _out = 0; \
|
|
+ \
|
|
+ set_flags(_map, _in, _out); \
|
|
+ _out; \
|
|
+})
|
|
+
|
|
+#define map_flags_rev(_map, _in) \
|
|
+({ \
|
|
+ unsigned _i, _out = 0; \
|
|
+ \
|
|
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
|
|
+ if ((_in) & _map[_i]) { \
|
|
+ (_out) |= 1 << _i; \
|
|
+ (_in) &= ~_map[_i]; \
|
|
+ } \
|
|
+ (_out); \
|
|
+})
|
|
+
|
|
+#define map_defined(_map) \
|
|
+({ \
|
|
+ unsigned _in = ~0; \
|
|
+ \
|
|
+ map_flags_rev(_map, _in); \
|
|
+})
|
|
+
|
|
#endif /* _BCACHEFS_UTIL_H */
|
|
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
|
|
index aed7c6984173..627f153798c6 100644
|
|
--- a/fs/bcachefs/xattr.c
|
|
+++ b/fs/bcachefs/xattr.c
|
|
@@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
|
|
struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
|
|
|
|
return bch2_xattr_hash(info,
|
|
- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
|
|
+ &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
|
|
}
|
|
|
|
static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
|
|
@@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
|
|
|
|
return l.v->x_type != r->type ||
|
|
l.v->x_name_len != r->name.len ||
|
|
- memcmp(l.v->x_name, r->name.name, r->name.len);
|
|
+ memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
|
|
}
|
|
|
|
static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
|
|
@@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
|
|
|
|
return l.v->x_type != r.v->x_type ||
|
|
l.v->x_name_len != r.v->x_name_len ||
|
|
- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
|
|
+ memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
|
|
}
|
|
|
|
const struct bch_hash_desc bch2_xattr_hash_desc = {
|
|
@@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
c, xattr_invalid_type,
|
|
"invalid type (%u)", xattr.v->x_type);
|
|
|
|
- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
|
|
+ bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
|
|
c, xattr_name_invalid_chars,
|
|
"xattr name has invalid characters");
|
|
fsck_err:
|
|
@@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
|
|
unsigned name_len = xattr.v->x_name_len;
|
|
unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
|
|
unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
|
|
- offsetof(struct bch_xattr, x_name);
|
|
+ offsetof(struct bch_xattr, x_name_and_value);
|
|
|
|
val_len = min_t(int, val_len, max_name_val_bytes - name_len);
|
|
name_len = min(name_len, max_name_val_bytes);
|
|
|
|
prt_printf(out, "%.*s:%.*s",
|
|
- name_len, xattr.v->x_name,
|
|
+ name_len, xattr.v->x_name_and_value,
|
|
val_len, (char *) xattr_val(xattr.v));
|
|
|
|
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
|
|
@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
|
|
int type, int flags)
|
|
{
|
|
struct bch_fs *c = trans->c;
|
|
- struct btree_iter inode_iter = { NULL };
|
|
+ struct btree_iter inode_iter = {};
|
|
int ret;
|
|
|
|
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
|
|
@@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
|
|
if (ret)
|
|
return ret;
|
|
|
|
+ /*
|
|
+ * Besides the ctime update, extents, dirents and xattrs updates require
|
|
+ * that an inode update also happens - to ensure that if a key exists in
|
|
+ * one of those btrees with a given snapshot ID an inode is also present
|
|
+ */
|
|
inode_u->bi_ctime = bch2_current_time(c);
|
|
|
|
ret = bch2_inode_write(trans, &inode_iter, inode_u);
|
|
@@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
|
|
xattr->v.x_type = type;
|
|
xattr->v.x_name_len = namelen;
|
|
xattr->v.x_val_len = cpu_to_le16(size);
|
|
- memcpy(xattr->v.x_name, name, namelen);
|
|
+ memcpy(xattr->v.x_name_and_value, name, namelen);
|
|
memcpy(xattr_val(&xattr->v), value, size);
|
|
|
|
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
|
|
@@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry,
|
|
if (!prefix)
|
|
return 0;
|
|
|
|
- return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
|
|
+ return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
|
|
}
|
|
|
|
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
|
|
@@ -473,6 +478,12 @@ static int inode_opt_set_fn(struct btree_trans *trans,
|
|
{
|
|
struct inode_opt_set *s = p;
|
|
|
|
+ if (s->id == Inode_opt_casefold) {
|
|
+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
if (s->defined)
|
|
bi->bi_fields_set |= 1U << s->id;
|
|
else
|
|
@@ -523,7 +534,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
|
|
if (ret < 0)
|
|
goto err_class_exit;
|
|
|
|
- ret = bch2_opt_check_may_set(c, opt_id, v);
|
|
+ ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
|
|
if (ret < 0)
|
|
goto err_class_exit;
|
|
|
|
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
|
|
index 132fbbd15a66..1139bf345f70 100644
|
|
--- a/fs/bcachefs/xattr.h
|
|
+++ b/fs/bcachefs/xattr.h
|
|
@@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
|
|
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
|
|
{
|
|
- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
|
|
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
|
|
name_len + val_len, sizeof(u64));
|
|
}
|
|
|
|
#define xattr_val(_xattr) \
|
|
- ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
|
|
+ ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
|
|
|
|
struct xattr_search_key {
|
|
u8 type;
|
|
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
|
|
index c7916011ef34..4121b78d9a92 100644
|
|
--- a/fs/bcachefs/xattr_format.h
|
|
+++ b/fs/bcachefs/xattr_format.h
|
|
@@ -13,7 +13,13 @@ struct bch_xattr {
|
|
__u8 x_type;
|
|
__u8 x_name_len;
|
|
__le16 x_val_len;
|
|
- __u8 x_name[] __counted_by(x_name_len);
|
|
+ /*
|
|
+ * x_name contains the name and value counted by
|
|
+ * x_name_len + x_val_len. The introduction of
|
|
+ * __counted_by(x_name_len) previously caused a false positive
|
|
+ * detection of an out of bounds write.
|
|
+ */
|
|
+ __u8 x_name_and_value[];
|
|
} __packed __aligned(8);
|
|
|
|
#endif /* _BCACHEFS_XATTR_FORMAT_H */
|
|
diff --git a/fs/dcache.c b/fs/dcache.c
|
|
index e3634916ffb9..db7029e2673f 100644
|
|
--- a/fs/dcache.c
|
|
+++ b/fs/dcache.c
|
|
@@ -32,6 +32,9 @@
|
|
#include <linux/bit_spinlock.h>
|
|
#include <linux/rculist_bl.h>
|
|
#include <linux/list_lru.h>
|
|
+#include <linux/rhashtable.h>
|
|
+#include <linux/darray.h>
|
|
+#include <linux/errname.h>
|
|
#include "internal.h"
|
|
#include "mount.h"
|
|
|
|
@@ -3169,6 +3172,266 @@ ino_t d_parent_ino(struct dentry *dentry)
|
|
}
|
|
EXPORT_SYMBOL(d_parent_ino);
|
|
|
|
+static struct rhashtable no_casefold_dentries;
|
|
+
|
|
+enum no_casefold_dentry_ref {
|
|
+ ref_casefold_disable,
|
|
+ ref_casefold_enable,
|
|
+};
|
|
+
|
|
+struct no_casefold_dentry {
|
|
+ struct rhash_head hash;
|
|
+ struct dentry *dentry;
|
|
+ unsigned long ref[2];
|
|
+};
|
|
+
|
|
+static const struct rhashtable_params no_casefold_dentries_params = {
|
|
+ .head_offset = offsetof(struct no_casefold_dentry, hash),
|
|
+ .key_offset = offsetof(struct no_casefold_dentry, dentry),
|
|
+ .key_len = sizeof(struct dentry *),
|
|
+ .automatic_shrinking = true,
|
|
+};
|
|
+
|
|
+static int no_casefold_dentry_get(struct dentry *dentry,
|
|
+ enum no_casefold_dentry_ref ref)
|
|
+{
|
|
+ struct no_casefold_dentry *n =
|
|
+ rhashtable_lookup_fast(&no_casefold_dentries,
|
|
+ &dentry,
|
|
+ no_casefold_dentries_params);
|
|
+ if (n) {
|
|
+ if (n->ref[!ref])
|
|
+ return -EINVAL;
|
|
+
|
|
+ n->ref[ref]++;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ n = kzalloc(sizeof(*n), GFP_KERNEL);
|
|
+ if (!n)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ n->dentry = dget(dentry);
|
|
+ n->ref[ref]++;
|
|
+
|
|
+ int ret = rhashtable_lookup_insert_fast(&no_casefold_dentries,
|
|
+ &n->hash, no_casefold_dentries_params);
|
|
+ if (WARN_ON(ret)) {
|
|
+ kfree(n);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void no_casefold_dentry_put(struct dentry *dentry,
|
|
+ enum no_casefold_dentry_ref ref)
|
|
+{
|
|
+ struct no_casefold_dentry *n =
|
|
+ rhashtable_lookup_fast(&no_casefold_dentries,
|
|
+ &dentry,
|
|
+ no_casefold_dentries_params);
|
|
+ if (WARN_ON(!n))
|
|
+ return;
|
|
+
|
|
+ if (--n->ref[ref])
|
|
+ return;
|
|
+
|
|
+ dput(n->dentry);
|
|
+ int ret = rhashtable_remove_fast(&no_casefold_dentries,
|
|
+ &n->hash, no_casefold_dentries_params);
|
|
+ WARN_ON(ret);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * d_casefold_disabled_put - drop a "casefold disabled" ref
|
|
+ *
|
|
+ * Only for overlayfs.
|
|
+ */
|
|
+void d_casefold_disabled_put(struct dentry *dentry)
|
|
+{
|
|
+ struct super_block *sb = dentry->d_sb;
|
|
+
|
|
+ if (!(sb->s_flags & SB_CASEFOLD))
|
|
+ return;
|
|
+
|
|
+ guard(mutex)(&sb->s_casefold_enable_lock);
|
|
+ no_casefold_dentry_put(dentry, ref_casefold_disable);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(d_casefold_disabled_put);
|
|
+
|
|
+/**
|
|
+ * d_casefold_disabled_get - attempt to disable casefold on a tree
|
|
+ *
|
|
+ * Only for overlayfs.
|
|
+ *
|
|
+ * Returns -EINVAL if casefolding is in use on any subdirectory; this must be
|
|
+ * tracked by the filesystem.
|
|
+ *
|
|
+ * On success, returns with a reference held that must be released by
|
|
+ * d_casefold_disabled_put(); this ref blocks casefold from being enabled
|
|
+ * by d_casefold_enable().
|
|
+ */
|
|
+int d_casefold_disabled_get(struct dentry *dentry)
|
|
+{
|
|
+ struct super_block *sb = dentry->d_sb;
|
|
+
|
|
+ if (!(sb->s_flags & SB_CASEFOLD))
|
|
+ return 0;
|
|
+
|
|
+ guard(mutex)(&sb->s_casefold_enable_lock);
|
|
+
|
|
+ if (!(dentry->d_inode->i_flags & S_NO_CASEFOLD))
|
|
+ return -EINVAL;
|
|
+
|
|
+ return no_casefold_dentry_get(dentry, ref_casefold_disable);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(d_casefold_disabled_get);
|
|
+
|
|
+/* Crabwalk: releases @dentry after getting ref on parent */
|
|
+static struct dentry *dget_parent_this_sb(struct dentry *dentry)
|
|
+{
|
|
+ struct dentry *parent = dentry != dentry->d_sb->s_root
|
|
+ ? dget_parent(dentry)
|
|
+ : NULL;
|
|
+ dput(dentry);
|
|
+ return parent;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * d_casefold_enable - check if casefolding may be enabled on a dentry
|
|
+ *
|
|
+ * @dentry: dentry to enable casefolding on
|
|
+ * @e: state object, released by d_casefold_enable_commit()
|
|
+ * @rename: Are we in the rename path?
|
|
+ * If so, we expect s_vfs_rename_mutex to be held, if not (called
|
|
+ * from setflags), we acquire it if necessary, and release in
|
|
+ * commit.
|
|
+ *
|
|
+ * The rename mutex is required because we're operating on a whole path,
|
|
+ * potentially up to the filesystem root, and we need it to be stable until
|
|
+ * commit (i.e. we don't want to be renamed into a tree overlayfs is exporting
|
|
+ * after we've returned success).
|
|
+ *
|
|
+ * For rename, this should only be called for cross-directory rename.
|
|
+ * S_NO_CASEFOLD doesn't need to change on rename within a directory, and
|
|
+ * s_vfs_rename_mutex won't be held on non cross-directory rename.
|
|
+ *
|
|
+ * Returns -EINVAL if casefolding has been disabled on any parent directory (by
|
|
+ * overlayfs).
|
|
+ *
|
|
+ * On success, the d_casefold_enable object must be committed with
|
|
+ * d_casefold_enable_commit(), after the filesystem has updated its internal
|
|
+ * state.
|
|
+ *
|
|
+ * Commit will clear S_NO_CASEFOLD on all inodes up to the filesystem root,
|
|
+ * informing overlayfs that this tree has casefolding enabled somewhere in it.
|
|
+ */
|
|
+int d_casefold_enable(struct dentry *dentry, struct d_casefold_enable *e,
|
|
+ bool rename)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+	memset(e, 0, sizeof(*e));
|
|
+ e->sb = dentry->d_sb;
|
|
+
|
|
+ if (!(e->sb->s_flags & SB_CASEFOLD))
|
|
+ return 0;
|
|
+
|
|
+ if (!S_ISDIR(dentry->d_inode->i_mode))
|
|
+ return 0;
|
|
+
|
|
+ /*
|
|
+ * On rename, we're passed the dentry being renamed (the filesystem is
|
|
+ * not passed the dentry of the directory we're renaming to), but it's
|
|
+ * the parent that may need to have S_NO_CASEFOLD cleared:
|
|
+ */
|
|
+ dentry = rename
|
|
+ ? dget_parent(dentry)
|
|
+ : dget(dentry);
|
|
+
|
|
+ if (!(dentry->d_inode->i_flags & S_NO_CASEFOLD)) {
|
|
+ dput(dentry);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (rename) {
|
|
+ lockdep_assert_held(&e->sb->s_vfs_rename_mutex);
|
|
+ } else {
|
|
+ mutex_lock(&e->sb->s_vfs_rename_mutex);
|
|
+ e->rename_mutex_held = true;
|
|
+ }
|
|
+
|
|
+ guard(mutex)(&e->sb->s_casefold_enable_lock);
|
|
+
|
|
+ for (struct dentry *i = dentry; i; i = dget_parent_this_sb(i)) {
|
|
+ if (!(i->d_inode->i_flags & S_NO_CASEFOLD)) {
|
|
+ dput(i);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = darray_push(&e->refs, i);
|
|
+ if (ret) {
|
|
+ dput(i);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = no_casefold_dentry_get(i, ref_casefold_enable);
|
|
+ if (ret) {
|
|
+ dput(i);
|
|
+ --e->refs.nr;
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+err:
|
|
+ darray_for_each(e->refs, i)
|
|
+ no_casefold_dentry_put(*i, ref_casefold_enable);
|
|
+ darray_exit(&e->refs);
|
|
+
|
|
+ if (e->rename_mutex_held)
|
|
+ mutex_unlock(&e->sb->s_vfs_rename_mutex);
|
|
+ e->rename_mutex_held = false;
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(d_casefold_enable);
|
|
+
|
|
+/**
|
|
+ * d_casefold_enable_commit - finish operation started by d_casefold_enable()
|
|
+ *
|
|
+ * @e: state object, started by d_casefold_enable()
|
|
+ * @ret: Success or failure of the operation, from the filesystem
|
|
+ *
|
|
+ * On success (@ret == 0), clear S_NO_CASEFOLD on all inodes up to the
|
|
+ * filesystem root that have it set, which d_casefold_enable() previously took
|
|
+ * references to.
|
|
+ */
|
|
+void d_casefold_enable_commit(struct d_casefold_enable *e, int ret)
|
|
+{
|
|
+ if (e->refs.nr) {
|
|
+ guard(mutex)(&e->sb->s_casefold_enable_lock);
|
|
+
|
|
+ darray_for_each(e->refs, i) {
|
|
+ if (!ret) {
|
|
+ struct inode *inode = (*i)->d_inode;
|
|
+
|
|
+ spin_lock(&inode->i_lock);
|
|
+ inode->i_flags &= ~S_NO_CASEFOLD;
|
|
+ spin_unlock(&inode->i_lock);
|
|
+ }
|
|
+
|
|
+ no_casefold_dentry_put(*i, ref_casefold_enable);
|
|
+ }
|
|
+ darray_exit(&e->refs);
|
|
+ }
|
|
+
|
|
+ if (e->rename_mutex_held)
|
|
+ mutex_unlock(&e->sb->s_vfs_rename_mutex);
|
|
+ e->rename_mutex_held = false;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(d_casefold_enable_commit);
|
|
+
|
|
static __initdata unsigned long dhash_entries;
|
|
static int __init set_dhash_entries(char *str)
|
|
{
|
|
@@ -3214,6 +3477,10 @@ static void __init dcache_init(void)
|
|
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
|
|
d_shortname.string);
|
|
|
|
+ int ret = rhashtable_init(&no_casefold_dentries, &no_casefold_dentries_params);
|
|
+ if (ret)
|
|
+ panic("error initializing no_casefold_dentries: %s\n", errname(ret));
|
|
+
|
|
/* Hash may have been set up in dcache_init_early */
|
|
if (!hashdist)
|
|
return;
|
|
diff --git a/fs/libfs.c b/fs/libfs.c
|
|
index dc042a975a56..fa73064c311c 100644
|
|
--- a/fs/libfs.c
|
|
+++ b/fs/libfs.c
|
|
@@ -1952,6 +1952,7 @@ void generic_set_sb_d_ops(struct super_block *sb)
|
|
{
|
|
#if IS_ENABLED(CONFIG_UNICODE)
|
|
if (sb->s_encoding) {
|
|
+ sb->s_flags |= SB_CASEFOLD;
|
|
sb->s_d_op = &generic_ci_dentry_ops;
|
|
return;
|
|
}
|
|
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
|
|
index 1115c22deca0..acebffc61c44 100644
|
|
--- a/fs/overlayfs/params.c
|
|
+++ b/fs/overlayfs/params.c
|
|
@@ -285,7 +285,8 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
|
|
* with overlayfs. Check explicitly to prevent post-mount
|
|
* failures.
|
|
*/
|
|
- if (sb_has_encoding(path->mnt->mnt_sb))
|
|
+ if ((path->mnt->mnt_sb->s_flags & SB_CASEFOLD) &&
|
|
+ !(path->dentry->d_inode->i_flags & S_NO_CASEFOLD))
|
|
return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
|
|
|
|
if (ovl_dentry_weird(path->dentry))
|
|
@@ -409,20 +410,32 @@ static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name,
|
|
if (!name)
|
|
return -ENOMEM;
|
|
|
|
+ if (layer != Opt_workdir &&
|
|
+ layer != Opt_upperdir) {
|
|
+ err = d_casefold_disabled_get(layer_path->dentry);
|
|
+ if (err)
|
|
+ return err;
|
|
+ }
|
|
+
|
|
upper = is_upper_layer(layer);
|
|
err = ovl_mount_dir_check(fc, layer_path, layer, name, upper);
|
|
if (err)
|
|
- return err;
|
|
+ goto err_put;
|
|
|
|
if (!upper) {
|
|
err = ovl_ctx_realloc_lower(fc);
|
|
if (err)
|
|
- return err;
|
|
+ goto err_put;
|
|
}
|
|
|
|
/* Store the user provided path string in ctx to show in mountinfo */
|
|
ovl_add_layer(fc, layer, layer_path, &name);
|
|
return err;
|
|
+err_put:
|
|
+ if (layer != Opt_workdir &&
|
|
+ layer != Opt_upperdir)
|
|
+ d_casefold_disabled_put(layer_path->dentry);
|
|
+ return err;
|
|
}
|
|
|
|
static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
|
|
@@ -473,6 +486,7 @@ static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx)
|
|
ctx->lowerdir_all = NULL;
|
|
|
|
for (size_t nr = 0; nr < ctx->nr; nr++, l++) {
|
|
+ d_casefold_disabled_put(l->path.dentry);
|
|
path_put(&l->path);
|
|
kfree(l->name);
|
|
l->name = NULL;
|
|
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
|
|
index 0819c739cc2f..c515f260032c 100644
|
|
--- a/fs/overlayfs/util.c
|
|
+++ b/fs/overlayfs/util.c
|
|
@@ -205,10 +205,21 @@ bool ovl_dentry_weird(struct dentry *dentry)
|
|
if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry))
|
|
return true;
|
|
|
|
- return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
|
|
- DCACHE_MANAGE_TRANSIT |
|
|
- DCACHE_OP_HASH |
|
|
- DCACHE_OP_COMPARE);
|
|
+ if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
|
|
+ DCACHE_MANAGE_TRANSIT))
|
|
+ return true;
|
|
+
|
|
+ /*
|
|
+ * The filesystem might support casefolding, but we've already checked
|
|
+ * that casefolding isn't present on this tree: we only need to check
|
|
+ * for non-casefolding hash/compare ops
|
|
+ */
|
|
+ if (!(dentry->d_sb->s_flags & SB_CASEFOLD) &&
|
|
+ (dentry->d_flags & (DCACHE_OP_HASH |
|
|
+ DCACHE_OP_COMPARE)))
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
}
|
|
|
|
enum ovl_path_type ovl_path_type(struct dentry *dentry)
|
|
diff --git a/fs/super.c b/fs/super.c
|
|
index 5a7db4a556e3..d84c50c400ec 100644
|
|
--- a/fs/super.c
|
|
+++ b/fs/super.c
|
|
@@ -368,6 +368,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
|
|
atomic_set(&s->s_active, 1);
|
|
mutex_init(&s->s_vfs_rename_mutex);
|
|
lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
|
|
+ mutex_init(&s->s_casefold_enable_lock);
|
|
init_rwsem(&s->s_dquot.dqio_sem);
|
|
s->s_maxbytes = MAX_NON_LFS;
|
|
s->s_op = &default_op;
|
|
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
|
|
index 0055066fb1d9..62d04f4843cf 100644
|
|
--- a/fs/xfs/xfs_super.c
|
|
+++ b/fs/xfs/xfs_super.c
|
|
@@ -2122,7 +2122,8 @@ static struct file_system_type xfs_fs_type = {
|
|
.init_fs_context = xfs_init_fs_context,
|
|
.parameters = xfs_fs_parameters,
|
|
.kill_sb = xfs_kill_sb,
|
|
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
|
|
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
|
|
+ FS_LBS,
|
|
};
|
|
MODULE_ALIAS_FS("xfs");
|
|
|
|
diff --git a/fs/bcachefs/darray.h b/include/linux/darray.h
|
|
similarity index 64%
|
|
rename from fs/bcachefs/darray.h
|
|
rename to include/linux/darray.h
|
|
index c6151495985f..7a0c0159b319 100644
|
|
--- a/fs/bcachefs/darray.h
|
|
+++ b/include/linux/darray.h
|
|
@@ -1,34 +1,26 @@
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
-#ifndef _BCACHEFS_DARRAY_H
|
|
-#define _BCACHEFS_DARRAY_H
|
|
+/*
|
|
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
|
|
+ */
|
|
+#ifndef _LINUX_DARRAY_H
|
|
+#define _LINUX_DARRAY_H
|
|
|
|
/*
|
|
- * Dynamic arrays:
|
|
+ * Dynamic arrays
|
|
*
|
|
* Inspired by CCAN's darray
|
|
*/
|
|
|
|
+#include <linux/darray_types.h>
|
|
#include <linux/slab.h>
|
|
|
|
-#define DARRAY_PREALLOCATED(_type, _nr) \
|
|
-struct { \
|
|
- size_t nr, size; \
|
|
- _type *data; \
|
|
- _type preallocated[_nr]; \
|
|
-}
|
|
-
|
|
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
|
|
-
|
|
-typedef DARRAY(char) darray_char;
|
|
-typedef DARRAY(char *) darray_str;
|
|
-
|
|
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
|
|
+int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t);
|
|
|
|
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
|
|
|
|
#define __darray_resize(_d, _element_size, _new_size, _gfp) \
|
|
(unlikely((_new_size) > (_d)->size) \
|
|
- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
|
|
+ ? __darray_resize_slowpath((_d), (_element_size), (_new_size), (_gfp))\
|
|
: 0)
|
|
|
|
#define darray_resize_gfp(_d, _new_size, _gfp) \
|
|
@@ -63,6 +55,28 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
|
|
#define darray_first(_d) ((_d).data[0])
|
|
#define darray_last(_d) ((_d).data[(_d).nr - 1])
|
|
|
|
+/* Insert/remove items into the middle of a darray: */
|
|
+
|
|
+#define array_insert_item(_array, _nr, _pos, _new_item) \
|
|
+do { \
|
|
+ memmove(&(_array)[(_pos) + 1], \
|
|
+ &(_array)[(_pos)], \
|
|
+ sizeof((_array)[0]) * ((_nr) - (_pos))); \
|
|
+ (_nr)++; \
|
|
+ (_array)[(_pos)] = (_new_item); \
|
|
+} while (0)
|
|
+
|
|
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
|
|
+do { \
|
|
+ (_nr) -= (_nr_to_remove); \
|
|
+ memmove(&(_array)[(_pos)], \
|
|
+ &(_array)[(_pos) + (_nr_to_remove)], \
|
|
+ sizeof((_array)[0]) * ((_nr) - (_pos))); \
|
|
+} while (0)
|
|
+
|
|
+#define array_remove_item(_array, _nr, _pos) \
|
|
+ array_remove_items(_array, _nr, _pos, 1)
|
|
+
|
|
#define darray_insert_item(_d, pos, _item) \
|
|
({ \
|
|
size_t _pos = (pos); \
|
|
@@ -73,10 +87,15 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
|
|
_ret; \
|
|
})
|
|
|
|
+#define darray_remove_items(_d, _pos, _nr_to_remove) \
|
|
+ array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove)
|
|
+
|
|
#define darray_remove_item(_d, _pos) \
|
|
- array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
|
|
+ darray_remove_items(_d, _pos, 1)
|
|
+
|
|
+/* Iteration: */
|
|
|
|
-#define __darray_for_each(_d, _i) \
|
|
+#define __darray_for_each(_d, _i) \
|
|
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
|
|
|
|
#define darray_for_each(_d, _i) \
|
|
@@ -100,4 +119,4 @@ do { \
|
|
darray_init(_d); \
|
|
} while (0)
|
|
|
|
-#endif /* _BCACHEFS_DARRAY_H */
|
|
+#endif /* _LINUX_DARRAY_H */
|
|
diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h
|
|
new file mode 100644
|
|
index 000000000000..c55484487905
|
|
--- /dev/null
|
|
+++ b/include/linux/darray_types.h
|
|
@@ -0,0 +1,33 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+/*
|
|
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
|
|
+ */
|
|
+#ifndef _LINUX_DARRAY_TYPES_H
|
|
+#define _LINUX_DARRAY_TYPES_H
|
|
+
|
|
+#include <linux/types.h>
|
|
+
|
|
+#define DARRAY_PREALLOCATED(_type, _nr) \
|
|
+struct { \
|
|
+ size_t nr, size; \
|
|
+ _type *data; \
|
|
+ _type preallocated[_nr]; \
|
|
+}
|
|
+
|
|
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
|
|
+
|
|
+typedef DARRAY(char) darray_char;
|
|
+typedef DARRAY(char *) darray_str;
|
|
+typedef DARRAY(const char *) darray_const_str;
|
|
+
|
|
+typedef DARRAY(u8) darray_u8;
|
|
+typedef DARRAY(u16) darray_u16;
|
|
+typedef DARRAY(u32) darray_u32;
|
|
+typedef DARRAY(u64) darray_u64;
|
|
+
|
|
+typedef DARRAY(s8) darray_s8;
|
|
+typedef DARRAY(s16) darray_s16;
|
|
+typedef DARRAY(s32) darray_s32;
|
|
+typedef DARRAY(s64) darray_s64;
|
|
+
|
|
+#endif /* _LINUX_DARRAY_TYPES_H */
|
|
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
|
|
index 4afb60365675..be0f42911a02 100644
|
|
--- a/include/linux/dcache.h
|
|
+++ b/include/linux/dcache.h
|
|
@@ -3,6 +3,7 @@
|
|
#define __LINUX_DCACHE_H
|
|
|
|
#include <linux/atomic.h>
|
|
+#include <linux/darray_types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/math.h>
|
|
#include <linux/rculist.h>
|
|
@@ -616,4 +617,15 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry)
|
|
return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
|
|
}
|
|
|
|
+void d_casefold_disabled_put(struct dentry *dentry);
|
|
+int d_casefold_disabled_get(struct dentry *dentry);
|
|
+
|
|
+struct d_casefold_enable {
|
|
+ DARRAY(struct dentry *) refs;
|
|
+ struct super_block *sb;
|
|
+ bool rename_mutex_held;
|
|
+};
|
|
+int d_casefold_enable(struct dentry *dentry, struct d_casefold_enable *e, bool);
|
|
+void d_casefold_enable_commit(struct d_casefold_enable *e, int ret);
|
|
+
|
|
#endif /* __LINUX_DCACHE_H */
|
|
diff --git a/include/linux/fs.h b/include/linux/fs.h
|
|
index 2788df98080f..63c9a1a8a24b 100644
|
|
--- a/include/linux/fs.h
|
|
+++ b/include/linux/fs.h
|
|
@@ -47,6 +47,7 @@
|
|
#include <linux/rw_hint.h>
|
|
#include <linux/file_ref.h>
|
|
#include <linux/unicode.h>
|
|
+#include <linux/rhashtable-types.h>
|
|
|
|
#include <asm/byteorder.h>
|
|
#include <uapi/linux/fs.h>
|
|
@@ -1238,6 +1239,7 @@ extern int send_sigurg(struct file *file);
|
|
#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */
|
|
#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */
|
|
#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */
|
|
+#define SB_CASEFOLD BIT(8) /* Superblock supports casefolding */
|
|
#define SB_NOATIME BIT(10) /* Do not update access times. */
|
|
#define SB_NODIRATIME BIT(11) /* Do not update directory access times */
|
|
#define SB_SILENT BIT(15)
|
|
@@ -1398,6 +1400,7 @@ struct super_block {
|
|
* even looking at it. You had been warned.
|
|
*/
|
|
struct mutex s_vfs_rename_mutex; /* Kludge */
|
|
+ struct mutex s_casefold_enable_lock;
|
|
|
|
/*
|
|
* Filesystem subtype. If non-empty the filesystem type field
|
|
@@ -1441,6 +1444,7 @@ struct super_block {
|
|
|
|
struct mutex s_sync_lock; /* sync serialisation lock */
|
|
|
|
+
|
|
/*
|
|
* Indicates how deep in a filesystem stack this SB is
|
|
*/
|
|
@@ -2346,6 +2350,7 @@ struct super_operations {
|
|
#define S_CASEFOLD (1 << 15) /* Casefolded file */
|
|
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
|
|
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
|
|
+#define S_NO_CASEFOLD (1 << 18) /* Directory, and all descendants, are not casefolded */
|
|
|
|
/*
|
|
* Note that nosuid etc flags are inode-specific: setting some file-system
|
|
@@ -2616,6 +2621,7 @@ struct file_system_type {
|
|
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
|
|
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
|
|
#define FS_MGTIME 64 /* FS uses multigrain timestamps */
|
|
+#define FS_LBS 128 /* FS supports LBS */
|
|
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
|
|
int (*init_fs_context)(struct fs_context *);
|
|
const struct fs_parameter_spec *parameters;
|
|
diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
|
|
index fe41da005970..1cba369e1821 100644
|
|
--- a/include/linux/seq_buf.h
|
|
+++ b/include/linux/seq_buf.h
|
|
@@ -173,4 +173,8 @@ seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
|
|
|
|
void seq_buf_do_printk(struct seq_buf *s, const char *lvl);
|
|
|
|
+enum string_size_units;
|
|
+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v,
|
|
+ const enum string_size_units units);
|
|
+
|
|
#endif /* _LINUX_SEQ_BUF_H */
|
|
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
|
|
index 1a00be90d93a..106622ddac77 100644
|
|
--- a/include/linux/shrinker.h
|
|
+++ b/include/linux/shrinker.h
|
|
@@ -24,6 +24,8 @@ struct shrinker_info {
|
|
struct shrinker_info_unit *unit[];
|
|
};
|
|
|
|
+struct seq_buf;
|
|
+
|
|
/*
|
|
* This struct is used to pass information from page reclaim to the shrinkers.
|
|
* We consolidate the values for easier extension later.
|
|
@@ -80,10 +82,12 @@ struct shrink_control {
|
|
* @flags determine the shrinker abilities, like numa awareness
|
|
*/
|
|
struct shrinker {
|
|
+ const char *name;
|
|
unsigned long (*count_objects)(struct shrinker *,
|
|
struct shrink_control *sc);
|
|
unsigned long (*scan_objects)(struct shrinker *,
|
|
struct shrink_control *sc);
|
|
+ void (*to_text)(struct seq_buf *, struct shrinker *);
|
|
|
|
long batch; /* reclaim batch size, 0 = default */
|
|
int seeks; /* seeks to recreate an obj */
|
|
@@ -110,11 +114,16 @@ struct shrinker {
|
|
#endif
|
|
#ifdef CONFIG_SHRINKER_DEBUG
|
|
int debugfs_id;
|
|
- const char *name;
|
|
struct dentry *debugfs_entry;
|
|
#endif
|
|
/* objs pending delete, per node */
|
|
atomic_long_t *nr_deferred;
|
|
+
|
|
+ atomic_long_t objects_requested_to_free;
|
|
+ atomic_long_t objects_freed;
|
|
+ unsigned long last_freed; /* timestamp, in jiffies */
|
|
+ unsigned long last_scanned; /* timestamp, in jiffies */
|
|
+ atomic64_t ns_run;
|
|
};
|
|
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
|
|
|
|
@@ -135,6 +144,8 @@ __printf(2, 3)
|
|
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
|
|
void shrinker_register(struct shrinker *shrinker);
|
|
void shrinker_free(struct shrinker *shrinker);
|
|
+void shrinker_to_text(struct seq_buf *, struct shrinker *);
|
|
+void shrinkers_to_text(struct seq_buf *);
|
|
|
|
static inline bool shrinker_try_get(struct shrinker *shrinker)
|
|
{
|
|
diff --git a/include/linux/sort.h b/include/linux/sort.h
|
|
index e163287ac6c1..8e5603b10941 100644
|
|
--- a/include/linux/sort.h
|
|
+++ b/include/linux/sort.h
|
|
@@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size,
|
|
cmp_func_t cmp_func,
|
|
swap_func_t swap_func);
|
|
|
|
+/* Versions that periodically call cond_resched(): */
|
|
+
|
|
+void sort_r_nonatomic(void *base, size_t num, size_t size,
|
|
+ cmp_r_func_t cmp_func,
|
|
+ swap_r_func_t swap_func,
|
|
+ const void *priv);
|
|
+
|
|
+void sort_nonatomic(void *base, size_t num, size_t size,
|
|
+ cmp_func_t cmp_func,
|
|
+ swap_func_t swap_func);
|
|
+
|
|
#endif
|
|
diff --git a/lib/Makefile b/lib/Makefile
|
|
index 4f3d00a2fd65..ccc50cdc4926 100644
|
|
--- a/lib/Makefile
|
|
+++ b/lib/Makefile
|
|
@@ -56,7 +56,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
|
|
bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \
|
|
percpu-refcount.o rhashtable.o base64.o \
|
|
once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
|
|
- generic-radix-tree.o bitmap-str.o
|
|
+ generic-radix-tree.o bitmap-str.o darray.o
|
|
obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o
|
|
obj-y += string_helpers.o
|
|
obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o
|
|
diff --git a/fs/bcachefs/darray.c b/lib/darray.c
|
|
similarity index 75%
|
|
rename from fs/bcachefs/darray.c
|
|
rename to lib/darray.c
|
|
index e86d36d23e9e..1d3820a43e14 100644
|
|
--- a/fs/bcachefs/darray.c
|
|
+++ b/lib/darray.c
|
|
@@ -1,11 +1,15 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
|
|
+ */
|
|
|
|
+#include <linux/darray.h>
|
|
+#include <linux/export.h>
|
|
#include <linux/log2.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/vmalloc.h>
|
|
-#include "darray.h"
|
|
|
|
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
|
|
+int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
|
|
{
|
|
if (new_size > d->size) {
|
|
new_size = roundup_pow_of_two(new_size);
|
|
@@ -36,3 +40,4 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
|
|
|
|
return 0;
|
|
}
|
|
+EXPORT_SYMBOL_GPL(__darray_resize_slowpath);
|
|
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
|
|
index f3f3436d60a9..3c41ca83a0c3 100644
|
|
--- a/lib/seq_buf.c
|
|
+++ b/lib/seq_buf.c
|
|
@@ -436,3 +436,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type,
|
|
}
|
|
return 0;
|
|
}
|
|
+
|
|
+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, const enum string_size_units units)
|
|
+{
|
|
+ char *buf;
|
|
+ size_t size = seq_buf_get_buf(s, &buf);
|
|
+ int wrote = string_get_size(v, 1, units, buf, size);
|
|
+
|
|
+ seq_buf_commit(s, wrote);
|
|
+}
|
|
+EXPORT_SYMBOL(seq_buf_human_readable_u64);
|
|
diff --git a/lib/sort.c b/lib/sort.c
|
|
index 8e73dc55476b..52363995ccc5 100644
|
|
--- a/lib/sort.c
|
|
+++ b/lib/sort.c
|
|
@@ -186,36 +186,13 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size)
|
|
return i / 2;
|
|
}
|
|
|
|
-/**
|
|
- * sort_r - sort an array of elements
|
|
- * @base: pointer to data to sort
|
|
- * @num: number of elements
|
|
- * @size: size of each element
|
|
- * @cmp_func: pointer to comparison function
|
|
- * @swap_func: pointer to swap function or NULL
|
|
- * @priv: third argument passed to comparison function
|
|
- *
|
|
- * This function does a heapsort on the given array. You may provide
|
|
- * a swap_func function if you need to do something more than a memory
|
|
- * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
|
|
- * avoids a slow retpoline and so is significantly faster.
|
|
- *
|
|
- * The comparison function must adhere to specific mathematical
|
|
- * properties to ensure correct and stable sorting:
|
|
- * - Antisymmetry: cmp_func(a, b) must return the opposite sign of
|
|
- * cmp_func(b, a).
|
|
- * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then
|
|
- * cmp_func(a, c) <= 0.
|
|
- *
|
|
- * Sorting time is O(n log n) both on average and worst-case. While
|
|
- * quicksort is slightly faster on average, it suffers from exploitable
|
|
- * O(n*n) worst-case behavior and extra memory requirements that make
|
|
- * it less suitable for kernel use.
|
|
- */
|
|
-void sort_r(void *base, size_t num, size_t size,
|
|
- cmp_r_func_t cmp_func,
|
|
- swap_r_func_t swap_func,
|
|
- const void *priv)
|
|
+#include <linux/sched.h>
|
|
+
|
|
+static void __sort_r(void *base, size_t num, size_t size,
|
|
+ cmp_r_func_t cmp_func,
|
|
+ swap_r_func_t swap_func,
|
|
+ const void *priv,
|
|
+ bool may_schedule)
|
|
{
|
|
/* pre-scale counters for performance */
|
|
size_t n = num * size, a = (num/2) * size;
|
|
@@ -286,6 +263,9 @@ void sort_r(void *base, size_t num, size_t size,
|
|
b = parent(b, lsbit, size);
|
|
do_swap(base + b, base + c, size, swap_func, priv);
|
|
}
|
|
+
|
|
+ if (may_schedule)
|
|
+ cond_resched();
|
|
}
|
|
|
|
n -= size;
|
|
@@ -293,8 +273,63 @@ void sort_r(void *base, size_t num, size_t size,
|
|
if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0)
|
|
do_swap(base, base + size, size, swap_func, priv);
|
|
}
|
|
+
|
|
+/**
|
|
+ * sort_r - sort an array of elements
|
|
+ * @base: pointer to data to sort
|
|
+ * @num: number of elements
|
|
+ * @size: size of each element
|
|
+ * @cmp_func: pointer to comparison function
|
|
+ * @swap_func: pointer to swap function or NULL
|
|
+ * @priv: third argument passed to comparison function
|
|
+ *
|
|
+ * This function does a heapsort on the given array. You may provide
|
|
+ * a swap_func function if you need to do something more than a memory
|
|
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
|
|
+ * avoids a slow retpoline and so is significantly faster.
|
|
+ *
|
|
+ * The comparison function must adhere to specific mathematical
|
|
+ * properties to ensure correct and stable sorting:
|
|
+ * - Antisymmetry: cmp_func(a, b) must return the opposite sign of
|
|
+ * cmp_func(b, a).
|
|
+ * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then
|
|
+ * cmp_func(a, c) <= 0.
|
|
+ *
|
|
+ * Sorting time is O(n log n) both on average and worst-case. While
|
|
+ * quicksort is slightly faster on average, it suffers from exploitable
|
|
+ * O(n*n) worst-case behavior and extra memory requirements that make
|
|
+ * it less suitable for kernel use.
|
|
+ */
|
|
+void sort_r(void *base, size_t num, size_t size,
|
|
+ cmp_r_func_t cmp_func,
|
|
+ swap_r_func_t swap_func,
|
|
+ const void *priv)
|
|
+{
|
|
+ __sort_r(base, num, size, cmp_func, swap_func, priv, false);
|
|
+}
|
|
EXPORT_SYMBOL(sort_r);
|
|
|
|
+/**
|
|
+ * sort_r_nonatomic - sort an array of elements, with cond_resched
|
|
+ * @base: pointer to data to sort
|
|
+ * @num: number of elements
|
|
+ * @size: size of each element
|
|
+ * @cmp_func: pointer to comparison function
|
|
+ * @swap_func: pointer to swap function or NULL
|
|
+ * @priv: third argument passed to comparison function
|
|
+ *
|
|
+ * Same as sort_r, but preferred for larger arrays as it does a periodic
|
|
+ * cond_resched().
|
|
+ */
|
|
+void sort_r_nonatomic(void *base, size_t num, size_t size,
|
|
+ cmp_r_func_t cmp_func,
|
|
+ swap_r_func_t swap_func,
|
|
+ const void *priv)
|
|
+{
|
|
+ __sort_r(base, num, size, cmp_func, swap_func, priv, true);
|
|
+}
|
|
+EXPORT_SYMBOL(sort_r_nonatomic);
|
|
+
|
|
void sort(void *base, size_t num, size_t size,
|
|
cmp_func_t cmp_func,
|
|
swap_func_t swap_func)
|
|
@@ -304,6 +339,19 @@ void sort(void *base, size_t num, size_t size,
|
|
.swap = swap_func,
|
|
};
|
|
|
|
- return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
|
|
+ return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false);
|
|
}
|
|
EXPORT_SYMBOL(sort);
|
|
+
|
|
+void sort_nonatomic(void *base, size_t num, size_t size,
|
|
+ cmp_func_t cmp_func,
|
|
+ swap_func_t swap_func)
|
|
+{
|
|
+ struct wrapper w = {
|
|
+ .cmp = cmp_func,
|
|
+ .swap = swap_func,
|
|
+ };
|
|
+
|
|
+ return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true);
|
|
+}
|
|
+EXPORT_SYMBOL(sort_nonatomic);
|
|
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
|
|
index 1cf121ad7085..43914d472059 100644
|
|
--- a/mm/oom_kill.c
|
|
+++ b/mm/oom_kill.c
|
|
@@ -169,27 +169,6 @@ static bool oom_unkillable_task(struct task_struct *p)
|
|
return false;
|
|
}
|
|
|
|
-/*
|
|
- * Check whether unreclaimable slab amount is greater than
|
|
- * all user memory(LRU pages).
|
|
- * dump_unreclaimable_slab() could help in the case that
|
|
- * oom due to too much unreclaimable slab used by kernel.
|
|
-*/
|
|
-static bool should_dump_unreclaim_slab(void)
|
|
-{
|
|
- unsigned long nr_lru;
|
|
-
|
|
- nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
|
|
- global_node_page_state(NR_INACTIVE_ANON) +
|
|
- global_node_page_state(NR_ACTIVE_FILE) +
|
|
- global_node_page_state(NR_INACTIVE_FILE) +
|
|
- global_node_page_state(NR_ISOLATED_ANON) +
|
|
- global_node_page_state(NR_ISOLATED_FILE) +
|
|
- global_node_page_state(NR_UNEVICTABLE);
|
|
-
|
|
- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
|
|
-}
|
|
-
|
|
/**
|
|
* oom_badness - heuristic function to determine which candidate task to kill
|
|
* @p: task struct of which task we should calculate
|
|
@@ -469,8 +448,6 @@ static void dump_header(struct oom_control *oc)
|
|
mem_cgroup_print_oom_meminfo(oc->memcg);
|
|
else {
|
|
__show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
|
|
- if (should_dump_unreclaim_slab())
|
|
- dump_unreclaimable_slab();
|
|
}
|
|
if (sysctl_oom_dump_tasks)
|
|
dump_tasks(oc);
|
|
diff --git a/mm/show_mem.c b/mm/show_mem.c
|
|
index 43afb56abbd3..982a64a86880 100644
|
|
--- a/mm/show_mem.c
|
|
+++ b/mm/show_mem.c
|
|
@@ -7,15 +7,18 @@
|
|
|
|
#include <linux/blkdev.h>
|
|
#include <linux/cma.h>
|
|
+#include <linux/console.h>
|
|
#include <linux/cpuset.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mmzone.h>
|
|
+#include <linux/seq_buf.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/vmstat.h>
|
|
|
|
#include "internal.h"
|
|
+#include "slab.h"
|
|
#include "swap.h"
|
|
|
|
atomic_long_t _totalram_pages __read_mostly;
|
|
@@ -396,10 +399,31 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
|
|
show_swap_cache_info();
|
|
}
|
|
|
|
+static void print_string_as_lines(const char *prefix, const char *lines)
|
|
+{
|
|
+ if (!lines) {
|
|
+ printk("%s (null)\n", prefix);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ bool locked = console_trylock();
|
|
+
|
|
+ while (1) {
|
|
+ const char *p = strchrnul(lines, '\n');
|
|
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
|
|
+ if (!*p)
|
|
+ break;
|
|
+ lines = p + 1;
|
|
+ }
|
|
+ if (locked)
|
|
+ console_unlock();
|
|
+}
|
|
+
|
|
void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
|
|
{
|
|
unsigned long total = 0, reserved = 0, highmem = 0;
|
|
struct zone *zone;
|
|
+ char *buf;
|
|
|
|
printk("Mem-Info:\n");
|
|
show_free_areas(filter, nodemask, max_zone_idx);
|
|
@@ -451,4 +475,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
|
|
}
|
|
}
|
|
#endif
|
|
+
|
|
+ const unsigned buf_size = 8192;
|
|
+ buf = kmalloc(buf_size, GFP_ATOMIC);
|
|
+ if (buf) {
|
|
+ struct seq_buf s;
|
|
+
|
|
+ printk("Unreclaimable slab info:\n");
|
|
+ seq_buf_init(&s, buf, buf_size);
|
|
+ dump_unreclaimable_slab(&s);
|
|
+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s));
|
|
+
|
|
+ static unsigned long shrinkers_last_print;
|
|
+
|
|
+ /* Ratelimit to at most once every 30 seconds */
|
|
+ if (!shrinkers_last_print ||
|
|
+ time_after(jiffies, shrinkers_last_print + HZ * 30)) {
|
|
+ shrinkers_last_print = jiffies;
|
|
+
|
|
+ printk("Shrinkers:\n");
|
|
+ seq_buf_init(&s, buf, buf_size);
|
|
+ shrinkers_to_text(&s);
|
|
+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s));
|
|
+ }
|
|
+
|
|
+ kfree(buf);
|
|
+ }
|
|
}
|
|
diff --git a/mm/shrinker.c b/mm/shrinker.c
|
|
index 4a93fd433689..c56c1f824f79 100644
|
|
--- a/mm/shrinker.c
|
|
+++ b/mm/shrinker.c
|
|
@@ -1,8 +1,9 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/memcontrol.h>
|
|
+#include <linux/rculist.h>
|
|
#include <linux/rwsem.h>
|
|
+#include <linux/seq_buf.h>
|
|
#include <linux/shrinker.h>
|
|
-#include <linux/rculist.h>
|
|
#include <trace/events/vmscan.h>
|
|
|
|
#include "internal.h"
|
|
@@ -411,6 +412,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
|
|
|
|
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
|
|
freeable, delta, total_scan, priority);
|
|
+ u64 start_time = ktime_get_ns();
|
|
|
|
/*
|
|
* Normally, we should not scan less than batch_size objects in one
|
|
@@ -461,6 +463,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
|
|
*/
|
|
new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
|
|
|
|
+ unsigned long now = jiffies;
|
|
+ if (freed) {
|
|
+ atomic_long_add(freed, &shrinker->objects_freed);
|
|
+ shrinker->last_freed = now;
|
|
+ }
|
|
+ shrinker->last_scanned = now;
|
|
+ atomic_long_add(scanned, &shrinker->objects_requested_to_free);
|
|
+
|
|
+ atomic64_add(ktime_get_ns() - start_time, &shrinker->ns_run);
|
|
+
|
|
+
|
|
trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
|
|
return freed;
|
|
}
|
|
@@ -809,3 +822,83 @@ void shrinker_free(struct shrinker *shrinker)
|
|
call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(shrinker_free);
|
|
+
|
|
+void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
|
|
+{
|
|
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
|
|
+ unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed);
|
|
+
|
|
+ seq_buf_puts(out, shrinker->name);
|
|
+ seq_buf_putc(out, '\n');
|
|
+
|
|
+ seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc));
|
|
+ seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free));
|
|
+ seq_buf_printf(out, "objects freed: %lu\n", nr_freed);
|
|
+ seq_buf_printf(out, "last scanned: %li sec ago\n", (jiffies - shrinker->last_scanned) / HZ);
|
|
+ seq_buf_printf(out, "last freed: %li sec ago\n", (jiffies - shrinker->last_freed) / HZ);
|
|
+ seq_buf_printf(out, "ns per object freed: %llu\n", nr_freed
|
|
+ ? div64_ul(atomic64_read(&shrinker->ns_run), nr_freed)
|
|
+ : 0);
|
|
+
|
|
+ if (shrinker->to_text) {
|
|
+ shrinker->to_text(out, shrinker);
|
|
+ seq_buf_puts(out, "\n");
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * shrinkers_to_text - Report on shrinkers with highest usage
|
|
+ *
|
|
+ * This reports on the top 10 shrinkers, by object counts, in sorted order:
|
|
+ * intended to be used for OOM reporting.
|
|
+ */
|
|
+void shrinkers_to_text(struct seq_buf *out)
|
|
+{
|
|
+ struct shrinker *shrinker;
|
|
+ struct shrinker_by_mem {
|
|
+ struct shrinker *shrinker;
|
|
+ unsigned long mem;
|
|
+ } shrinkers_by_mem[4];
|
|
+ int i, nr = 0;
|
|
+
|
|
+ if (!mutex_trylock(&shrinker_mutex)) {
|
|
+ seq_buf_puts(out, "(couldn't take shrinker lock)");
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ list_for_each_entry(shrinker, &shrinker_list, list) {
|
|
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
|
|
+ unsigned long mem = shrinker->count_objects(shrinker, &sc);
|
|
+
|
|
+ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY)
|
|
+ continue;
|
|
+
|
|
+ for (i = 0; i < nr; i++)
|
|
+ if (mem < shrinkers_by_mem[i].mem)
|
|
+ break;
|
|
+
|
|
+ if (nr < ARRAY_SIZE(shrinkers_by_mem)) {
|
|
+ memmove(&shrinkers_by_mem[i + 1],
|
|
+ &shrinkers_by_mem[i],
|
|
+ sizeof(shrinkers_by_mem[0]) * (nr - i));
|
|
+ nr++;
|
|
+ } else if (i) {
|
|
+ i--;
|
|
+ memmove(&shrinkers_by_mem[0],
|
|
+ &shrinkers_by_mem[1],
|
|
+ sizeof(shrinkers_by_mem[0]) * i);
|
|
+ } else {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ shrinkers_by_mem[i] = (struct shrinker_by_mem) {
|
|
+ .shrinker = shrinker,
|
|
+ .mem = mem,
|
|
+ };
|
|
+ }
|
|
+
|
|
+ for (i = nr - 1; i >= 0; --i)
|
|
+ shrinker_to_text(out, shrinkers_by_mem[i].shrinker);
|
|
+
|
|
+ mutex_unlock(&shrinker_mutex);
|
|
+}
|
|
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
|
|
index 794bd433cce0..c3f2d22ae0d9 100644
|
|
--- a/mm/shrinker_debug.c
|
|
+++ b/mm/shrinker_debug.c
|
|
@@ -2,6 +2,7 @@
|
|
#include <linux/idr.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/debugfs.h>
|
|
+#include <linux/seq_buf.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/shrinker.h>
|
|
#include <linux/memcontrol.h>
|
|
@@ -159,6 +160,21 @@ static const struct file_operations shrinker_debugfs_scan_fops = {
|
|
.write = shrinker_debugfs_scan_write,
|
|
};
|
|
|
|
+static int shrinker_debugfs_report_show(struct seq_file *m, void *v)
|
|
+{
|
|
+ struct shrinker *shrinker = m->private;
|
|
+ char *bufp;
|
|
+ size_t buflen = seq_get_buf(m, &bufp);
|
|
+ struct seq_buf out;
|
|
+
|
|
+ seq_buf_init(&out, bufp, buflen);
|
|
+ shrinker_to_text(&out, shrinker);
|
|
+ seq_commit(m, seq_buf_used(&out));
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_report);
|
|
+
|
|
int shrinker_debugfs_add(struct shrinker *shrinker)
|
|
{
|
|
struct dentry *entry;
|
|
@@ -190,6 +206,8 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
|
|
&shrinker_debugfs_count_fops);
|
|
debugfs_create_file("scan", 0220, entry, shrinker,
|
|
&shrinker_debugfs_scan_fops);
|
|
+ debugfs_create_file("report", 0440, entry, shrinker,
|
|
+ &shrinker_debugfs_report_fops);
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/mm/slab.h b/mm/slab.h
|
|
index e9fd9bf0bfa6..1baf8771089b 100644
|
|
--- a/mm/slab.h
|
|
+++ b/mm/slab.h
|
|
@@ -631,10 +631,12 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
|
|
return s->size;
|
|
}
|
|
|
|
+struct seq_buf;
|
|
+
|
|
#ifdef CONFIG_SLUB_DEBUG
|
|
-void dump_unreclaimable_slab(void);
|
|
+void dump_unreclaimable_slab(struct seq_buf *);
|
|
#else
|
|
-static inline void dump_unreclaimable_slab(void)
|
|
+static inline void dump_unreclaimable_slab(struct seq_buf *out)
|
|
{
|
|
}
|
|
#endif
|
|
diff --git a/mm/slab_common.c b/mm/slab_common.c
|
|
index 4c9f0a87f733..1f24fcc2bc7f 100644
|
|
--- a/mm/slab_common.c
|
|
+++ b/mm/slab_common.c
|
|
@@ -27,6 +27,7 @@
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/page.h>
|
|
#include <linux/memcontrol.h>
|
|
+#include <linux/seq_buf.h>
|
|
#include <linux/stackdepot.h>
|
|
#include <trace/events/rcu.h>
|
|
|
|
@@ -1134,10 +1135,15 @@ static int slab_show(struct seq_file *m, void *p)
|
|
return 0;
|
|
}
|
|
|
|
-void dump_unreclaimable_slab(void)
|
|
+void dump_unreclaimable_slab(struct seq_buf *out)
|
|
{
|
|
struct kmem_cache *s;
|
|
struct slabinfo sinfo;
|
|
+ struct slab_by_mem {
|
|
+ struct kmem_cache *s;
|
|
+ size_t total, active;
|
|
+ } slabs_by_mem[10], n;
|
|
+ int i, nr = 0;
|
|
|
|
/*
|
|
* Here acquiring slab_mutex is risky since we don't prefer to get
|
|
@@ -1147,24 +1153,52 @@ void dump_unreclaimable_slab(void)
|
|
* without acquiring the mutex.
|
|
*/
|
|
if (!mutex_trylock(&slab_mutex)) {
|
|
- pr_warn("excessive unreclaimable slab but cannot dump stats\n");
|
|
+ seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n");
|
|
return;
|
|
}
|
|
|
|
- pr_info("Unreclaimable slab info:\n");
|
|
- pr_info("Name Used Total\n");
|
|
-
|
|
list_for_each_entry(s, &slab_caches, list) {
|
|
if (s->flags & SLAB_RECLAIM_ACCOUNT)
|
|
continue;
|
|
|
|
get_slabinfo(s, &sinfo);
|
|
|
|
- if (sinfo.num_objs > 0)
|
|
- pr_info("%-17s %10luKB %10luKB\n", s->name,
|
|
- (sinfo.active_objs * s->size) / 1024,
|
|
- (sinfo.num_objs * s->size) / 1024);
|
|
+ if (!sinfo.num_objs)
|
|
+ continue;
|
|
+
|
|
+ n.s = s;
|
|
+ n.total = sinfo.num_objs * s->size;
|
|
+ n.active = sinfo.active_objs * s->size;
|
|
+
|
|
+ for (i = 0; i < nr; i++)
|
|
+ if (n.total < slabs_by_mem[i].total)
|
|
+ break;
|
|
+
|
|
+ if (nr < ARRAY_SIZE(slabs_by_mem)) {
|
|
+ memmove(&slabs_by_mem[i + 1],
|
|
+ &slabs_by_mem[i],
|
|
+ sizeof(slabs_by_mem[0]) * (nr - i));
|
|
+ nr++;
|
|
+ } else if (i) {
|
|
+ i--;
|
|
+ memmove(&slabs_by_mem[0],
|
|
+ &slabs_by_mem[1],
|
|
+ sizeof(slabs_by_mem[0]) * i);
|
|
+ } else {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ slabs_by_mem[i] = n;
|
|
}
|
|
+
|
|
+ for (i = nr - 1; i >= 0; --i) {
|
|
+ seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name);
|
|
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2);
|
|
+ seq_buf_printf(out, " active: ");
|
|
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2);
|
|
+ seq_buf_putc(out, '\n');
|
|
+ }
|
|
+
|
|
mutex_unlock(&slab_mutex);
|
|
}
|
|
|
|
--
|
|
2.49.0
|
|
|