From b78adf1d9502a4e206b13f7348370eb483a5edcc Mon Sep 17 00:00:00 2001
From: Alexander Miroshnichenko <alex@millerson.name>
Date: Thu, 27 Mar 2025 14:52:55 +0300
Subject: [PATCH] bcachefs: cherry-pick updates from master 4594600
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit

Signed-off-by: Alexander Miroshnichenko <alex@millerson.name>
---
 .../bcachefs/SubmittingPatches.rst | 43 +-
 .../filesystems/bcachefs/casefolding.rst | 90 ++
 Documentation/filesystems/bcachefs/index.rst | 20 +-
 block/blk-core.c | 19 +-
 fs/bcachefs/Kconfig | 2 +-
 fs/bcachefs/Makefile | 3 +-
 fs/bcachefs/alloc_background.c | 190 ++--
 fs/bcachefs/alloc_background.h | 2 +-
 fs/bcachefs/alloc_foreground.c | 31 +-
 fs/bcachefs/alloc_foreground.h | 19 +-
 fs/bcachefs/alloc_types.h | 2 +
 fs/bcachefs/backpointers.c | 151 ++-
 fs/bcachefs/backpointers.h | 26 +-
 fs/bcachefs/bcachefs.h | 20 +-
 fs/bcachefs/bcachefs_format.h | 16 +-
 fs/bcachefs/bcachefs_ioctl.h | 29 +-
 fs/bcachefs/bkey.h | 1 +
 fs/bcachefs/btree_cache.c | 1 +
 fs/bcachefs/btree_gc.c | 18 +-
 fs/bcachefs/btree_io.c | 259 +++++-
 fs/bcachefs/btree_io.h | 4 +
 fs/bcachefs/btree_iter.c | 14 -
 fs/bcachefs/btree_iter.h | 9 +-
 fs/bcachefs/btree_locking.c | 8 +-
 fs/bcachefs/btree_node_scan.c | 29 +-
 fs/bcachefs/btree_trans_commit.c | 120 +--
 fs/bcachefs/btree_types.h | 13 +
 fs/bcachefs/btree_update.c | 5 +-
 fs/bcachefs/btree_update.h | 2 +
 fs/bcachefs/btree_update_interior.c | 150 +--
 fs/bcachefs/btree_update_interior.h | 7 +
 fs/bcachefs/buckets.c | 80 +-
 fs/bcachefs/buckets.h | 31 +-
 fs/bcachefs/buckets_types.h | 27 +
 fs/bcachefs/chardev.c | 38 +-
 fs/bcachefs/checksum.c | 25 +-
 fs/bcachefs/checksum.h | 2 +
 fs/bcachefs/compress.c | 65 +-
 fs/bcachefs/data_update.c | 237 +++--
 fs/bcachefs/data_update.h | 17 +-
 fs/bcachefs/debug.c | 34 +-
 fs/bcachefs/dirent.c | 274 +++++-
 fs/bcachefs/dirent.h | 17 +-
 fs/bcachefs/dirent_format.h | 20 +-
 fs/bcachefs/disk_accounting.c | 11 +
 fs/bcachefs/disk_accounting.h | 26 +-
 fs/bcachefs/disk_accounting_format.h | 90 +-
 fs/bcachefs/ec.c | 482 ++++------
 fs/bcachefs/ec.h | 46 +-
 fs/bcachefs/ec_types.h | 12 +-
 fs/bcachefs/errcode.h | 65 +-
 fs/bcachefs/error.c | 88 +-
 fs/bcachefs/error.h | 57 +-
 fs/bcachefs/extents.c | 245 +++--
 fs/bcachefs/extents.h | 24 +-
 fs/bcachefs/extents_format.h | 24 +-
 fs/bcachefs/extents_types.h | 11 +-
 fs/bcachefs/eytzinger.c | 76 +-
 fs/bcachefs/eytzinger.h | 95 +-
 fs/bcachefs/fs-io-buffered.c | 40 +-
 fs/bcachefs/fs-io-direct.c | 20 +-
 fs/bcachefs/fs-ioctl.c | 30 +-
 fs/bcachefs/fs-ioctl.h | 20 +-
 fs/bcachefs/fs.c | 139 +--
 fs/bcachefs/fsck.c | 231 +----
 fs/bcachefs/inode.c | 24 +-
 fs/bcachefs/inode.h | 1 +
 fs/bcachefs/inode_format.h | 3 +-
 fs/bcachefs/io_misc.c | 3 +-
 fs/bcachefs/io_read.c | 856 +++++++++++-------
 fs/bcachefs/io_read.h | 97 +-
 fs/bcachefs/io_write.c | 438 ++++-----
 fs/bcachefs/io_write.h | 38 +-
 fs/bcachefs/io_write_types.h | 2 +-
 fs/bcachefs/journal.c | 202 +++--
 fs/bcachefs/journal.h | 42 +-
 fs/bcachefs/journal_io.c | 99 +-
 fs/bcachefs/journal_reclaim.c | 10 +-
 fs/bcachefs/journal_seq_blacklist.c | 7 +-
 fs/bcachefs/journal_types.h | 37 +-
 fs/bcachefs/lru.c | 100 +-
 fs/bcachefs/lru.h | 22 +-
 fs/bcachefs/lru_format.h | 6 +-
 fs/bcachefs/migrate.c | 26 +-
 fs/bcachefs/move.c | 505 +++++++----
 fs/bcachefs/move_types.h | 20 +-
 fs/bcachefs/movinggc.c | 15 +-
 fs/bcachefs/{fs-common.c => namei.c} | 210 ++++-
 fs/bcachefs/{fs-common.h => namei.h} | 31 +-
 fs/bcachefs/opts.c | 117 ++-
 fs/bcachefs/opts.h | 70 +-
 fs/bcachefs/progress.c | 63 ++
 fs/bcachefs/progress.h | 29 +
 fs/bcachefs/rebalance.c | 52 +-
 fs/bcachefs/recovery.c | 4 +-
 fs/bcachefs/recovery_passes_types.h | 2 +-
 fs/bcachefs/reflink.c | 23 +-
 fs/bcachefs/sb-counters.c | 90 +-
 fs/bcachefs/sb-counters.h | 4 +
 fs/bcachefs/sb-counters_format.h | 32 +-
 fs/bcachefs/sb-downgrade.c | 8 +-
 fs/bcachefs/sb-errors_format.h | 6 +-
 fs/bcachefs/sb-members.h | 16 +-
 fs/bcachefs/sb-members_format.h | 1 +
 fs/bcachefs/snapshot.c | 7 +-
 fs/bcachefs/snapshot.h | 1 +
 fs/bcachefs/str_hash.c | 2 +-
 fs/bcachefs/str_hash.h | 12 +-
 fs/bcachefs/super-io.c | 92 +-
 fs/bcachefs/super-io.h | 10 +-
 fs/bcachefs/super.c | 141 ++-
 fs/bcachefs/super.h | 2 +
 fs/bcachefs/super_types.h | 8 +-
 fs/bcachefs/sysfs.c | 257 ++++--
 fs/bcachefs/sysfs.h | 5 +-
 fs/bcachefs/time_stats.c | 22 +-
 fs/bcachefs/time_stats.h | 1 +
 fs/bcachefs/trace.h | 106 +--
 fs/bcachefs/util.c | 231 ++++-
 fs/bcachefs/util.h | 16 +-
 fs/bcachefs/xattr.c | 2 +-
 121 files changed, 5238 insertions(+), 2990 deletions(-)
 create mode 100644 Documentation/filesystems/bcachefs/casefolding.rst
 rename fs/bcachefs/{fs-common.c => namei.c} (73%)
 rename fs/bcachefs/{fs-common.h => namei.h} (61%)
 create mode 100644 fs/bcachefs/progress.c
 create mode 100644 fs/bcachefs/progress.h

diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst
index 026b12ae0d6a..a455f9cfd15c 100644
--- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst
+++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst
@@ -1,8 +1,13 @@
-Submitting patches to bcachefs:
-===============================
+Submitting patches to bcachefs
+==============================
+
+Here are suggestions for submitting patches to bcachefs subsystem.
+
+Submission checklist
+--------------------
 
 Patches must be tested before being submitted, either with the xfstests suite
-[0], or the full bcachefs test suite in ktest [1], depending on what's being
+[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being
 touched. Note that ktest wraps xfstests and will be an easier method to running
 it for most users; it includes single-command wrappers for all the mainstream
 in-kernel local filesystems.
@@ -26,21 +31,21 @@ considered out of date), but try not to deviate too much without reason.
 Focus on writing code that reads well and is organized well; code should be
 aesthetically pleasing.
 
-CI:
-===
+CI
+--
 
 Instead of running your tests locally, when running the full test suite it's
 prefereable to let a server farm do it in parallel, and then have the results
 in a nice test dashboard (which can tell you which failures are new, and
 presents results in a git log view, avoiding the need for most bisecting).
 
-That exists [2], and community members may request an account. If you work for
+That exists [2]_, and community members may request an account. If you work for
 a big tech company, you'll need to help out with server costs to get access -
 but the CI is not restricted to running bcachefs tests: it runs any ktest test
 (which generally makes it easy to wrap other tests that can run in qemu).
 
-Other things to think about:
-============================
+Other things to think about
+---------------------------
 
 - How will we debug this code? Is there sufficient introspection to diagnose
   when something starts acting wonky on a user machine?
@@ -79,20 +84,22 @@ Other things to think about:
   tested? (Automated tests exists but aren't in the CI, due to the hassle of
   disk image management; coordinate to have them run.)
 
-Mailing list, IRC:
-==================
+Mailing list, IRC
+-----------------
 
-Patches should hit the list [3], but much discussion and code review happens on
-IRC as well [4]; many people appreciate the more conversational approach and
-quicker feedback.
+Patches should hit the list [3]_, but much discussion and code review happens
+on IRC as well [4]_; many people appreciate the more conversational approach
+and quicker feedback.
 
 Additionally, we have a lively user community doing excellent QA work, which
 exists primarily on IRC. Please make use of that resource; user feedback is
 important for any nontrivial feature, and documenting it in commit messages
 would be a good idea.
 
-[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
-[1]: https://evilpiepirate.org/git/ktest.git/
-[2]: https://evilpiepirate.org/~testdashboard/ci/
-[3]: linux-bcachefs@vger.kernel.org
-[4]: irc.oftc.net#bcache, #bcachefs-dev
+.. rubric:: References
+
+.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
+.. [1] https://evilpiepirate.org/git/ktest.git/
+.. [2] https://evilpiepirate.org/~testdashboard/ci/
+.. [3] linux-bcachefs@vger.kernel.org
+.. [4] irc.oftc.net#bcache, #bcachefs-dev
diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst
new file mode 100644
index 000000000000..ba5de97d155f
--- /dev/null
+++ b/Documentation/filesystems/bcachefs/casefolding.rst
@@ -0,0 +1,90 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Casefolding
+===========
+
+bcachefs has support for case-insensitive file and directory
+lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`)
+casefolding attributes.
+
+The main usecase for casefolding is compatibility with software written
+against other filesystems that rely on casefolded lookups
+(eg. NTFS and Wine/Proton).
+Taking advantage of file-system level casefolding can lead to great
+loading time gains in many applications and games.
+
+Casefolding support requires a kernel with the `CONFIG_UNICODE` enabled.
+Once a directory has been flagged for casefolding, a feature bit
+is enabled on the superblock which marks the filesystem as using
+casefolding.
+When the feature bit for casefolding is enabled, it is no longer possible
+to mount that filesystem on kernels without `CONFIG_UNICODE` enabled.
+
+On the lookup/query side: casefolding is implemented by allocating a new
+string of `BCH_NAME_MAX` length using the `utf8_casefold` function to
+casefold the query string.
+
+On the dirent side: casefolding is implemented by ensuring the `bkey`'s
+hash is made from the casefolded string and storing the cached casefolded
+name with the regular name in the dirent.
+
+The structure looks like this:
+
+* Regular: [dirent data][regular name][nul][nul]...
+* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]...
+
+(Do note, the number of NULs here is merely for illustration; their count can
+vary per-key, and they may not even be present if the key is aligned to
+`sizeof(u64)`.)
+
+This is efficient as it means that for all file lookups that require casefolding,
+it has identical performance to a regular lookup:
+a hash comparison and a `memcmp` of the name.
+
+Rationale
+---------
+
+Several designs were considered for this system:
+One was to introduce a dirent_v2, however that would be painful especially as
+the hash system only has support for a single key type. This would also need
+`BCH_NAME_MAX` to change between versions, and a new feature bit.
+
+Another option was to store without the two lengths, and just take the length of
+the regular name and casefolded name contiguously / 2 as the length. This would
+assume that the regular length == casefolded length, but that could potentially
+not be true, if the uppercase unicode glyph had a different UTF-8 encoding than
+the lowercase unicode glyph.
+It would be possible to disregard the casefold cache for those cases, but it was
+decided to simply encode the two string lengths in the key to avoid random
+performance issues if this edgecase was ever hit.
+
+The option settled on was to use a free-bit in d_type to mark a dirent as having
+a casefold cache, and then treat the first 4 bytes the name block as lengths.
+You can see this in the `d_cf_name_block` member of union in `bch_dirent`.
+
+The feature bit was used to allow casefolding support to be enabled for the majority
+of users, but some allow users who have no need for the feature to still use bcachefs as
+`CONFIG_UNICODE` can increase the kernel side a significant amount due to the tables used,
+which may be decider between using bcachefs for eg. embedded platforms.
+
+Other filesystems like ext4 and f2fs have a super-block level option for casefolding
+encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose
+any encodings than a single UTF-8 version. When future encodings are desirable,
+they will be added trivially using the opts mechanism.
+
+dentry/dcache considerations
+----------------------------
+
+Currently, in casefolded directories, bcachefs (like other filesystems) will not cache
+negative dentry's.
+
+This is because currently doing so presents a problem in the following scenario:
+
+ - Lookup file "blAH" in a casefolded directory
+ - Creation of file "BLAH" in a casefolded directory
+ - Lookup file "blAH" in a casefolded directory
+
+This would fail if negative dentry's were cached.
+
+This is slightly suboptimal, but could be fixed in future with some vfs work.
+
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst
index 7db4d7ceab58..3864d0ae89c1 100644
--- a/Documentation/filesystems/bcachefs/index.rst
+++ b/Documentation/filesystems/bcachefs/index.rst
@@ -4,10 +4,28 @@
 bcachefs Documentation
 ======================
 
+Subsystem-specific development process notes
+--------------------------------------------
+
+Development notes specific to bcachefs. These are intended to supplement
+:doc:`general kernel development handbook </process/index>`.
+
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :numbered:
 
    CodingStyle
    SubmittingPatches
+
+Filesystem implementation
+-------------------------
+
+Documentation for filesystem features and their implementation details.
+At this moment, only a few of these are described here.
+
+.. toctree::
+   :maxdepth: 1
+   :numbered:
+
+   casefolding
    errorcodes
diff --git a/block/blk-core.c b/block/blk-core.c
index d6c4fa3943b5..7b1103eb877d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -793,20 +793,21 @@ void submit_bio_noacct(struct bio *bio)
 		goto end_io;
 	}
 
+	if (WARN_ON_ONCE((bio->bi_opf & REQ_PREFLUSH) &&
+			 bio_op(bio) != REQ_OP_WRITE &&
+			 bio_op(bio) != REQ_OP_ZONE_APPEND))
+		goto end_io;
+
 	/*
 	 * Filter flush bio's early so that bio based drivers without flush
 	 * support don't have to worry about them.
 	 */
-	if (op_is_flush(bio->bi_opf)) {
-		if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
-				 bio_op(bio) != REQ_OP_ZONE_APPEND))
+	if (op_is_flush(bio->bi_opf) &&
+	    !bdev_write_cache(bdev)) {
+		bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
+		if (!bio_sectors(bio)) {
+			status = BLK_STS_OK;
 			goto end_io;
-		if (!bdev_write_cache(bdev)) {
-			bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
-			if (!bio_sectors(bio)) {
-				status = BLK_STS_OK;
-				goto end_io;
-			}
 		}
 	}
 
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index fc7efd0a7525..c9798750202d 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -16,7 +16,7 @@ config BCACHEFS_FS
 	select ZSTD_COMPRESS
 	select ZSTD_DECOMPRESS
 	select CRYPTO
-	select CRYPTO_SHA256
+	select CRYPTO_LIB_SHA256
 	select CRYPTO_CHACHA20
 	select CRYPTO_POLY1305
 	select KEYS
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index d2689388d5e8..9af65079374f 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -41,7 +41,6 @@ bcachefs-y := \
 	extent_update.o \
 	eytzinger.o \
 	fs.o \
-	fs-common.o \
 	fs-ioctl.o \
 	fs-io.o \
 	fs-io-buffered.o \
@@ -64,9 +63,11 @@ bcachefs-y := \
 	migrate.o \
 	move.o \
 	movinggc.o \
+	namei.o \
 	nocow_locking.o \
 	opts.o \
 	printbuf.o \
+	progress.o \
 	quota.o \
 	rebalance.o \
 	rcu_pending.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 3ea809990ef1..5fb396be9127 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
 	int ret = 0;
 
 	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
-			 c, alloc_v2_unpack_error,
+			 c, alloc_v3_unpack_error,
 			 "unpack error");
 fsck_err:
 	return ret;
@@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s
 						    s64 delta_sectors,
 						    s64 delta_fragmented, unsigned flags)
 {
-	struct disk_accounting_pos acc = {
-		.type = BCH_DISK_ACCOUNTING_dev_data_type,
-		.dev_data_type.dev = ca->dev_idx,
-		.dev_data_type.data_type = data_type,
-	};
 	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
 
-	return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+	return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+					 d, dev_data_type,
+					 .dev = ca->dev_idx,
+					 .data_type = data_type);
 }
 
 int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
@@ -837,7 +835,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 
 	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
 	if (!ca)
-		return -EIO;
+		return -BCH_ERR_trigger_alloc;
 
 	struct bch_alloc_v4 old_a_convert;
 	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -871,6 +869,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 		if (data_type_is_empty(new_a->data_type) &&
 		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
 		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+			if (new_a->oldest_gen == new_a->gen &&
+			    !bch2_bucket_sectors_total(*new_a))
+				new_a->oldest_gen++;
 			new_a->gen++;
 			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
 			alloc_data_type_set(new_a, new_a->data_type);
@@ -889,26 +890,20 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 		    !new_a->io_time[READ])
 			new_a->io_time[READ] = bch2_current_io_time(c, READ);
 
-		u64 old_lru = alloc_lru_idx_read(*old_a);
-		u64 new_lru = alloc_lru_idx_read(*new_a);
-		if (old_lru != new_lru) {
-			ret = bch2_lru_change(trans, new.k->p.inode,
-					      bucket_to_u64(new.k->p),
-					      old_lru, new_lru);
-			if (ret)
-				goto err;
-		}
+		ret = bch2_lru_change(trans, new.k->p.inode,
+				      bucket_to_u64(new.k->p),
+				      alloc_lru_idx_read(*old_a),
+				      alloc_lru_idx_read(*new_a));
+		if (ret)
+			goto err;
 
-		old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
-		new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
-		if (old_lru != new_lru) {
-			ret = bch2_lru_change(trans,
-					      BCH_LRU_FRAGMENTATION_START,
-					      bucket_to_u64(new.k->p),
-					      old_lru, new_lru);
-			if (ret)
-				goto err;
-		}
+		ret = bch2_lru_change(trans,
+				      BCH_LRU_BUCKET_FRAGMENTATION,
+				      bucket_to_u64(new.k->p),
+				      alloc_lru_idx_fragmentation(*old_a, ca),
+				      alloc_lru_idx_fragmentation(*new_a, ca));
+		if (ret)
+			goto err;
 
 		if (old_a->gen != new_a->gen) {
 			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
@@ -1034,7 +1029,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 invalid_bucket:
 	bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
 			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
-	ret = -EIO;
+	ret = -BCH_ERR_trigger_alloc;
 	goto err;
 }
 
@@ -1705,7 +1700,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 
 	u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
 	if (lru_idx) {
-		ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+		ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
+					 bucket_to_u64(alloc_k.k->p),
 					 lru_idx, alloc_k, last_flushed);
 		if (ret)
 			goto err;
@@ -1735,7 +1731,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 		a = &a_mut->v;
 	}
 
-	ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+	ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
+				 bucket_to_u64(alloc_k.k->p),
+				 a->io_time[READ],
 				 alloc_k, last_flushed);
 	if (ret)
 		goto err;
@@ -1757,7 +1755,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 				POS_MIN, BTREE_ITER_prefetch, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+			bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
+		bch2_check_stripe_to_lru_refs(c);
 
 	bch2_bkey_buf_exit(&last_flushed, c);
 	bch_err_fn(c, ret);
@@ -1805,6 +1804,19 @@ struct discard_buckets_state {
 	u64 discarded;
 };
 
+/*
+ * This is needed because discard is both a filesystem option and a device
+ * option, and mount options are supposed to apply to that mount and not be
+ * persisted, i.e. if it's set as a mount option we can't propagate it to the
+ * device.
+ */
+static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
+{
+	return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
+		? c->opts.discard
+		: ca->mi.discard;
+}
+
 static int bch2_discard_one_bucket(struct btree_trans *trans,
 				   struct bch_dev *ca,
 				   struct btree_iter *need_discard_iter,
@@ -1868,7 +1880,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 	s->discarded++;
 	*discard_pos_done = iter.pos;
 
-	if (ca->mi.discard && !c->opts.nochanges) {
+	if (discard_opt_enabled(c, ca) && !c->opts.nochanges) {
 		/*
 		 * This works without any other locks because this is the only
 		 * thread that removes items from the need_discard tree
@@ -1897,7 +1909,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
-	count_event(c, bucket_discard);
+	if (!fastpath)
+		count_event(c, bucket_discard);
+	else
+		count_event(c, bucket_discard_fast);
 out:
 fsck_err:
 	if (discard_locked)
@@ -2055,16 +2070,71 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
 	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 }
 
+static int invalidate_one_bp(struct btree_trans *trans,
+			     struct bch_dev *ca,
+			     struct bkey_s_c_backpointer bp,
+			     struct bkey_buf *last_flushed)
+{
+	struct btree_iter extent_iter;
+	struct bkey_s_c extent_k =
+		bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
+	int ret = bkey_err(extent_k);
+	if (ret)
+		return ret;
+
+	struct bkey_i *n =
+		bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
+				   BTREE_UPDATE_internal_snapshot_node);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		goto err;
+
+	bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
+err:
+	bch2_trans_iter_exit(trans, &extent_iter);
+	return ret;
+}
+
+static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
+					struct bch_dev *ca,
+					struct bpos bucket,
+					u8 gen,
+					struct bkey_buf *last_flushed)
+{
+	struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
+	struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
+
+	return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
+					     bp_start, bp_end, 0, k,
+					     NULL, NULL,
+					     BCH_WATERMARK_btree|
+					     BCH_TRANS_COMMIT_no_enospc, ({
+		if (k.k->type != KEY_TYPE_backpointer)
+			continue;
+
+		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+		if (bp.v->bucket_gen != gen)
+			continue;
+
+		/* filter out bps with gens that don't match */
+
+		invalidate_one_bp(trans, ca, bp, last_flushed);
+	}));
+}
+
+noinline_for_stack
 static int invalidate_one_bucket(struct btree_trans *trans,
+				 struct bch_dev *ca,
 				 struct btree_iter *lru_iter,
 				 struct bkey_s_c lru_k,
+				 struct bkey_buf *last_flushed,
 				 s64 *nr_to_invalidate)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_i_alloc_v4 *a = NULL;
 	struct printbuf buf = PRINTBUF;
 	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
-	unsigned cached_sectors;
+	struct btree_iter alloc_iter = {};
 	int ret = 0;
 
 	if (*nr_to_invalidate <= 0)
@@ -2081,35 +2151,37 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
 		return 0;
 
-	a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
-	ret = PTR_ERR_OR_ZERO(a);
+	struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+						     BTREE_ID_alloc, bucket,
+						     BTREE_ITER_cached);
+	ret = bkey_err(alloc_k);
 	if (ret)
-		goto out;
+		return ret;
+
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
 
 	/* We expect harmless races here due to the btree write buffer: */
-	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
 		goto out;
 
-	BUG_ON(a->v.data_type != BCH_DATA_cached);
-	BUG_ON(a->v.dirty_sectors);
+	/*
+	 * Impossible since alloc_lru_idx_read() only returns nonzero if the
+	 * bucket is supposed to be on the cached bucket LRU (i.e.
+	 * BCH_DATA_cached)
+	 *
+	 * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
+	 */
+	BUG_ON(a->data_type != BCH_DATA_cached);
+	BUG_ON(a->dirty_sectors);
 
-	if (!a->v.cached_sectors)
+	if (!a->cached_sectors)
 		bch_err(c, "invalidating empty bucket, confused");
 
-	cached_sectors = a->v.cached_sectors;
+	unsigned cached_sectors = a->cached_sectors;
+	u8 gen = a->gen;
 
-	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
-	a->v.gen++;
-	a->v.data_type = 0;
-	a->v.dirty_sectors = 0;
-	a->v.stripe_sectors = 0;
-	a->v.cached_sectors = 0;
-	a->v.io_time[READ] = bch2_current_io_time(c, READ);
-	a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
-
-	ret = bch2_trans_commit(trans, NULL, NULL,
-				BCH_WATERMARK_btree|
-				BCH_TRANS_COMMIT_no_enospc);
+	ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
 	if (ret)
 		goto out;
 
@@ -2117,6 +2189,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 	--*nr_to_invalidate;
 out:
 fsck_err:
+	bch2_trans_iter_exit(trans, &alloc_iter);
 	printbuf_exit(&buf);
 	return ret;
 }
@@ -2143,6 +2216,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 	struct btree_trans *trans = bch2_trans_get(c);
 	int ret = 0;
 
+	struct bkey_buf last_flushed;
+	bch2_bkey_buf_init(&last_flushed);
+	bkey_init(&last_flushed.k->k);
+
 	ret = bch2_btree_write_buffer_tryflush(trans);
 	if (ret)
 		goto err;
@@ -2167,7 +2244,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 		if (!k.k)
 			break;
 
-		ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+		ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
restart_err:
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			continue;
@@ -2180,6 +2257,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
err:
 	bch2_trans_put(trans);
 	percpu_ref_put(&ca->io_ref);
+	bch2_bkey_buf_exit(&last_flushed, c);
 	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index de25ba4ee94b..c556ccaffe89 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
 	if (a.stripe)
 		return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
 	if (bch2_bucket_sectors_dirty(a))
-		return data_type;
+		return bucket_data_type(data_type);
 	if (a.cached_sectors)
 		return BCH_DATA_cached;
 	if (BCH_ALLOC_V4_NEED_DISCARD(&a))
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 5a781fb4c794..0cac65347a5d 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 
 void bch2_open_bucket_write_error(struct bch_fs *c,
 				  struct open_buckets *obs,
-				  unsigned dev)
+				  unsigned dev, int err)
 {
 	struct open_bucket *ob;
 	unsigned i;
 
 	open_bucket_for_each(c, obs, ob, i)
 		if (ob->dev == dev && ob->ec)
-			bch2_ec_bucket_cancel(c, ob);
+			bch2_ec_bucket_cancel(c, ob, err);
 }
 
 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
@@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
 	closure_wake_up(&c->freelist_wait);
 }
 
-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
-{
-	switch (watermark) {
-	case BCH_WATERMARK_interior_updates:
-		return 0;
-	case BCH_WATERMARK_reclaim:
-		return OPEN_BUCKETS_COUNT / 6;
-	case BCH_WATERMARK_btree:
-	case BCH_WATERMARK_btree_copygc:
-		return OPEN_BUCKETS_COUNT / 4;
-	case BCH_WATERMARK_copygc:
-		return OPEN_BUCKETS_COUNT / 3;
-	default:
-		return OPEN_BUCKETS_COUNT / 2;
-	}
-}
-
 static inline bool may_alloc_bucket(struct bch_fs *c,
 				    struct bpos bucket,
 				    struct bucket_alloc_state *s)
@@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 
 	spin_lock(&c->freelist_lock);
 
-	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);
 
@@ -648,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
 					 struct bch_dev_usage *usage)
 {
 	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+	u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
 	u64 free_space_inv = free_space
 		? div64_u64(1ULL << 48, free_space)
 		: 1ULL << 48;
@@ -728,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 
 		struct bch_dev_usage usage;
 		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
-					cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+					cl, flags & BCH_WRITE_alloc_nowait, &usage);
 		if (!IS_ERR(ob))
 			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
 		bch2_dev_put(ca);
@@ -1336,7 +1319,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	if (wp->data_type != BCH_DATA_user)
 		have_cache = true;
 
-	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+	if (target && !(flags & BCH_WRITE_only_specified_devs)) {
 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
@@ -1426,7 +1409,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
 		ret = -BCH_ERR_bucket_alloc_blocked;
 
-	if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
+	if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
 		ret = -BCH_ERR_bucket_alloc_blocked;
 
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index f25481a0d1a0..69ec6a012898 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
 	return bch2_dev_have_ref(c, ob->dev);
 }
 
+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
+{
+	switch (watermark) {
+	case BCH_WATERMARK_interior_updates:
+		return 0;
+	case BCH_WATERMARK_reclaim:
+		return OPEN_BUCKETS_COUNT / 6;
+	case BCH_WATERMARK_btree:
+	case BCH_WATERMARK_btree_copygc:
+		return OPEN_BUCKETS_COUNT / 4;
+	case BCH_WATERMARK_copygc:
+		return OPEN_BUCKETS_COUNT / 3;
+	default:
+		return OPEN_BUCKETS_COUNT / 2;
+	}
+}
+
 struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
 				      enum bch_watermark, enum bch_data_type,
 				      struct closure *);
@@ -65,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
 }
 
 void bch2_open_bucket_write_error(struct bch_fs *,
-				  struct open_buckets *, unsigned);
+				  struct open_buckets *, unsigned, int);
 
 void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
 
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 4aa8ee026cb8..8f79f46c2a78 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -90,6 +90,7 @@ struct dev_stripe_state {
 	x(stopped) \
 	x(waiting_io) \
 	x(waiting_work) \
+	x(runnable) \
 	x(running)
 
 enum write_point_state {
@@ -125,6 +126,7 @@ struct write_point {
 		enum write_point_state state;
 		u64 last_state_change;
 		u64 time[WRITE_POINT_STATE_NR];
+		u64 last_runtime;
 	} __aligned(SMP_CACHE_BYTES);
 };
 
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ebeb6a5ff9d2..20c497f0c2cb 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -11,6 +11,7 @@
 #include "checksum.h"
 #include "disk_accounting.h"
 #include "error.h"
+#include "progress.h"
 
 #include <linux/mm.h>
 
@@ -49,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
 	}
 
 	bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+	prt_str(out, " data_type=");
+	bch2_prt_data_type(out, bp.v->data_type);
 	prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
 		   (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
 		   bp.v->bucket_len,
@@ -244,27 +247,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
 	if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
 		return bkey_s_c_null;
 
-	if (likely(!bp.v->level)) {
-		bch2_trans_node_iter_init(trans, iter,
-					  bp.v->btree_id,
-					  bp.v->pos,
-					  0, 0,
-					  iter_flags);
-		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-		if (bkey_err(k)) {
-			bch2_trans_iter_exit(trans, iter);
-			return k;
-		}
+	bch2_trans_node_iter_init(trans, iter,
+				  bp.v->btree_id,
+				  bp.v->pos,
+				  0,
+				  bp.v->level,
+				  iter_flags);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	if (bkey_err(k)) {
+		bch2_trans_iter_exit(trans, iter);
+		return k;
+	}
 
-		if (k.k &&
-		    extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
-			return k;
+	if (k.k &&
+	    extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
+		return k;
 
-		bch2_trans_iter_exit(trans, iter);
+	bch2_trans_iter_exit(trans, iter);
+
+	if (!bp.v->level) {
 		int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
 		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 	} else {
 		struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
+			return bkey_s_c_null;
 		if (IS_ERR_OR_NULL(b))
 			return ((struct bkey_s_c) { .k = ERR_CAST(b) });
 
@@ -514,6 +521,22 @@ static int check_bp_exists(struct btree_trans *trans,
 	if (!other_extent.k)
 		goto missing;
 
+	rcu_read_lock();
+	struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
+	if (ca) {
+		struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
+		bkey_for_each_ptr(other_extent_ptrs, ptr)
+			if (ptr->dev == bp->k.p.inode &&
+			    dev_ptr_stale_rcu(ca, ptr)) {
+				ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+							  other_extent, bp->k.p.inode);
+				if (ret)
+					goto err;
+				goto out;
+			}
+	}
+	rcu_read_unlock();
+
 	if (bch2_extents_match(orig_k, other_extent)) {
 		printbuf_reset(&buf);
 		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
@@ -590,9 +613,6 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 	struct extent_ptr_decoded p;
 
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		if (p.ptr.cached)
-			continue;
-
 		if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
 			continue;
 
@@ -600,9 +620,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
 		bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
 		bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
+
+		bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
 		rcu_read_unlock();
 
-		if (check || empty) {
+		if ((check || empty) && !stale) {
 			struct bkey_i_backpointer bp;
 			bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
 
@@ -715,71 +737,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 	return ret;
 }
 
-struct progress_indicator_state {
-	unsigned long next_print;
-	u64 nodes_seen;
-	u64 nodes_total;
-	struct btree *last_node;
-};
-
-static inline void progress_init(struct progress_indicator_state *s,
-				 struct bch_fs *c,
-				 u64 btree_id_mask)
-{
-	memset(s, 0, sizeof(*s));
-
-	s->next_print = jiffies + HZ * 10;
-
-	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
-		if (!(btree_id_mask & BIT_ULL(i)))
-			continue;
-
-		struct disk_accounting_pos acc = {
-			.type = BCH_DISK_ACCOUNTING_btree,
-			.btree.id = i,
-		};
-
-		u64 v;
-		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-		s->nodes_total += div64_ul(v, btree_sectors(c));
-	}
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
-	bool ret = time_after_eq(jiffies, s->next_print);
-
-	if (ret)
-		s->next_print = jiffies + HZ * 10;
-	return ret;
-}
-
-static void progress_update_iter(struct btree_trans *trans,
-				 struct progress_indicator_state *s,
-				 struct btree_iter *iter,
-				 const char *msg)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
-	s->nodes_seen += b != s->last_node;
-	s->last_node = b;
-
-	if (progress_update_p(s)) {
-		struct printbuf buf = PRINTBUF;
-		unsigned percent = s->nodes_total
-			? div64_u64(s->nodes_seen * 100, s->nodes_total)
-			: 0;
-
-		prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
-			   msg, percent, s->nodes_seen, s->nodes_total);
-		bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
-		bch_info(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
-}
-
 static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 						   struct extents_to_bp_state *s)
 {
@@ -787,7 +744,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 	struct progress_indicator_state progress;
 	int ret = 0;
 
-	progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
 
 	for (enum btree_id btree_id = 0;
 	     btree_id < btree_id_nr_alive(c);
@@ -806,7 +763,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 					  BTREE_ITER_prefetch);
 
 		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-			progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+			bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
 			check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 		}));
@@ -827,7 +784,7 @@ enum alloc_sector_counter {
 	ALLOC_SECTORS_NR
 };
 
-static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
+static int data_type_to_alloc_counter(enum bch_data_type t)
 {
 	switch (t) {
 	case BCH_DATA_btree:
@@ -836,9 +793,10 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
 	case BCH_DATA_cached:
 		return ALLOC_cached;
 	case BCH_DATA_stripe:
+	case BCH_DATA_parity:
 		return ALLOC_stripe;
 	default:
-		BUG();
+		return -1;
 	}
 }
 
@@ -889,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		if (bp.v->bucket_gen != a->gen)
 			continue;
 
-		sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+		int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
+		if (alloc_counter < 0)
+			continue;
+
+		sectors[alloc_counter] += bp.v->bucket_len;
 	};
 	bch2_trans_iter_exit(trans, &iter);
 	if (ret)
@@ -901,9 +863,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 			goto err;
 	}
 
-	/* Cached pointers don't have backpointers: */
-
 	if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+	    sectors[ALLOC_cached] != a->cached_sectors ||
 	    sectors[ALLOC_stripe] != a->stripe_sectors) {
 		if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
 			ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
@@ -912,6 +873,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		}
 
 		if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+		    sectors[ALLOC_cached] > a->cached_sectors ||
 		    sectors[ALLOC_stripe] > a->stripe_sectors) {
 			ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
 				-BCH_ERR_transaction_restart_nested;
@@ -919,7 +881,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		}
 
 		if (!sectors[ALLOC_dirty] &&
-		    !sectors[ALLOC_stripe])
+		    !sectors[ALLOC_stripe] &&
+		    !sectors[ALLOC_cached])
 			__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
 		else
 			__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
@@ -1206,11 +1169,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 
 	bch2_bkey_buf_init(&last_flushed);
 	bkey_init(&last_flushed.k->k);
-	progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
+	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
 
 	int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
 				     POS_MIN, BTREE_ITER_prefetch, k, ({
-			progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+			bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
 			check_one_backpointer(trans, start, end, k, &last_flushed);
 	}));
 
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 060dad1521ee..16575dbc5736 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#ifndef _BCACHEFS_BACKPOINTERS_H
+#define _BCACHEFS_BACKPOINTERS_H
 
 #include "btree_cache.h"
 #include "btree_iter.h"
@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
 		return BCH_DATA_btree;
 	case KEY_TYPE_extent:
 	case KEY_TYPE_reflink_v:
-		return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+		if (p.has_ec)
+			return BCH_DATA_stripe;
+		if (p.ptr.cached)
+			return BCH_DATA_cached;
+		else
+			return BCH_DATA_user;
 	case KEY_TYPE_stripe: {
 		const struct bch_extent_ptr *ptr = &entry->ptr;
 		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
@@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
 					 struct bkey_i_backpointer *bp)
 {
 	bkey_backpointer_init(&bp->k_i);
-	bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
+	bp->k.p.inode = p.ptr.dev;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
+	else {
+		/*
+		 * Put stripe backpointers where they won't collide with the
+		 * extent backpointers within the stripe:
+		 */
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+		bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
+				  MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
+	}
+
 	bp->v = (struct bch_backpointer) {
 		.btree_id = btree_id,
 		.level = level,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 161cf2f05d2a..f52311017aee 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -203,6 +203,7 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
+#include <linux/unicode.h>
 
 #include "bcachefs_format.h"
 #include "btree_journal_iter_types.h"
@@ -444,6 +445,7 @@ BCH_DEBUG_PARAMS_DEBUG()
 	x(btree_node_sort) \
 	x(btree_node_read) \
 	x(btree_node_read_done) \
+	x(btree_node_write) \
 	x(btree_interior_update_foreground) \
 	x(btree_interior_update_total) \
 	x(btree_gc) \
@@ -456,6 +458,7 @@ BCH_DEBUG_PARAMS_DEBUG()
 	x(blocked_journal_low_on_space) \
 	x(blocked_journal_low_on_pin) \
 	x(blocked_journal_max_in_flight) \
+	x(blocked_journal_max_open) \
 	x(blocked_key_cache_flush) \
 	x(blocked_allocate) \
 	x(blocked_allocate_open_bucket) \
@@ -533,6 +536,7 @@ struct bch_dev {
 	 */
 	struct bch_member_cpu mi;
 	atomic64_t errors[BCH_MEMBER_ERROR_NR];
+	unsigned long write_errors_start;
 
 	__uuid_t uuid;
 	char name[BDEVNAME_SIZE];
@@ -623,7 +627,8 @@ struct bch_dev {
 	x(topology_error) \
 	x(errors_fixed) \
 	x(errors_not_fixed) \
-	x(no_invalid_checks)
+	x(no_invalid_checks) \
+	x(discard_mount_opt_set) \
 
 enum bch_fs_flags {
 #define x(n) BCH_FS_##n,
@@ -687,7 +692,8 @@ struct btree_trans_buf {
 	x(gc_gens) \
 	x(snapshot_delete_pagecache) \
 	x(sysfs) \
-	x(btree_write_buffer)
+	x(btree_write_buffer) \
+	x(btree_node_scrub)
 
 enum bch_write_ref {
 #define x(n) BCH_WRITE_REF_##n,
@@ -696,6 +702,8 @@ enum bch_write_ref {
 	BCH_WRITE_REF_NR,
 };
 
+#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
+
 struct bch_fs {
 	struct closure cl;
 
@@ -780,6 +788,9 @@ struct bch_fs {
 		u64 btrees_lost_data;
 	} sb;
 
+#ifdef CONFIG_UNICODE
+	struct unicode_map *cf_encoding;
+#endif
 
 	struct bch_sb_handle disk_sb;
 
@@ -969,7 +980,6 @@ struct bch_fs {
 	mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
 	size_t zstd_workspace_size;
 
-	struct crypto_shash *sha256;
 	struct crypto_sync_skcipher *chacha20;
 	struct crypto_shash *poly1305;
 
@@ -993,15 +1003,11 @@ struct bch_fs {
 	wait_queue_head_t copygc_running_wq;
 
 	/* STRIPES: */
-	GENRADIX(struct stripe) stripes;
 	GENRADIX(struct gc_stripe) gc_stripes;
 
 	struct hlist_head ec_stripes_new[32];
 	spinlock_t ec_stripes_new_lock;
 
-	ec_stripes_heap ec_stripes_heap;
-	struct mutex ec_stripes_heap_lock;
-
 	/* ERASURE CODING */
 	struct list_head ec_stripe_head_list;
 	struct mutex ec_stripe_head_lock;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f70f0108401f..e96d87767020 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -686,7 +686,12 @@ struct bch_sb_field_ext {
 	x(inode_depth, BCH_VERSION(1, 17)) \
 	x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
 	x(autofix_errors, BCH_VERSION(1, 19)) \
-	x(directory_size, BCH_VERSION(1, 20))
+	x(directory_size, BCH_VERSION(1, 20)) \
+	x(cached_backpointers, BCH_VERSION(1, 21)) \
+	x(stripe_backpointers, BCH_VERSION(1, 22)) \
+	x(stripe_lru, BCH_VERSION(1, 23)) \
+	x(casefolding, BCH_VERSION(1, 24)) \
+	x(extent_flags, BCH_VERSION(1, 25))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -837,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
 LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+/* one free bit */
 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -855,6 +861,8 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
 					struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
@@ -908,7 +916,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
 	x(journal_no_flush, 16) \
 	x(alloc_v2, 17) \
 	x(extents_across_btree_nodes, 18) \
-	x(incompat_version_field, 19)
+	x(incompat_version_field, 19) \
+	x(casefolding, 20)
 
 #define BCH_SB_FEATURES_ALWAYS \
 	(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
@@ -922,7 +931,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
 	 BIT_ULL(BCH_FEATURE_new_siphash)| \
 	 BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
 	 BIT_ULL(BCH_FEATURE_new_varint)| \
-	 BIT_ULL(BCH_FEATURE_journal_no_flush))
+	 BIT_ULL(BCH_FEATURE_journal_no_flush)| \
+	 BIT_ULL(BCH_FEATURE_incompat_version_field))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 3c23bdf788ce..52594e925eb7 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -87,6 +87,7 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
 #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
 #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
 
 /* ioctl below act on a particular file, not the filesystem as a whole: */
 
@@ -213,6 +214,10 @@ struct bch_ioctl_data {
 	struct bpos end_pos;
 
 	union {
+		struct {
+			__u32 dev;
+			__u32 data_types;
+		} scrub;
 		struct {
 			__u32 dev;
 			__u32 pad;
@@ -229,6 +234,11 @@ enum bch_data_event {
 	BCH_DATA_EVENT_NR = 1,
 };
 
+enum data_progress_data_type_special {
+	DATA_PROGRESS_DATA_TYPE_phys = 254,
+	DATA_PROGRESS_DATA_TYPE_done = 255,
+};
+
 struct bch_ioctl_data_progress {
 	__u8 data_type;
 	__u8 btree_id;
@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress {
 
 	__u64 sectors_done;
 	__u64 sectors_total;
+	__u64 sectors_error_corrected;
+	__u64 sectors_error_uncorrected;
 } __packed __aligned(8);
 
+enum bch_ioctl_data_event_ret {
+	BCH_IOCTL_DATA_EVENT_RET_done = 1,
+	BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
+};
+
 struct bch_ioctl_data_event {
 	__u8 type;
-	__u8 pad[7];
+	__u8 ret;
+	__u8 pad[6];
 	union {
 		struct bch_ioctl_data_progress p;
 		__u64 pad2[15];
@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting {
 	struct bkey_i_accounting accounting[];
 };
 
+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
+
+struct bch_ioctl_query_counters {
+	__u16 nr;
+	__u16 flags;
+	__u32 pad;
+	__u64 d[];
+};
+
 #endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 054e2d5e8448..082632905649 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
 static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
 {
 	return bpos_eq(l.k->p, r.k->p) &&
+		l.k->size == r.k->size &&
 		bkey_bytes(l.k) == bkey_bytes(r.k) &&
 		!memcmp(l.v, r.v, bkey_val_bytes(l.k));
 }
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 1ec1f90e0eb3..54666027aa85 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 		       btree_node_write_in_flight(b));
 
 		btree_node_data_free(bc, b);
+		cond_resched();
 	}
 
 	BUG_ON(!bch2_journal_error(&c->journal) &&
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index dd1d9b74076e..ff681e733598 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -27,6 +27,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "progress.h"
 #include "recovery_passes.h"
 #include "reflink.h"
 #include "recovery.h"
@@ -656,7 +657,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 	return ret;
 }
 
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
+static int bch2_gc_btree(struct btree_trans *trans,
+			 struct progress_indicator_state *progress,
+			 enum btree_id btree, bool initial)
 {
 	struct bch_fs *c = trans->c;
 	unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
@@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
 					  BTREE_ITER_prefetch);
 
 		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+			bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
 			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
 			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
 		}));
@@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 static int bch2_gc_btrees(struct bch_fs *c)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
-	enum btree_id ids[BTREE_ID_NR];
 	struct printbuf buf = PRINTBUF;
-	unsigned i;
 	int ret = 0;
 
-	for (i = 0; i < BTREE_ID_NR; i++)
+	struct progress_indicator_state progress;
+	bch2_progress_init(&progress, c, ~0ULL);
+
+	enum btree_id ids[BTREE_ID_NR];
+	for (unsigned i = 0; i < BTREE_ID_NR; i++)
 		ids[i] = i;
 	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
 
-	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
 		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
 
 		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
 			continue;
 
-		ret = bch2_gc_btree(trans, btree, true);
+		ret = bch2_gc_btree(trans, &progress, btree, true);
 	}
 
 	printbuf_exit(&buf);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
|
|
index 756736f9243d..2ba33ffc9795 100644
|
|
--- a/fs/bcachefs/btree_io.c
|
|
+++ b/fs/bcachefs/btree_io.c
|
|
@@ -1,6 +1,7 @@
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
#include "bkey_methods.h"
|
|
#include "bkey_sort.h"
|
|
#include "btree_cache.h"
|
|
@@ -1328,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work)
|
|
bch_info(c, "retrying read");
|
|
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
|
|
rb->have_ioref = ca != NULL;
|
|
+ rb->start_time = local_clock();
|
|
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
|
|
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
|
|
bio->bi_iter.bi_size = btree_buf_bytes(b);
|
|
@@ -1338,21 +1340,26 @@ static void btree_node_read_work(struct work_struct *work)
|
|
} else {
|
|
bio->bi_status = BLK_STS_REMOVED;
|
|
}
|
|
+
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
|
+ rb->start_time, !bio->bi_status);
|
|
start:
|
|
printbuf_reset(&buf);
|
|
bch2_btree_pos_to_text(&buf, c, b);
|
|
- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
|
|
- "btree read error %s for %s",
|
|
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
|
|
+
|
|
+ if (ca && bio->bi_status)
|
|
+ bch_err_dev_ratelimited(ca,
|
|
+ "btree read error %s for %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
|
|
if (rb->have_ioref)
|
|
percpu_ref_put(&ca->io_ref);
|
|
rb->have_ioref = false;
|
|
|
|
- bch2_mark_io_failure(&failed, &rb->pick);
|
|
+ bch2_mark_io_failure(&failed, &rb->pick, false);
|
|
|
|
can_retry = bch2_bkey_pick_read_device(c,
|
|
bkey_i_to_s_c(&b->key),
|
|
- &failed, &rb->pick) > 0;
|
|
+ &failed, &rb->pick, -1) > 0;
|
|
|
|
if (!bio->bi_status &&
|
|
!bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
|
|
@@ -1400,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio)
|
|
struct btree_read_bio *rb =
|
|
container_of(bio, struct btree_read_bio, bio);
|
|
struct bch_fs *c = rb->c;
|
|
+ struct bch_dev *ca = rb->have_ioref
|
|
+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
|
|
|
|
- if (rb->have_ioref) {
|
|
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
|
|
-
|
|
- bch2_latency_acct(ca, rb->start_time, READ);
|
|
- }
|
|
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
|
+ rb->start_time, !bio->bi_status);
|
|
|
|
queue_work(c->btree_read_complete_wq, &rb->work);
|
|
}
|
|
@@ -1697,7 +1703,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
|
|
return;
|
|
|
|
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
|
|
- NULL, &pick);
|
|
+ NULL, &pick, -1);
|
|
|
|
if (ret <= 0) {
|
|
struct printbuf buf = PRINTBUF;
|
|
@@ -1811,6 +1817,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
|
|
return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
|
|
}
|
|
|
|
+struct btree_node_scrub {
|
|
+ struct bch_fs *c;
|
|
+ struct bch_dev *ca;
|
|
+ void *buf;
|
|
+ bool used_mempool;
|
|
+ unsigned written;
|
|
+
|
|
+ enum btree_id btree;
|
|
+ unsigned level;
|
|
+ struct bkey_buf key;
|
|
+ __le64 seq;
|
|
+
|
|
+ struct work_struct work;
|
|
+ struct bio bio;
|
|
+};
|
|
+
|
|
+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
|
|
+ struct printbuf *err)
|
|
+{
|
|
+ unsigned written = 0;
|
|
+
|
|
+ if (le64_to_cpu(data->magic) != bset_magic(c)) {
|
|
+ prt_printf(err, "bad magic: want %llx, got %llx",
|
|
+ bset_magic(c), le64_to_cpu(data->magic));
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ while (written < (ptr_written ?: btree_sectors(c))) {
|
|
+ struct btree_node_entry *bne;
|
|
+ struct bset *i;
|
|
+ bool first = !written;
|
|
+
|
|
+ if (first) {
|
|
+ bne = NULL;
|
|
+ i = &data->keys;
|
|
+ } else {
|
|
+ bne = (void *) data + (written << 9);
|
|
+ i = &bne->keys;
|
|
+
|
|
+ if (!ptr_written && i->seq != data->keys.seq)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ struct nonce nonce = btree_nonce(i, written << 9);
|
|
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
|
|
+
|
|
+ if (first) {
|
|
+ if (good_csum_type) {
|
|
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
|
|
+ if (bch2_crc_cmp(data->csum, csum)) {
|
|
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ written += vstruct_sectors(data, c->block_bits);
|
|
+ } else {
|
|
+ if (good_csum_type) {
|
|
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
|
|
+ if (bch2_crc_cmp(bne->csum, csum)) {
|
|
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ written += vstruct_sectors(bne, c->block_bits);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void btree_node_scrub_work(struct work_struct *work)
|
|
+{
|
|
+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
|
|
+ struct bch_fs *c = scrub->c;
|
|
+ struct printbuf err = PRINTBUF;
|
|
+
|
|
+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
|
|
+ bkey_i_to_s_c(scrub->key.k));
|
|
+ prt_newline(&err);
|
|
+
|
|
+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
|
|
+ struct btree_trans *trans = bch2_trans_get(c);
|
|
+
|
|
+ struct btree_iter iter;
|
|
+ bch2_trans_node_iter_init(trans, &iter, scrub->btree,
|
|
+ scrub->key.k->k.p, 0, scrub->level - 1, 0);
|
|
+
|
|
+ struct btree *b;
|
|
+ int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)));
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
|
|
+ bch_err(c, "error validating btree node during scrub on %s at btree %s",
|
|
+ scrub->ca->name, err.buf);
|
|
+
|
|
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
|
|
+ }
|
|
+err:
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
+ bch2_trans_begin(trans);
|
|
+ bch2_trans_put(trans);
|
|
+ }
|
|
+
|
|
+ printbuf_exit(&err);
|
|
+ bch2_bkey_buf_exit(&scrub->key, c);
+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
+ percpu_ref_put(&scrub->ca->io_ref);
+ kfree(scrub);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+}
+
+static void btree_node_scrub_endio(struct bio *bio)
+{
+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
+
+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
+}
+
+int bch2_btree_node_scrub(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k, unsigned dev)
+{
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub))
+ return -BCH_ERR_erofs_no_writes;
+
+ struct extent_ptr_decoded pick;
+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
+ if (ret <= 0)
+ goto err;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca) {
+ ret = -BCH_ERR_device_offline;
+ goto err;
+ }
+
+ bool used_mempool = false;
+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
+
+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
+
+ struct btree_node_scrub *scrub =
+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
+ if (!scrub) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ scrub->c = c;
+ scrub->ca = ca;
+ scrub->buf = buf;
+ scrub->used_mempool = used_mempool;
+ scrub->written = btree_ptr_sectors_written(k);
+
+ scrub->btree = btree;
+ scrub->level = level;
+ bch2_bkey_buf_init(&scrub->key);
+ bch2_bkey_buf_reassemble(&scrub->key, c, k);
+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
+
+ INIT_WORK(&scrub->work, btree_node_scrub_work);
+
+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
+ scrub->bio.bi_end_io = btree_node_scrub_endio;
+ submit_bio(&scrub->bio);
+ return 0;
+err_free:
+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
+ percpu_ref_put(&ca->io_ref);
+err:
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+ return ret;
+}
+
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
@@ -1831,7 +2021,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}

-static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new;
@@ -1839,6 +2029,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)

bch2_btree_complete_write(c, b, w);

+ if (start_time)
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
+
old = READ_ONCE(b->flags);
do {
new = old;
@@ -1869,7 +2062,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_trans *trans = bch2_trans_get(c);

@@ -1877,7 +2070,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)

/* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, start_time);
six_unlock_read(&b->c.lock);
}

@@ -1887,6 +2080,7 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ u64 start_time = wbio->start_time;
int ret = 0;

btree_bounce_free(c,
@@ -1919,12 +2113,18 @@ static void btree_node_write_work(struct work_struct *work)
}
out:
bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b);
+ btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
+
+ if (!bch2_err_matches(ret, EROFS)) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
goto out;
}

@@ -1937,16 +2137,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
- unsigned long flags;

- if (wbio->have_ioref)
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);

- if (!ca ||
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- "btree write error: %s",
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("btree")) {
+ if (ca && bio->bi_status) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "btree write error: %s\n ",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (bio->bi_status) {
+ unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
@@ -2023,6 +2228,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
+ u64 start_time = local_clock();
int ret;

if (flags & BTREE_WRITE_ALREADY_STARTED)
@@ -2231,6 +2437,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
wbio->data = data;
wbio->data_bytes = bytes;
wbio->sector_offset = b->written;
+ wbio->start_time = start_time;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
@@ -2258,7 +2465,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, 0);
}

/*
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 6f9e4a6dacf7..dbf76d22c660 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -52,6 +52,7 @@ struct btree_write_bio {
void *data;
unsigned data_bytes;
unsigned sector_offset;
+ u64 start_time;
struct bch_write_bio wbio;
};

@@ -132,6 +133,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);

+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, unsigned);
+
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);

enum btree_write_flags {
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index e32fce4fd258..7542c6f9c88e 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_peek(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->key.k.p;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index b96157f3dc9c..8823eec6b284 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra
}

__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
{
BUG_ON(err <= 0);
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));

trans->restarted = err;
trans->last_restarted_ip = ip;
+ return -err;
+}
+
+__always_inline
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+{
+ btree_trans_restart_foreign_task(trans, err, ip);
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index caef65adeae4..94eb2b73a843 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;

for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = i->trans->locking_wait.task;
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", task ?task->pid : 0);
+ prt_printf(out, "%u ", task ? task->pid : 0);
}
prt_newline(out);
}
@@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
trace_would_deadlock(g, i->trans);
- return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ return btree_trans_restart_foreign_task(i->trans,
+ BCH_ERR_transaction_restart_would_deadlock,
+ _THIS_IP_);
} else {
i->trans->lock_must_abort = true;
wake_up_process(i->trans->locking_wait.task);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index a7f06deee13c..678161321e42 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, bn, PAGE_SIZE);

+ u64 submit_time = local_clock();
submit_bio_wait(bio);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status)))
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status));
return;
+ }

if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
@@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref);
closure_put(w->cl);
kfree(w);
return 0;
@@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
continue;

struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- struct task_struct *t;
-
if (!w) {
percpu_ref_put(&ca->io_ref);
ret = -ENOMEM;
goto err;
}

- percpu_ref_get(&ca->io_ref);
- closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;

- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
- closure_put(&cl);
- f->ret = ret;
- bch_err(c, "error starting kthread: %i", ret);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
break;
}
+
+ closure_get(&cl);
+ percpu_ref_get(&ca->io_ref);
+ wake_up_process(t);
}
err:
closure_sync(&cl);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index c4f524b2ca9a..7d7e52ddde02 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+ kmsan_check_memory(insert, bkey_bytes(&insert->k));

k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
@@ -336,6 +337,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->cached != path->cached);
BUG_ON(i->level != path->level);
BUG_ON(i->btree_id != path->btree_id);
+ BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
@@ -517,69 +519,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
}

-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- unsigned *btree_id_updates_start)
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- bool trans_trigger_run;
+ unsigned sort_id_start = 0;

- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
+ while (sort_id_start < trans->nr_updates) {
+ unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
+ bool trans_trigger_run;

- for (unsigned i = *btree_id_updates_start;
- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
- i++) {
- if (trans->updates[i].btree_id < btree_id) {
- *btree_id_updates_start = i;
- continue;
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being
+ * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
+ * references before they are re-added.
+ *
+ * Running triggers will append more updates to the list of
+ * updates as we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = sort_id_start;
+ i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
+ i++) {
+ if (trans->updates[i].sort_order < sort_id) {
+ sort_id_start = i;
+ continue;
+ }
+
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
}
+ } while (trans_trigger_run);

- int ret = run_one_trans_trigger(trans, trans->updates + i);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
-
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- i->btree_id == btree_id &&
- btree_node_type_has_trans_triggers(i->bkey_type) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
- return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- unsigned btree_id = 0, btree_id_updates_start = 0;
- int ret = 0;
-
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- if (btree_id == BTREE_ID_alloc)
- continue;
-
- ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
- if (ret)
- return ret;
+ sort_id_start = i;
}

- btree_id_updates_start = 0;
- ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
- if (ret)
- return ret;
-
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
@@ -903,18 +881,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
struct bch_fs *c = trans->c;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;

- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans,
- trace_ip, trans->paths + i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb(trans));
- break;
- case -BCH_ERR_journal_res_get_blocked:
+ if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
/*
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
@@ -922,13 +889,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
- break;
+ goto out;
}

ret = drop_locks_do(trans,
bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_CHECK));
+ goto out;
+ }
+
+ switch (ret) {
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
+ break;
+ case -BCH_ERR_btree_insert_need_mark_replicas:
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb(trans));
break;
case -BCH_ERR_btree_insert_need_journal_reclaim:
bch2_trans_unlock(trans);
@@ -950,7 +930,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(ret >= 0);
break;
}
-
+out:
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);

bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index a09cbe9cd94f..77578da2d23f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)

struct btree_insert_entry {
unsigned flags;
+ u8 sort_order;
u8 bkey_type;
enum btree_id btree_id:8;
u8 level:4;
@@ -853,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree)
return BIT_ULL(btree) & mask;
}

+static inline u8 btree_trigger_order(enum btree_id btree)
+{
+ switch (btree) {
+ case BTREE_ID_alloc:
+ return U8_MAX;
+ case BTREE_ID_stripes:
+ return U8_MAX - 1;
+ default:
+ return btree;
+ }
+}
+
struct btree_root {
struct btree *b;

diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 13d794f201a5..bd2eb42edb24 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -17,7 +17,7 @@
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
- return cmp_int(l->btree_id, r->btree_id) ?:
+ return cmp_int(l->sort_order, r->sort_order) ?:
cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
@@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,

n = (struct btree_insert_entry) {
.flags = flags,
+ .sort_order = btree_trigger_order(path->btree_id),
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,
@@ -511,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;

diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 47d8690f01bf..d2e1c04353f6 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
enum btree_id btree,
struct bkey_i *k)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
if (unlikely(!btree_type_uses_write_buffer(btree))) {
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
dump_stack();
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index e4e7c804625e..67f1e3202835 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
return 0;
}

+/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
+static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
+{
+ struct btree_node *b_data = READ_ONCE(b->data);
+
+ return (b_data ? b_data->keys.seq : 0) == seq;
+}
+
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
@@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as)
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- __le64 seq;
-
b = as->old_nodes[i];

- bch2_trans_begin(trans);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- seq = b->data ? b->data->keys.seq : 0;
- six_unlock_read(&b->c.lock);
- bch2_trans_unlock_long(trans);
-
- if (seq == as->old_nodes_seq[i])
+ if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@@ -2126,6 +2126,31 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
goto out;
}

+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b)
+{
+ bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
+ /* has node been freed? */
+ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ ret = -BCH_ERR_btree_node_dying;
+ goto err;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
@@ -2191,66 +2216,78 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
goto out;
}

-struct async_btree_rewrite {
- struct bch_fs *c;
- struct work_struct work;
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- struct bkey_buf key;
-};
-
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
- struct async_btree_rewrite *a)
+static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_i *k, unsigned flags)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter,
- a->btree_id, a->key.k->k.p,
- BTREE_MAX_DEPTH, a->level, 0);
+ btree, k->k.p,
+ BTREE_MAX_DEPTH, level, 0);
struct btree *b = bch2_btree_iter_peek_node(&iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;

- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, 0)
+ ? bch2_btree_node_rewrite(trans, &iter, b, flags)
: -ENOENT;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}

-#if 0
- /* Tracepoint... */
- if (!ret || ret == -ENOENT) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bpos pos, unsigned flags)
+{
+ BUG_ON(!level);

- if (!ret) {
- prt_printf(&buf, "rewrite node:\n ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- } else {
- prt_printf(&buf, "node to rewrite not found:\n want: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- prt_printf(&buf, "\n got: ");
- if (b)
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- else
- prt_str(&buf, "(null)");
- }
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-#endif
-out:
+ /* Traverse one depth lower to get a pointer to the node itself: */
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}

+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
+ struct btree *b, unsigned flags)
+{
+ struct btree_iter iter;
+ int ret = get_iter_to_node(trans, &iter, b);
+ if (ret)
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+struct async_btree_rewrite {
+ struct bch_fs *c;
+ struct work_struct work;
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ struct bkey_buf key;
+};
+
static void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;

- int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+ int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
+ a->btree_id, a->level, a->key.k, 0));
if (ret != -ENOENT)
bch_err_fn_ratelimited(c, ret);

@@ -2494,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
unsigned commit_flags, bool skip_triggers)
{
struct btree_iter iter;
- int ret;
-
- bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter);
+ int ret = get_iter_to_node(trans, &iter, b);
if (ret)
- goto out;
-
- /* has node been freed? */
- if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- goto out;
- }
-
- BUG_ON(!btree_node_hashed(b));
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;

bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));

ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
-out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 26d646e1275c..be71cd73b864 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -169,7 +169,14 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,

int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
struct btree *, unsigned);
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bpos, unsigned);
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
+ struct btree *, unsigned);
+
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
struct btree *, struct bkey_i *,
unsigned, bool);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 345b117a4a4a..e56ef623ebc1 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -590,11 +590,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (ret)
goto err;

- if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
- if (ret)
- goto err;
- }
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
+ if (ret)
+ goto err;
}

if (flags & BTREE_TRIGGER_gc) {
@@ -674,10 +672,10 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
return -BCH_ERR_ENOMEM_mark_stripe_ptr;
}

- mutex_lock(&c->ec_stripes_heap_lock);
+ gc_stripe_lock(m);

if (!m || !m->alive) {
- mutex_unlock(&c->ec_stripes_heap_lock);
+ gc_stripe_unlock(m);
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
@@ -693,7 +691,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_replicas,
};
memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
- mutex_unlock(&c->ec_stripes_heap_lock);
+ gc_stripe_unlock(m);

acc.replicas.data_type = data_type;
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
@@ -726,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans,
.replicas.nr_required = 1,
};

- struct disk_accounting_pos acct_compression_key = {
- .type = BCH_DISK_ACCOUNTING_compression,
- };
+ unsigned cur_compression_type = 0;
u64 compression_acct[3] = { 1, 0, 0 };

bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -762,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans,
acc_replicas_key.replicas.nr_required = 0;
}

- if (acct_compression_key.compression.type &&
- acct_compression_key.compression.type != p.crc.compression_type) {
+ if (cur_compression_type &&
+ cur_compression_type != p.crc.compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));

- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;

@@ -777,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans,
compression_acct[2] = 0;
}

- acct_compression_key.compression.type = p.crc.compression_type;
+ cur_compression_type = p.crc.compression_type;
if (p.crc.compression_type) {
compression_acct[1] += p.crc.uncompressed_size;
compression_acct[2] += p.crc.compressed_size;
@@ -791,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans,
}

if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- struct disk_accounting_pos acc_snapshot_key = {
- .type = BCH_DISK_ACCOUNTING_snapshot,
- .snapshot.id = k.k->p.snapshot,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret)
return ret;
}

- if (acct_compression_key.compression.type) {
+ if (cur_compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));

- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
}

if (level) {
- struct disk_accounting_pos acc_btree_key = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = btree_id,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
if (ret)
return ret;
} else {
bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct disk_accounting_pos acc_inum_key = {
- .type = BCH_DISK_ACCOUNTING_inum,
- .inum.inum = k.k->p.inode,
- };
+
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
*replicas_sectors,
};
- ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
if (ret)
return ret;
}
@@ -878,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans,
}

int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta = 0;
+ s64 need_rebalance_sectors_delta[1] = { 0 };

s64 s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta -= s;
+ need_rebalance_sectors_delta[0] -= s;

s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta += s;
+ need_rebalance_sectors_delta[0] += s;

if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
@@ -895,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
return ret;
}

- if (need_rebalance_sectors_delta) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
- };
- int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
- flags & BTREE_TRIGGER_gc);
+ if (need_rebalance_sectors_delta[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ need_rebalance_sectors_delta, rebalance_work);
if (ret)
return ret;
}
@@ -916,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors = k.k->size;
+ s64 sectors[1] = { k.k->size };

if (flags & BTREE_TRIGGER_overwrite)
- sectors = -sectors;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
- .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
- };
+ sectors[0] = -sectors[0];

- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
|
|
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
|
|
+ persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
|
|
}
|
|
|
|
return 0;
|
|
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
|
|
index a9acdd6c0c86..c5363256e363 100644
|
|
--- a/fs/bcachefs/buckets.h
|
|
+++ b/fs/bcachefs/buckets.h
|
|
@@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t
|
|
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
|
|
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
|
|
|
|
-/*
|
|
- * Ugly hack alert:
|
|
- *
|
|
- * We need to cram a spinlock in a single byte, because that's what we have left
|
|
- * in struct bucket, and we care about the size of these - during fsck, we need
|
|
- * in memory state for every single bucket on every device.
|
|
- *
|
|
- * We used to do
|
|
- * while (xchg(&b->lock, 1) cpu_relax();
|
|
- * but, it turns out not all architectures support xchg on a single byte.
|
|
- *
|
|
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
|
|
- * ulong for this - we just need to make sure the lock bit always ends up in the
|
|
- * first byte.
|
|
- */
|
|
-
|
|
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
-#define BUCKET_LOCK_BITNR 0
|
|
-#else
|
|
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
|
|
-#endif
|
|
-
|
|
-union ulong_byte_assert {
|
|
- ulong ulong;
|
|
- u8 byte;
|
|
-};
|
|
-
|
|
static inline void bucket_unlock(struct bucket *b)
|
|
{
|
|
BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
|
|
@@ -167,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)
|
|
|
|
static inline int gen_after(u8 a, u8 b)
|
|
{
|
|
- int r = gen_cmp(a, b);
|
|
-
|
|
- return r > 0 ? r : 0;
|
|
+ return max(0, gen_cmp(a, b));
|
|
}
|
|
|
|
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
|
|
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
|
|
index 7174047b8e92..900b8680c8b5 100644
|
|
--- a/fs/bcachefs/buckets_types.h
|
|
+++ b/fs/bcachefs/buckets_types.h
|
|
@@ -7,6 +7,33 @@
|
|
|
|
#define BUCKET_JOURNAL_SEQ_BITS 16
|
|
|
|
+/*
|
|
+ * Ugly hack alert:
|
|
+ *
|
|
+ * We need to cram a spinlock in a single byte, because that's what we have left
|
|
+ * in struct bucket, and we care about the size of these - during fsck, we need
|
|
+ * in memory state for every single bucket on every device.
|
|
+ *
|
|
+ * We used to do
|
|
+ * while (xchg(&b->lock, 1) cpu_relax();
|
|
+ * but, it turns out not all architectures support xchg on a single byte.
|
|
+ *
|
|
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
|
|
+ * ulong for this - we just need to make sure the lock bit always ends up in the
|
|
+ * first byte.
|
|
+ */
|
|
+
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+#define BUCKET_LOCK_BITNR 0
|
|
+#else
|
|
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
|
|
+#endif
|
|
+
|
|
+union ulong_byte_assert {
|
|
+ ulong ulong;
|
|
+ u8 byte;
|
|
+};
|
|
+
|
|
struct bucket {
|
|
u8 lock;
|
|
u8 gen_valid:1;
|
|
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
|
|
index 46e9e32105a9..57d55b3ddc71 100644
|
|
--- a/fs/bcachefs/chardev.c
|
|
+++ b/fs/bcachefs/chardev.c
|
|
@@ -11,6 +11,7 @@
|
|
#include "move.h"
|
|
#include "recovery_passes.h"
|
|
#include "replicas.h"
|
|
+#include "sb-counters.h"
|
|
#include "super-io.h"
|
|
#include "thread_with_file.h"
|
|
|
|
@@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg)
|
|
struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
|
|
|
|
ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
|
|
- ctx->stats.data_type = U8_MAX;
|
|
+ if (ctx->thr.ret == -BCH_ERR_device_offline)
|
|
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
|
|
+ else {
|
|
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
|
|
+ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
|
|
+ }
|
|
return 0;
|
|
}
|
|
|
|
@@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
|
|
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
|
|
struct bch_fs *c = ctx->c;
|
|
struct bch_ioctl_data_event e = {
|
|
- .type = BCH_DATA_EVENT_PROGRESS,
|
|
- .p.data_type = ctx->stats.data_type,
|
|
- .p.btree_id = ctx->stats.pos.btree,
|
|
- .p.pos = ctx->stats.pos.pos,
|
|
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
|
- .p.sectors_total = bch2_fs_usage_read_short(c).used,
|
|
+ .type = BCH_DATA_EVENT_PROGRESS,
|
|
+ .ret = ctx->stats.ret,
|
|
+ .p.data_type = ctx->stats.data_type,
|
|
+ .p.btree_id = ctx->stats.pos.btree,
|
|
+ .p.pos = ctx->stats.pos.pos,
|
|
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
|
+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
|
|
+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
|
|
};
|
|
|
|
+ if (ctx->arg.op == BCH_DATA_OP_scrub) {
|
|
+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
|
|
+ if (ca) {
|
|
+ struct bch_dev_usage u;
|
|
+ bch2_dev_usage_read_fast(ca, &u);
|
|
+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
|
|
+ if (ctx->arg.scrub.data_types & BIT(i))
|
|
+ e.p.sectors_total += u.d[i].sectors;
|
|
+ bch2_dev_put(ca);
|
|
+ }
|
|
+ } else {
|
|
+ e.p.sectors_total = bch2_fs_usage_read_short(c).used;
|
|
+ }
|
|
+
|
|
if (len < sizeof(e))
|
|
return -EINVAL;
|
|
|
|
@@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
|
|
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
|
|
case BCH_IOCTL_QUERY_ACCOUNTING:
|
|
return bch2_ioctl_query_accounting(c, arg);
|
|
+ case BCH_IOCTL_QUERY_COUNTERS:
|
|
+ return bch2_ioctl_query_counters(c, arg);
|
|
default:
|
|
return -ENOTTY;
|
|
}
|
|
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
|
|
index 23a383577d4c..3726689093e3 100644
|
|
--- a/fs/bcachefs/checksum.c
|
|
+++ b/fs/bcachefs/checksum.c
|
|
@@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
|
prt_str(&buf, ")");
|
|
WARN_RATELIMIT(1, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
- return -EIO;
|
|
+ return -BCH_ERR_recompute_checksum;
|
|
}
|
|
|
|
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
|
@@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
|
|
return 0;
|
|
}
|
|
|
|
+#if 0
|
|
+
|
|
+/*
|
|
+ * This seems to be duplicating code in cmd_remove_passphrase() in
|
|
+ * bcachefs-tools, but we might want to switch userspace to use this - and
|
|
+ * perhaps add an ioctl for calling this at runtime, so we can take the
|
|
+ * passphrase off of a mounted filesystem (which has come up).
|
|
+ */
|
|
int bch2_disable_encryption(struct bch_fs *c)
|
|
{
|
|
struct bch_sb_field_crypt *crypt;
|
|
@@ -725,6 +733,10 @@ int bch2_disable_encryption(struct bch_fs *c)
|
|
return ret;
|
|
}
|
|
|
|
+/*
|
|
+ * For enabling encryption on an existing filesystem: not hooked up yet, but it
|
|
+ * should be
|
|
+ */
|
|
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
|
{
|
|
struct bch_encrypted_key key;
|
|
@@ -781,6 +793,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
|
memzero_explicit(&key, sizeof(key));
|
|
return ret;
|
|
}
|
|
+#endif
|
|
|
|
void bch2_fs_encryption_exit(struct bch_fs *c)
|
|
{
|
|
@@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
|
|
crypto_free_shash(c->poly1305);
|
|
if (c->chacha20)
|
|
crypto_free_sync_skcipher(c->chacha20);
|
|
- if (c->sha256)
|
|
- crypto_free_shash(c->sha256);
|
|
}
|
|
|
|
int bch2_fs_encryption_init(struct bch_fs *c)
|
|
@@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
|
|
struct bch_key key;
|
|
int ret = 0;
|
|
|
|
- c->sha256 = crypto_alloc_shash("sha256", 0, 0);
|
|
- ret = PTR_ERR_OR_ZERO(c->sha256);
|
|
- if (ret) {
|
|
- c->sha256 = NULL;
|
|
- bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
|
|
- goto out;
|
|
- }
|
|
-
|
|
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
|
|
if (!crypt)
|
|
goto out;
|
|
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
|
|
index 43b9d71f2f2b..4ac251c8fcd8 100644
|
|
--- a/fs/bcachefs/checksum.h
|
|
+++ b/fs/bcachefs/checksum.h
|
|
@@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
|
|
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
|
struct bch_key *);
|
|
|
|
+#if 0
|
|
int bch2_disable_encryption(struct bch_fs *);
|
|
int bch2_enable_encryption(struct bch_fs *, bool);
|
|
+#endif
|
|
|
|
void bch2_fs_encryption_exit(struct bch_fs *);
|
|
int bch2_fs_encryption_init(struct bch_fs *);
|
|
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
|
|
index 114bf2f3879f..85fc90342492 100644
|
|
--- a/fs/bcachefs/compress.c
|
|
+++ b/fs/bcachefs/compress.c
|
|
@@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
size_t src_len = src->bi_iter.bi_size;
|
|
size_t dst_len = crc.uncompressed_size << 9;
|
|
void *workspace;
|
|
- int ret;
|
|
+ int ret = 0, ret2;
|
|
|
|
enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
|
|
mempool_t *workspace_pool = &c->compress_workspace[opt];
|
|
@@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
else
|
|
ret = -BCH_ERR_compression_workspace_not_initialized;
|
|
if (ret)
|
|
- goto out;
|
|
+ goto err;
|
|
}
|
|
|
|
src_data = bio_map_or_bounce(c, src, READ);
|
|
@@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
|
switch (crc.compression_type) {
|
|
case BCH_COMPRESSION_TYPE_lz4_old:
|
|
 	case BCH_COMPRESSION_TYPE_lz4:
-		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
-						  src_len, dst_len, dst_len);
-		if (ret != dst_len)
-			goto err;
+		ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
+						   src_len, dst_len, dst_len);
+		if (ret2 != dst_len)
+			ret = -BCH_ERR_decompress_lz4;
 		break;
 	case BCH_COMPRESSION_TYPE_gzip: {
 		z_stream strm = {
@@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 
 		zlib_set_workspace(&strm, workspace);
 		zlib_inflateInit2(&strm, -MAX_WBITS);
-		ret = zlib_inflate(&strm, Z_FINISH);
+		ret2 = zlib_inflate(&strm, Z_FINISH);
 
 		mempool_free(workspace, workspace_pool);
 
-		if (ret != Z_STREAM_END)
-			goto err;
+		if (ret2 != Z_STREAM_END)
+			ret = -BCH_ERR_decompress_gzip;
 		break;
 	}
 	case BCH_COMPRESSION_TYPE_zstd: {
 		ZSTD_DCtx *ctx;
 		size_t real_src_len = le32_to_cpup(src_data.b);
 
-		if (real_src_len > src_len - 4)
+		if (real_src_len > src_len - 4) {
+			ret = -BCH_ERR_decompress_zstd_src_len_bad;
 			goto err;
+		}
 
 		workspace = mempool_alloc(workspace_pool, GFP_NOFS);
 		ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
 
-		ret = zstd_decompress_dctx(ctx,
+		ret2 = zstd_decompress_dctx(ctx,
 				dst_data,	dst_len,
 				src_data.b + 4, real_src_len);
 
 		mempool_free(workspace, workspace_pool);
 
-		if (ret != dst_len)
-			goto err;
+		if (ret2 != dst_len)
+			ret = -BCH_ERR_decompress_zstd;
 		break;
 	}
 	default:
 		BUG();
 	}
-	ret = 0;
+err:
 fsck_err:
-out:
 	bio_unmap_or_unbounce(c, src_data);
 	return ret;
-err:
-	ret = -EIO;
-	goto out;
 }
 
 int bch2_bio_uncompress_inplace(struct bch_write_op *op,
@@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
 	BUG_ON(!bio->bi_vcnt);
 	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
 
-	if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
-	    crc->compressed_size << 9 > c->opts.encoded_extent_max) {
-		struct printbuf buf = PRINTBUF;
-		bch2_write_op_error(&buf, op);
-		prt_printf(&buf, "error rewriting existing data: extent too big");
-		bch_err_ratelimited(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-		return -EIO;
+	if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
+		bch2_write_op_error(op, op->pos.offset,
+				    "extent too big to decompress (%u > %u)",
+				    crc->uncompressed_size << 9, c->opts.encoded_extent_max);
+		return -BCH_ERR_decompress_exceeded_max_encoded_extent;
 	}
 
 	data = __bounce_alloc(c, dst_len, WRITE);
 
-	if (__bio_uncompress(c, bio, data.b, *crc)) {
-		if (!c->opts.no_data_io) {
-			struct printbuf buf = PRINTBUF;
-			bch2_write_op_error(&buf, op);
-			prt_printf(&buf, "error rewriting existing data: decompression error");
-			bch_err_ratelimited(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-		}
-		ret = -EIO;
+	ret = __bio_uncompress(c, bio, data.b, *crc);
+
+	if (c->opts.no_data_io)
+		ret = 0;
+
+	if (ret) {
+		bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
 		goto err;
 	}
 
@@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
 
 	if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
 	    crc.compressed_size << 9 > c->opts.encoded_extent_max)
-		return -EIO;
+		return -BCH_ERR_decompress_exceeded_max_encoded_extent;
 
 	dst_data = dst_len == dst_iter.bi_size
 		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
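The hunks above replace the old catch-all goto err / -EIO with per-algorithm error codes: ret2 holds the raw status returned by the decompressor library, while ret carries a typed bcachefs errcode for the caller. A minimal, freestanding sketch of the same two-variable pattern follows; the error constant is an illustrative stand-in, not a real BCH_ERR value:

    #include <stdio.h>

    enum { ERR_DECOMPRESS_LZ4 = 1 };	/* illustrative, not the real errcode */

    /* pretend decompressor: returns bytes produced, like LZ4_decompress_safe_partial() */
    static int fake_lz4_decompress(int dst_len)
    {
    	return dst_len - 1;	/* simulate truncated output */
    }

    static int uncompress(int dst_len)
    {
    	int ret = 0;	/* typed error reported to the caller */
    	int ret2;	/* raw status from the decompressor library */

    	ret2 = fake_lz4_decompress(dst_len);
    	if (ret2 != dst_len)
    		ret = -ERR_DECOMPRESS_LZ4;

    	return ret;	/* 0 on success, a specific error code otherwise */
    }

    int main(void)
    {
    	printf("uncompress: %d\n", uncompress(4096));
    	return 0;
    }

Keeping the raw status separate means a short LZ4 output can be reported distinctly from a gzip stream error, which the old shared err: label could not do.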
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 642fbc60ecab..0ec273daccb7 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -20,6 +20,8 @@
 #include "subvolume.h"
 #include "trace.h"
 
+#include <linux/ioprio.h>
+
 static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr) {
-		if (!bch2_dev_tryget(c, ptr->dev)) {
+		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
 			bkey_for_each_ptr(ptrs, ptr2) {
 				if (ptr2 == ptr)
 					break;
@@ -91,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
 	return true;
 }
 
-static noinline void trace_move_extent_finish2(struct data_update *u,
+static noinline void trace_io_move_finish2(struct data_update *u,
 					       struct bkey_i *new,
 					       struct bkey_i *insert)
 {
@@ -111,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u,
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
 	prt_newline(&buf);
 
-	trace_move_extent_finish(c, buf.buf);
+	trace_io_move_finish(c, buf.buf);
 	printbuf_exit(&buf);
 }
 
-static void trace_move_extent_fail2(struct data_update *m,
+static void trace_io_move_fail2(struct data_update *m,
 			 struct bkey_s_c new,
 			 struct bkey_s_c wrote,
 			 struct bkey_i *insert,
@@ -126,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m,
 	struct printbuf buf = PRINTBUF;
 	unsigned rewrites_found = 0;
 
-	if (!trace_move_extent_fail_enabled())
+	if (!trace_io_move_fail_enabled())
 		return;
 
 	prt_str(&buf, msg);
@@ -166,7 +168,7 @@ static void trace_move_extent_fail2(struct data_update *m,
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
 	}
 
-	trace_move_extent_fail(c, buf.buf);
+	trace_io_move_fail(c, buf.buf);
 	printbuf_exit(&buf);
 }
 
@@ -214,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		new = bkey_i_to_extent(bch2_keylist_front(keys));
 
 		if (!bch2_extents_match(k, old)) {
-			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
 					NULL, "no match:");
 			goto nowork;
 		}
@@ -254,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		if (m->data_opts.rewrite_ptrs &&
 		    !rewrites_found &&
 		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
-			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
 			goto nowork;
 		}
 
@@ -271,7 +273,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		}
 
 		if (!bkey_val_u64s(&new->k)) {
-			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
 			goto nowork;
 		}
 
@@ -352,7 +354,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 			printbuf_exit(&buf);
 
 			bch2_fatal_error(c);
-			ret = -EIO;
+			ret = -BCH_ERR_invalid_bkey;
 			goto out;
 		}
 
@@ -385,9 +387,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		if (!ret) {
 			bch2_btree_iter_set_pos(&iter, next_pos);
 
-			this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
-			if (trace_move_extent_finish_enabled())
-				trace_move_extent_finish2(m, &new->k_i, insert);
+			this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
+			if (trace_io_move_finish_enabled())
+				trace_io_move_finish2(m, &new->k_i, insert);
 		}
 err:
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -409,7 +411,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 					&m->stats->sectors_raced);
 		}
 
-		count_event(c, move_extent_fail);
+		count_event(c, io_move_fail);
 
 		bch2_btree_iter_advance(&iter);
 		goto next;
@@ -427,14 +429,17 @@ int bch2_data_update_index_update(struct bch_write_op *op)
 	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
 }
 
-void bch2_data_update_read_done(struct data_update *m,
-				struct bch_extent_crc_unpacked crc)
+void bch2_data_update_read_done(struct data_update *m)
 {
+	m->read_done = true;
+
 	/* write bio must own pages: */
 	BUG_ON(!m->op.wbio.bio.bi_vcnt);
 
-	m->op.crc = crc;
-	m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+	m->op.crc = m->rbio.pick.crc;
+	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
+
+	this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
 
 	closure_call(&m->op.cl, bch2_write, NULL, NULL);
 }
@@ -444,31 +449,34 @@ void bch2_data_update_exit(struct data_update *update)
 	struct bch_fs *c = update->op.c;
 	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
 
+	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+	kfree(update->bvecs);
+	update->bvecs = NULL;
+
 	if (c->opts.nocow_enabled)
 		bkey_nocow_unlock(c, k);
 	bkey_put_dev_refs(c, k);
-	bch2_bkey_buf_exit(&update->k, c);
 	bch2_disk_reservation_put(c, &update->op.res);
-	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+	bch2_bkey_buf_exit(&update->k, c);
 }
 
-static void bch2_update_unwritten_extent(struct btree_trans *trans,
-					 struct data_update *update)
+static int bch2_update_unwritten_extent(struct btree_trans *trans,
+					struct data_update *update)
 {
 	struct bch_fs *c = update->op.c;
-	struct bio *bio = &update->op.wbio.bio;
 	struct bkey_i_extent *e;
 	struct write_point *wp;
 	struct closure cl;
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	int ret;
+	int ret = 0;
 
 	closure_init_stack(&cl);
 	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
 
-	while (bio_sectors(bio)) {
-		unsigned sectors = bio_sectors(bio);
+	while (bpos_lt(update->op.pos, update->k.k->k.p)) {
+		unsigned sectors = update->k.k->k.p.offset -
+			update->op.pos.offset;
 
 		bch2_trans_begin(trans);
 
@@ -504,7 +512,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
 		bch_err_fn_ratelimited(c, ret);
 
 		if (ret)
-			return;
+			break;
 
 		sectors = min(sectors, wp->sectors_free);
 
@@ -514,7 +522,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
 		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
 		bch2_alloc_sectors_done(c, wp);
 
-		bio_advance(bio, sectors << 9);
 		update->op.pos.offset += sectors;
 
 		extent_for_each_ptr(extent_i_to_s(e), ptr)
@@ -533,13 +540,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
 		bch2_trans_unlock(trans);
 		closure_sync(&cl);
 	}
+
+	return ret;
 }
 
 void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
 				   struct bch_io_opts *io_opts,
 				   struct data_update_opts *data_opts)
 {
-	printbuf_tabstop_push(out, 20);
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 20);
 
 	prt_str_indented(out, "rewrite ptrs:\t");
 	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
@@ -574,6 +584,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
 	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
 }
 
+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
+{
+	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+	prt_printf(out, "read_done:\t%u\n", m->read_done);
+	bch2_write_op_to_text(out, &m->op);
+	printbuf_indent_sub(out, 2);
+}
+
 int bch2_extent_drop_ptrs(struct btree_trans *trans,
 			  struct btree_iter *iter,
 			  struct bkey_s_c k,
@@ -617,12 +638,85 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }
 
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+			       struct bch_io_opts *io_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	/* write path might have to decompress data: */
+	unsigned buf_bytes = 0;
+	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+	m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+	if (!m->bvecs)
+		return -ENOMEM;
+
+	bio_init(&m->rbio.bio,		NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+	bio_init(&m->op.wbio.bio,	NULL, m->bvecs, nr_vecs, 0);
+
+	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
+		kfree(m->bvecs);
+		m->bvecs = NULL;
+		return -ENOMEM;
+	}
+
+	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+	m->rbio.data_update		= true;
+	m->rbio.bio.bi_iter.bi_size	= buf_bytes;
+	m->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(&m->k.k->k);
+	m->op.wbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+	return 0;
+}
+
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
+{
+	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
+	    unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
+		return -BCH_ERR_data_update_done_would_block;
+
+	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
+		? m->op.target
+		: 0;
+	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+	darray_for_each(m->op.devs_have, i)
+		__clear_bit(*i, devs.d);
+
+	rcu_read_lock();
+	unsigned nr_replicas = 0, i;
+	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+		struct bch_dev *ca = bch2_dev_rcu(c, i);
+
+		struct bch_dev_usage usage;
+		bch2_dev_usage_read_fast(ca, &usage);
+
+		if (!dev_buckets_free(ca, usage, m->op.watermark))
+			continue;
+
+		nr_replicas += ca->mi.durability;
+		if (nr_replicas >= m->op.nr_replicas)
+			break;
+	}
+	rcu_read_unlock();
+
+	if (!nr_replicas)
+		return -BCH_ERR_data_update_done_no_rw_devs;
+	if (nr_replicas < m->op.nr_replicas)
+		return -BCH_ERR_insufficient_devices;
+	return 0;
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
 			  struct btree_iter *iter,
 			  struct moving_context *ctxt,
 			  struct data_update *m,
 			  struct write_point_specifier wp,
-			  struct bch_io_opts io_opts,
+			  struct bch_io_opts *io_opts,
 			  struct data_update_opts data_opts,
 			  enum btree_id btree_id,
 			  struct bkey_s_c k)
@@ -640,16 +734,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	 * snapshots table - just skip it, we can move it later.
 	 */
 	if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
-		return -BCH_ERR_data_update_done;
-
-	if (!bkey_get_dev_refs(c, k))
-		return -BCH_ERR_data_update_done;
-
-	if (c->opts.nocow_enabled &&
-	    !bkey_nocow_lock(c, ctxt, k)) {
-		bkey_put_dev_refs(c, k);
-		return -BCH_ERR_nocow_lock_blocked;
-	}
+		return -BCH_ERR_data_update_done_no_snapshot;
 
 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -658,18 +743,18 @@ int bch2_data_update_init(struct btree_trans *trans,
 	m->ctxt		= ctxt;
 	m->stats	= ctxt ? ctxt->stats : NULL;
 
-	bch2_write_op_init(&m->op, c, io_opts);
+	bch2_write_op_init(&m->op, c, *io_opts);
 	m->op.pos	= bkey_start_pos(k.k);
 	m->op.version	= k.k->bversion;
 	m->op.target	= data_opts.target;
 	m->op.write_point = wp;
 	m->op.nr_replicas = 0;
-	m->op.flags	|= BCH_WRITE_PAGES_STABLE|
-		BCH_WRITE_PAGES_OWNED|
-		BCH_WRITE_DATA_ENCODED|
-		BCH_WRITE_MOVE|
+	m->op.flags	|= BCH_WRITE_pages_stable|
+		BCH_WRITE_pages_owned|
+		BCH_WRITE_data_encoded|
+		BCH_WRITE_move|
 		m->data_opts.write_flags;
-	m->op.compression_opt	= io_opts.background_compression;
+	m->op.compression_opt	= io_opts->background_compression;
 	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
 
 	unsigned durability_have = 0, durability_removing = 0;
@@ -707,7 +792,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 		ptr_bit <<= 1;
 	}
 
-	unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
 
 	/*
 	 * If current extent durability is less than io_opts.data_replicas,
@@ -740,28 +825,70 @@ int bch2_data_update_init(struct btree_trans *trans,
 		m->data_opts.rewrite_ptrs = 0;
 		/* if iter == NULL, it's just a promote */
 		if (iter)
-			ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
-		goto out;
+			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
+		if (!ret)
+			ret = -BCH_ERR_data_update_done_no_writes_needed;
+		goto out_bkey_buf_exit;
 	}
 
+	/*
+	 * Check if the allocation will succeed, to avoid getting an error later
+	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
+	 * read:
+	 *
+	 * This guards against
+	 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
+	 * - Destination target full
+	 * - Device(s) in destination target offline
+	 * - Insufficient durability available in destination target
+	 *   (i.e. trying to move a durability=2 replica to a target with a
+	 *   single durability=2 device)
+	 */
+	ret = can_write_extent(c, m);
+	if (ret)
+		goto out_bkey_buf_exit;
+
 	if (reserve_sectors) {
 		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
 				m->data_opts.extra_replicas
 				? 0
 				: BCH_DISK_RESERVATION_NOFAIL);
 		if (ret)
-			goto out;
+			goto out_bkey_buf_exit;
+	}
+
+	if (!bkey_get_dev_refs(c, k)) {
+		ret = -BCH_ERR_data_update_done_no_dev_refs;
+		goto out_put_disk_res;
+	}
+
+	if (c->opts.nocow_enabled &&
+	    !bkey_nocow_lock(c, ctxt, k)) {
+		ret = -BCH_ERR_nocow_lock_blocked;
+		goto out_put_dev_refs;
 	}
 
 	if (bkey_extent_is_unwritten(k)) {
-		bch2_update_unwritten_extent(trans, m);
-		goto out;
+		ret = bch2_update_unwritten_extent(trans, m) ?:
+			-BCH_ERR_data_update_done_unwritten;
+		goto out_nocow_unlock;
 	}
 
+	ret = bch2_data_update_bios_init(m, c, io_opts);
+	if (ret)
+		goto out_nocow_unlock;
+
 	return 0;
-out:
-	bch2_data_update_exit(m);
-	return ret ?: -BCH_ERR_data_update_done;
+out_nocow_unlock:
+	if (c->opts.nocow_enabled)
+		bkey_nocow_unlock(c, k);
+out_put_dev_refs:
+	bkey_put_dev_refs(c, k);
+out_put_disk_res:
+	bch2_disk_reservation_put(c, &m->op.res);
+out_bkey_buf_exit:
+	bch2_bkey_buf_exit(&m->k, c);
+	return ret;
}
 
 void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
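bch2_data_update_init() now acquires its resources in a fixed order (disk reservation, device refs, nocow locks) and the labelled exits above release them in exactly the reverse order, each label falling through to the next. A freestanding sketch of the same goto-unwind idiom, with hypothetical acquire/release pairs standing in for the real ones:

    #include <stdio.h>

    /* hypothetical acquire/release pairs, not the bcachefs API */
    static int get_disk_res(void)   { return 0; }
    static void put_disk_res(void)  { puts("put disk res"); }
    static int get_dev_refs(void)   { return 0; }
    static void put_dev_refs(void)  { puts("put dev refs"); }
    static int get_nocow_lock(void) { return -1; /* simulate failure */ }

    static int update_init(void)
    {
    	int ret;

    	ret = get_disk_res();
    	if (ret)
    		goto out;
    	ret = get_dev_refs();
    	if (ret)
    		goto out_put_disk_res;
    	ret = get_nocow_lock();
    	if (ret)
    		goto out_put_dev_refs;

    	return 0;	/* success: resources stay held for the caller */

    out_put_dev_refs:
    	put_dev_refs();		/* labels release in reverse acquisition order */
    out_put_disk_res:
    	put_disk_res();
    out:
    	return ret;
    }

    int main(void)
    {
    	printf("update_init: %d\n", update_init());
    	return 0;
    }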
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index e4b50723428e..c194cbbf5b51 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -4,6 +4,7 @@
 #define _BCACHEFS_DATA_UPDATE_H
 
 #include "bkey_buf.h"
+#include "io_read.h"
 #include "io_write_types.h"
 
 struct moving_context;
@@ -15,6 +16,9 @@ struct data_update_opts {
 	u8		extra_replicas;
 	unsigned	btree_insert_flags;
 	unsigned	write_flags;
+
+	int		read_dev;
+	bool		scrub;
 };
 
 void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
@@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
 
 struct data_update {
 	/* extent being updated: */
+	bool			read_done;
 	enum btree_id		btree_id;
 	struct bkey_buf		k;
 	struct data_update_opts	data_opts;
 	struct moving_context	*ctxt;
 	struct bch_move_stats	*stats;
+
+	struct bch_read_bio	rbio;
 	struct bch_write_op	op;
+	struct bio_vec		*bvecs;
 };
 
 void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
 
 int bch2_data_update_index_update(struct bch_write_op *);
 
-void bch2_data_update_read_done(struct data_update *,
-				struct bch_extent_crc_unpacked);
+void bch2_data_update_read_done(struct data_update *);
 
 int bch2_extent_drop_ptrs(struct btree_trans *,
			  struct btree_iter *,
@@ -43,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
			  struct bch_io_opts *,
			  struct data_update_opts *);
 
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
+			       struct bch_io_opts *);
+
 void bch2_data_update_exit(struct data_update *);
 int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
			  struct moving_context *,
			  struct data_update *,
			  struct write_point_specifier,
-			  struct bch_io_opts, struct data_update_opts,
+			  struct bch_io_opts *, struct data_update_opts,
			  enum btree_id, struct bkey_s_c);
 void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
 
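Note how struct data_update now embeds both a read bio and the write op, backed by the single bvecs array allocated in bch2_data_update_bios_init(): the read fills the pages and the write hands the same pages on without copying. A trivial freestanding stand-in for that shared-backing idea (not the kernel bio API):

    #include <stdio.h>

    struct vec { void *base; unsigned len; };

    /* one vector array shared by the read side and the write side */
    struct update {
    	struct vec *read_vecs;
    	struct vec *write_vecs;
    };

    int main(void)
    {
    	static struct vec bvecs[4];
    	struct update u = { .read_vecs = bvecs, .write_vecs = bvecs };

    	/* pages filled by the read are reused by the write in place */
    	printf("read and write share pages: %s\n",
    	       u.read_vecs == u.write_vecs ? "yes" : "no");
    	return 0;
    }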
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 55333e82d1fe..788af88f6979 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -7,6 +7,7 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "btree_cache.h"
 #include "btree_io.h"
@@ -190,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 	unsigned offset = 0;
 	int ret;
 
-	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
 		prt_printf(out, "error getting device to read from: invalid device\n");
 		return;
 	}
@@ -844,8 +845,11 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
 	seqmutex_unlock(&c->btree_trans_lock);
 }
 
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
-					size_t size, loff_t *ppos)
+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
+
+static ssize_t bch2_simple_print(struct file *file, char __user *buf,
+				 size_t size, loff_t *ppos,
+				 fs_to_text_fn fn)
 {
 	struct dump_iter *i = file->private_data;
 	struct bch_fs *c = i->c;
@@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 	i->ret	= 0;
 
 	if (!i->iter) {
-		btree_deadlock_to_text(&i->buf, c);
+		fn(&i->buf, c);
 		i->iter++;
 	}
 
@@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 	return ret ?: i->ret;
 }
 
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+					size_t size, loff_t *ppos)
+{
+	return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
+}
+
 static const struct file_operations btree_deadlock_ops = {
 	.owner		= THIS_MODULE,
 	.open		= bch2_dump_open,
@@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = {
 	.read		= bch2_btree_deadlock_read,
 };
 
+static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
+				      size_t size, loff_t *ppos)
+{
+	return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
+}
+
+static const struct file_operations write_points_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_write_points_read,
+};
+
 void bch2_fs_debug_exit(struct bch_fs *c)
 {
 	if (!IS_ERR_OR_NULL(c->fs_debug_dir))
@@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
 	debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
 			    c->btree_debug, &btree_deadlock_ops);
 
+	debugfs_create_file("write_points", 0400, c->fs_debug_dir,
+			    c->btree_debug, &write_points_ops);
+
 	c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
 	if (IS_ERR_OR_NULL(c->btree_debug_dir))
 		return;
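The refactor above turns the old single-purpose read handler into bch2_simple_print(), a generic helper parameterized by a fs_to_text_fn, so each new debugfs file (like write_points here) only has to supply a printer callback. A small userspace sketch of the same dispatch pattern, with illustrative names:

    #include <stdio.h>

    struct fs { const char *name; };

    typedef void (*fs_to_text_fn)(char *buf, size_t n, struct fs *);

    /* one generic "read" helper; each file supplies its own printer */
    static void simple_print(struct fs *c, fs_to_text_fn fn)
    {
    	char buf[128];
    	fn(buf, sizeof(buf), c);
    	fputs(buf, stdout);
    }

    static void deadlock_to_text(char *buf, size_t n, struct fs *c)
    {
    	snprintf(buf, n, "%s: no deadlocks\n", c->name);
    }

    static void write_points_to_text(char *buf, size_t n, struct fs *c)
    {
    	snprintf(buf, n, "%s: 4 write points\n", c->name);
    }

    int main(void)
    {
    	struct fs c = { "bcachefs" };
    	simple_print(&c, deadlock_to_text);
    	simple_print(&c, write_points_to_text);
    	return 0;
    }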
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 600eee936f13..d7f9f79318a2 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -13,6 +13,40 @@
 
 #include <linux/dcache.h>
 
+static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+			 const struct qstr *str, struct qstr *out_cf)
+{
+	*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
+
+#ifdef CONFIG_UNICODE
+	unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
+	int ret = PTR_ERR_OR_ZERO(buf);
+	if (ret)
+		return ret;
+
+	ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
+	if (ret <= 0)
+		return ret;
+
+	*out_cf = (struct qstr) QSTR_INIT(buf, ret);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static inline int bch2_maybe_casefold(struct btree_trans *trans,
+				      const struct bch_hash_info *info,
+				      const struct qstr *str, struct qstr *out_cf)
+{
+	if (likely(!info->cf_encoding)) {
+		*out_cf = *str;
+		return 0;
+	} else {
+		return bch2_casefold(trans, info, str, out_cf);
+	}
+}
+
 static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
 {
 	if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
@@ -28,13 +62,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
 #endif
 
 	return bkey_bytes -
-		offsetof(struct bch_dirent, d_name) -
+		(d.v->d_casefold
+		 ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
+		 : offsetof(struct bch_dirent, d_name)) -
 		trailing_nuls;
 }
 
 struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
 {
-	return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+	if (d.v->d_casefold) {
+		unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+		return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
+	} else {
+		return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+	}
+}
+
+static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
+{
+	if (d.v->d_casefold) {
+		unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+		unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
+		return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
+	} else {
+		return (struct qstr) QSTR_INIT(NULL, 0);
+	}
+}
+
+static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
+{
+	return d.v->d_casefold
+		? bch2_dirent_get_casefold_name(d)
+		: bch2_dirent_get_name(d);
 }
 
 static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@@ -57,7 +116,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
 static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
 {
 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-	struct qstr name = bch2_dirent_get_name(d);
+	struct qstr name = bch2_dirent_get_lookup_name(d);
 
 	return bch2_dirent_hash(info, &name);
 }
@@ -65,7 +124,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
 static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
 {
 	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
-	const struct qstr l_name = bch2_dirent_get_name(l);
+	const struct qstr l_name = bch2_dirent_get_lookup_name(l);
 	const struct qstr *r_name = _r;
 
 	return !qstr_eq(l_name, *r_name);
@@ -75,8 +134,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 {
 	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
 	struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
-	const struct qstr l_name = bch2_dirent_get_name(l);
-	const struct qstr r_name = bch2_dirent_get_name(r);
+	const struct qstr l_name = bch2_dirent_get_lookup_name(l);
+	const struct qstr r_name = bch2_dirent_get_lookup_name(r);
 
 	return !qstr_eq(l_name, r_name);
 }
@@ -104,17 +163,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
 			 struct bkey_validate_context from)
 {
 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+	unsigned name_block_len = bch2_dirent_name_bytes(d);
 	struct qstr d_name = bch2_dirent_get_name(d);
+	struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
 	int ret = 0;
 
 	bkey_fsck_err_on(!d_name.len,
 			 c, dirent_empty_name,
 			 "empty name");
 
-	bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+	bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
 			 c, dirent_val_too_big,
-			 "value too big (%zu > %u)",
-			 bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+			 "dirent names exceed bkey size (%d + %d > %d)",
+			 d_name.len, d_cf_name.len, name_block_len);
 
 	/*
	 * Check new keys don't exceed the max length
@@ -142,6 +203,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
 			 le64_to_cpu(d.v->d_inum) == d.k->p.inode,
 			 c, dirent_to_itself,
 			 "dirent points to own directory");
+
+	if (d.v->d_casefold) {
+		bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
+				 d_cf_name.len > BCH_NAME_MAX,
+				 c, dirent_cf_name_too_big,
+				 "dirent w/ cf name too big (%u > %u)",
+				 d_cf_name.len, BCH_NAME_MAX);
+
+		bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
+				 c, dirent_stray_data_after_cf_name,
+				 "dirent has stray data after cf name's NUL");
+	}
 fsck_err:
 	return ret;
 }
@@ -163,15 +236,14 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 		prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
 }
 
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
-				subvol_inum dir, u8 type,
-				const struct qstr *name, u64 dst)
+static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
+				subvol_inum dir,
+				u8 type,
+				int name_len, int cf_name_len,
+				u64 dst)
 {
 	struct bkey_i_dirent *dirent;
-	unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
-
-	if (name->len > BCH_NAME_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
+	unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
 
 	BUG_ON(u64s > U8_MAX);
 
@@ -190,14 +262,65 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 	}
 
 	dirent->v.d_type = type;
+	dirent->v.d_unused = 0;
+	dirent->v.d_casefold = cf_name_len ? 1 : 0;
 
-	memcpy(dirent->v.d_name, name->name, name->len);
-	memset(dirent->v.d_name + name->len, 0,
-	       bkey_val_bytes(&dirent->k) -
-	       offsetof(struct bch_dirent, d_name) -
-	       name->len);
+	return dirent;
+}
 
-	EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
+				     const struct qstr *name)
+{
+	EBUG_ON(dirent->v.d_casefold);
+
+	memcpy(&dirent->v.d_name[0], name->name, name->len);
+	memset(&dirent->v.d_name[name->len], 0,
+	       bkey_val_bytes(&dirent->k) -
+	       offsetof(struct bch_dirent, d_name) -
+	       name->len);
+}
+
+static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
+					const struct qstr *name,
+					const struct qstr *cf_name)
+{
+	EBUG_ON(!dirent->v.d_casefold);
+	EBUG_ON(!cf_name->len);
+
+	dirent->v.d_cf_name_block.d_name_len = name->len;
+	dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len;
+	memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
+	memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
+	memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
+	       bkey_val_bytes(&dirent->k) -
+	       offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+	       name->len + cf_name->len);
+
+	EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
+}
+
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+				subvol_inum dir,
+				u8 type,
+				const struct qstr *name,
+				const struct qstr *cf_name,
+				u64 dst)
+{
+	struct bkey_i_dirent *dirent;
+
+	if (name->len > BCH_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
+	if (IS_ERR(dirent))
+		return dirent;
+
+	if (cf_name)
+		dirent_init_casefolded_name(dirent, name, cf_name);
+	else
+		dirent_init_regular_name(dirent, name);
+
+	EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
 
 	return dirent;
 }
@@ -213,7 +336,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
 	struct bkey_i_dirent *dirent;
 	int ret;
 
-	dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
+	dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum);
 	ret = PTR_ERR_OR_ZERO(dirent);
 	if (ret)
 		return ret;
@@ -233,16 +356,28 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
 		       const struct bch_hash_info *hash_info,
 		       u8 type, const struct qstr *name, u64 dst_inum,
 		       u64 *dir_offset,
+		       u64 *i_size,
 		       enum btree_iter_update_trigger_flags flags)
 {
 	struct bkey_i_dirent *dirent;
 	int ret;
 
-	dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+	if (hash_info->cf_encoding) {
+		struct qstr cf_name;
+		ret = bch2_casefold(trans, hash_info, name, &cf_name);
+		if (ret)
+			return ret;
+		dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum);
+	} else {
+		dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum);
+	}
+
 	ret = PTR_ERR_OR_ZERO(dirent);
 	if (ret)
 		return ret;
 
+	*i_size += bkey_bytes(&dirent->k);
+
 	ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
 			    dir, &dirent->k_i, flags);
 	*dir_offset = dirent->k.p.offset;
@@ -275,12 +410,13 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
 }
 
 int bch2_dirent_rename(struct btree_trans *trans,
-		subvol_inum src_dir, struct bch_hash_info *src_hash,
-		subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+		subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
+		subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
 		const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
 		const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
 		enum bch_rename_mode mode)
 {
+	struct qstr src_name_lookup, dst_name_lookup;
 	struct btree_iter src_iter = { NULL };
 	struct btree_iter dst_iter = { NULL };
 	struct bkey_s_c old_src, old_dst = bkey_s_c_null;
@@ -295,8 +431,11 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	memset(dst_inum, 0, sizeof(*dst_inum));
 
 	/* Lookup src: */
+	ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
+	if (ret)
+		goto out;
 	old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
-				   src_hash, src_dir, src_name,
+				   src_hash, src_dir, &src_name_lookup,
 				   BTREE_ITER_intent);
 	ret = bkey_err(old_src);
 	if (ret)
@@ -308,6 +447,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
 		goto out;
 
 	/* Lookup dst: */
+	ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
+	if (ret)
+		goto out;
 	if (mode == BCH_RENAME) {
 		/*
		 * Note that we're _not_ checking if the target already exists -
@@ -315,12 +457,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
		 * correctness:
		 */
 		ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
-				     dst_hash, dst_dir, dst_name);
+				     dst_hash, dst_dir, &dst_name_lookup);
 		if (ret)
 			goto out;
 	} else {
 		old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
-					   dst_hash, dst_dir, dst_name,
+					   dst_hash, dst_dir, &dst_name_lookup,
 					   BTREE_ITER_intent);
 		ret = bkey_err(old_dst);
 		if (ret)
@@ -336,7 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	*src_offset = dst_iter.pos.offset;
 
 	/* Create new dst key: */
-	new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+	new_dst = dirent_create_key(trans, dst_dir, 0, dst_name,
+				    dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
 	ret = PTR_ERR_OR_ZERO(new_dst);
 	if (ret)
 		goto out;
@@ -346,7 +489,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
 
 	/* Create new src key: */
 	if (mode == BCH_RENAME_EXCHANGE) {
-		new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+		new_src = dirent_create_key(trans, src_dir, 0, src_name,
+					    src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
 		ret = PTR_ERR_OR_ZERO(new_src);
 		if (ret)
 			goto out;
@@ -406,6 +550,14 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	    new_src->v.d_type == DT_SUBVOL)
 		new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
 
+	if (old_dst.k)
+		*dst_dir_i_size -= bkey_bytes(old_dst.k);
+	*src_dir_i_size -= bkey_bytes(old_src.k);
+
+	if (mode == BCH_RENAME_EXCHANGE)
+		*src_dir_i_size += bkey_bytes(&new_src->k);
+	*dst_dir_i_size += bkey_bytes(&new_dst->k);
+
 	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
 	if (ret)
 		goto out;
@@ -465,9 +617,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
			     const struct qstr *name, subvol_inum *inum,
			     unsigned flags)
 {
+	struct qstr lookup_name;
+	int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
+	if (ret)
+		return ret;
+
 	struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-					     hash_info, dir, name, flags);
-	int ret = bkey_err(k);
+					     hash_info, dir, &lookup_name, flags);
+	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
@@ -572,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 
 	return ret < 0 ? ret : 0;
 }
+
+/* fsck */
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+			      struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+				     BTREE_ITER_all_snapshots, k, ret) {
+		if (k.k->p.offset != inode_nr)
+			break;
+		if (!bkey_is_inode(k.k))
+			continue;
+		ret = bch2_inode_unpack(k, inode);
+		goto found;
+	}
+	ret = -BCH_ERR_ENOENT_inode;
+found:
+	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bch_inode_unpacked dir_inode;
+	struct bch_hash_info dir_hash_info;
+	int ret;
+
+	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
+	if (ret)
+		goto err;
+
+	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
+
+	ret =   bch2_btree_iter_traverse(&iter) ?:
+		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				    &dir_hash_info, &iter,
+				    BTREE_UPDATE_internal_snapshot_node);
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	bch_err_fn(c, ret);
+	return ret;
+}
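For casefolded dirents, d_names stores the original name first and the casefolded copy immediately after it, with the two __le16 length fields describing the split; hashing and comparison use the casefolded half, while readdir still returns the original. A freestanding sketch of that layout (simplified, little-endian handling omitted):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* simplified stand-in for the d_cf_name_block layout */
    struct cf_name_block {
    	uint8_t  d_pad;
    	uint16_t d_name_len;
    	uint16_t d_cf_name_len;
    	uint8_t  d_names[64];	/* original name first, then the casefolded copy */
    };

    int main(void)
    {
    	struct cf_name_block b = { .d_name_len = 6, .d_cf_name_len = 6 };

    	memcpy(&b.d_names[0], "ReadMe", 6);	/* name as created */
    	memcpy(&b.d_names[6], "readme", 6);	/* casefolded form, used for hash/compare */

    	printf("display: %.*s, lookup: %.*s\n",
    	       (int) b.d_name_len, (char *) &b.d_names[0],
    	       (int) b.d_cf_name_len, (char *) &b.d_names[b.d_name_len]);
    	return 0;
    }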
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 362b3b2f2f2e..0880772b80a9 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -25,10 +25,13 @@ struct bch_inode_info;
 
 struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
 
-static inline unsigned dirent_val_u64s(unsigned len)
+static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
 {
-	return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
-			    sizeof(u64));
+	unsigned bytes = cf_len
+		? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
+		: offsetof(struct bch_dirent, d_name) + len;
+
+	return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
 int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
@@ -47,7 +50,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
			enum btree_iter_update_trigger_flags);
 int bch2_dirent_create(struct btree_trans *, subvol_inum,
		       const struct bch_hash_info *, u8,
-		       const struct qstr *, u64, u64 *,
+		       const struct qstr *, u64, u64 *, u64 *,
		       enum btree_iter_update_trigger_flags);
 
 static inline unsigned vfs_d_type(unsigned type)
@@ -62,8 +65,8 @@ enum bch_rename_mode {
 };
 
 int bch2_dirent_rename(struct btree_trans *,
-		       subvol_inum, struct bch_hash_info *,
-		       subvol_inum, struct bch_hash_info *,
+		       subvol_inum, struct bch_hash_info *, u64 *,
+		       subvol_inum, struct bch_hash_info *, u64 *,
		       const struct qstr *, subvol_inum *, u64 *,
		       const struct qstr *, subvol_inum *, u64 *,
		       enum bch_rename_mode);
@@ -79,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
 int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
 int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
 
+int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
+
 #endif /* _BCACHEFS_DIRENT_H */
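A worked example of the new dirent_val_u64s(): assuming the name area starts at byte 9 in the plain layout and at byte 14 in the casefolded layout (after the pad byte and the two __le16 lengths; illustrative offsets, not taken from the header), a 10-byte name needs 3 u64s uncasefolded but 5 u64s once the casefolded copy is stored too:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* assumed offsets for the two name layouts, see the lead-in above */
    #define OFF_D_NAME	9	/* offsetof(struct bch_dirent, d_name) */
    #define OFF_D_NAMES	14	/* offsetof(..., d_cf_name_block.d_names) */

    static unsigned val_u64s(unsigned len, unsigned cf_len)
    {
    	unsigned bytes = cf_len
    		? OFF_D_NAMES + len + cf_len
    		: OFF_D_NAME + len;

    	return DIV_ROUND_UP(bytes, 8);
    }

    int main(void)
    {
    	/* 9 + 10 = 19 bytes -> 3 u64s; 14 + 10 + 10 = 34 bytes -> 5 u64s */
    	printf("plain: %u u64s, casefolded: %u u64s\n",
    	       val_u64s(10, 0), val_u64s(10, 10));
    	return 0;
    }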
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
index 5e116b88e814..a46dbddd21aa 100644
--- a/fs/bcachefs/dirent_format.h
+++ b/fs/bcachefs/dirent_format.h
@@ -29,9 +29,25 @@ struct bch_dirent {
	 * Copy of mode bits 12-15 from the target inode - so userspace can get
	 * the filetype without having to do a stat()
	 */
-	__u8		d_type;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		d_type:5,
+			d_unused:2,
+			d_casefold:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8		d_casefold:1,
+			d_unused:2,
+			d_type:5;
+#endif
 
-	__u8		d_name[];
+	union {
+	struct {
+		__u8		d_pad;
+		__le16		d_name_len;
+		__le16		d_cf_name_len;
+		__u8		d_names[];
+	} d_cf_name_block __packed;
+	__DECLARE_FLEX_ARRAY(__u8, d_name);
+	} __packed;
 } __packed __aligned(8);
 
 #define DT_SUBVOL	16
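Narrowing d_type to five bits is safe for the existing values: the standard DT_* file types all fit below 32, and bcachefs' DT_SUBVOL (16) does too, which a compile-time check can document:

    /* 5-bit d_type holds 0..31; DT_SUBVOL (16) still fits after the split */
    _Static_assert(16 < (1 << 5), "DT_SUBVOL must fit in the 5-bit d_type field");

    int main(void) { return 0; }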
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b32e91ba8be8..8a8de61429d8 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -135,6 +135,12 @@ static inline bool is_zero(char *start, char *end)
 
 #define field_end(p, member)	(((void *) (&p.member)) + sizeof(p.member))
 
+static const unsigned bch2_accounting_type_nr_counters[] = {
+#define x(f, id, nr)	[BCH_DISK_ACCOUNTING_##f] = nr,
+	BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+};
+
 int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
			     struct bkey_validate_context from)
 {
@@ -193,6 +199,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
 	bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
			 c, accounting_key_junk_at_end,
			 "junk at end of accounting key");
+
+	bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
+			 c, accounting_key_nr_counters_wrong,
+			 "accounting key with %u counters, should be %u",
+			 bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
 fsck_err:
 	return ret;
 }
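bch2_accounting_type_nr_counters[] is generated with the x-macro technique: BCH_DISK_ACCOUNTING_TYPES() is expanded once with x() defined to emit enum values and again with x() defined to emit table entries, so the list of types and their counter counts live in one place. A compilable miniature of the same trick:

    #include <stdio.h>

    #define ACCT_TYPES()		\
    	x(nr_inodes,	0, 1)	\
    	x(compression,	4, 3)

    enum acct_type {
    #define x(f, id, nr) ACCT_##f = id,	/* first expansion: the enum */
    	ACCT_TYPES()
    #undef x
    };

    static const unsigned nr_counters[] = {
    #define x(f, id, nr) [ACCT_##f] = nr,	/* second expansion: the table */
    	ACCT_TYPES()
    #undef x
    };

    int main(void)
    {
    	printf("compression counters: %u\n", nr_counters[ACCT_compression]); /* 3 */
    	return 0;
    }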
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f4372cafea2e..abb1f6206fe9 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
 static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
					      struct bkey_s_c_accounting src)
 {
-	EBUG_ON(dst->k.u64s != src.k->u64s);
-
-	for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+	for (unsigned i = 0;
+	     i < min(bch2_accounting_counters(&dst->k),
+		     bch2_accounting_counters(src.k));
+	     i++)
 		dst->v.d[i] += src.v->d[i];
+
 	if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
 		dst->k.bversion = src.k->bversion;
 }
@@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos
 
 int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
			     s64 *, unsigned, bool);
+
+#define disk_accounting_key_init(_k, _type, ...)			\
+do {									\
+	memset(&(_k), 0, sizeof(_k));					\
+	(_k).type	= BCH_DISK_ACCOUNTING_##_type;			\
+	(_k)._type	= (struct bch_acct_##_type) { __VA_ARGS__ };	\
+} while (0)
+
+#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...)		\
+({									\
+	struct disk_accounting_pos pos;					\
+	disk_accounting_key_init(pos, __VA_ARGS__);			\
+	bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc);		\
+})
+
+#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...)			\
+	bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
+
 int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
 
 int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
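disk_accounting_key_init() zeroes the key, sets the type tag by token-pasting, and fills the matching union member from __VA_ARGS__; bch2_disk_accounting_mod2() then sizes the delta array automatically via ARRAY_SIZE(). A freestanding imitation of the macro shape, with simplified types and an illustrative tag value:

    #include <stdio.h>
    #include <string.h>

    struct acct_compression { unsigned type; };

    enum { ACCT_compression = 4 };	/* illustrative tag, mirrors BCH_DISK_ACCOUNTING_compression */

    struct acct_pos {
    	unsigned type;
    	union { struct acct_compression compression; };
    };

    /* same shape as disk_accounting_key_init(): zero, tag, designated init */
    #define acct_key_init(_k, _type, ...)				\
    do {								\
    	memset(&(_k), 0, sizeof(_k));				\
    	(_k).type  = ACCT_##_type;				\
    	(_k)._type = (struct acct_##_type) { __VA_ARGS__ };	\
    } while (0)

    int main(void)
    {
    	struct acct_pos pos;

    	acct_key_init(pos, compression, .type = 2);
    	printf("tagged %u, compression type %u\n", pos.type, pos.compression.type);
    	return 0;
    }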
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 7b6e6c97e6aa..8269af1dbe2a 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 	}
 }
 
+/*
+ * field 1:		name
+ * field 2:		id
+ * field 3:		number of counters (max 3)
+ */
+
 #define BCH_DISK_ACCOUNTING_TYPES()		\
-	x(nr_inodes,		0)		\
-	x(persistent_reserved,	1)		\
-	x(replicas,		2)		\
-	x(dev_data_type,	3)		\
-	x(compression,		4)		\
-	x(snapshot,		5)		\
-	x(btree,		6)		\
-	x(rebalance_work,	7)		\
-	x(inum,			8)
+	x(nr_inodes,		0,	1)	\
+	x(persistent_reserved,	1,	1)	\
+	x(replicas,		2,	1)	\
+	x(dev_data_type,	3,	3)	\
+	x(compression,		4,	3)	\
+	x(snapshot,		5,	1)	\
+	x(btree,		6,	1)	\
+	x(rebalance_work,	7,	1)	\
+	x(inum,			8,	3)
 
 enum disk_accounting_type {
-#define x(f, nr)	BCH_DISK_ACCOUNTING_##f	= nr,
+#define x(f, nr, ...)	BCH_DISK_ACCOUNTING_##f	= nr,
 	BCH_DISK_ACCOUNTING_TYPES()
 #undef x
 	BCH_DISK_ACCOUNTING_TYPE_NR,
 };
 
-struct bch_nr_inodes {
+/*
+ * No subtypes - number of inodes in the entire filesystem
+ *
+ * XXX: perhaps we could add a per-subvolume counter?
+ */
+struct bch_acct_nr_inodes {
 };
 
-struct bch_persistent_reserved {
+/*
+ * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
+ * reservation:
+ */
+struct bch_acct_persistent_reserved {
 	__u8		nr_replicas;
 };
 
-struct bch_dev_data_type {
+/*
+ * device, data type counter fields:
+ * [
+ * nr_buckets
+ * live sectors (in buckets of that data type)
+ * sectors of internal fragmentation
+ * ]
+ *
+ * XXX: live sectors should've been done differently, you can have multiple data
+ * types in the same bucket (user, stripe, cached) and this collapses them to
+ * the bucket data type, and makes the internal fragmentation counter redundant
+ */
+struct bch_acct_dev_data_type {
 	__u8		dev;
 	__u8		data_type;
 };
 
+/*
+ * Compression type fields:
+ * [
+ * number of extents
+ * uncompressed size
+ * compressed size
+ * ]
+ *
+ * Compression ratio, average extent size (fragmentation).
+ */
 struct bch_acct_compression {
 	__u8		type;
 };
 
+/*
+ * On disk usage by snapshot id; counts same values as replicas counter, but
+ * aggregated differently
+ */
 struct bch_acct_snapshot {
 	__u32		id;
 } __packed;
@@ -137,10 +178,27 @@ struct bch_acct_btree {
 	__u32		id;
 } __packed;
 
+/*
+ * inum counter fields:
+ * [
+ * number of extents
+ * sum of extent sizes - bkey size
+ * this field is similar to inode.bi_sectors, except here extents in
+ * different snapshots but the same inode number are all collapsed to the
+ * same counter
+ * sum of on disk size - same values tracked by replicas counters
+ * ]
+ *
+ * This tracks on disk fragmentation.
+ */
 struct bch_acct_inum {
 	__u64		inum;
 } __packed;
 
+/*
+ * Simple counter of the amount of data (on disk sectors) rebalance needs to
+ * move, extents counted here are also in the rebalance_work btree.
+ */
 struct bch_acct_rebalance_work {
 };
 
@@ -149,10 +207,10 @@ struct disk_accounting_pos {
 		struct {
 			__u8			type;
 			union {
-			struct bch_nr_inodes	nr_inodes;
-			struct bch_persistent_reserved persistent_reserved;
+			struct bch_acct_nr_inodes nr_inodes;
+			struct bch_acct_persistent_reserved persistent_reserved;
 			struct bch_replicas_entry_v1 replicas;
-			struct bch_dev_data_type dev_data_type;
+			struct bch_acct_dev_data_type dev_data_type;
 			struct bch_acct_compression compression;
 			struct bch_acct_snapshot snapshot;
 			struct bch_acct_btree	btree;
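From the three compression counters a few derived numbers fall out directly: average extent size is the uncompressed size over the extent count, and the compression ratio is uncompressed over compressed. A worked example with made-up values:

    #include <stdio.h>

    int main(void)
    {
    	/* the three compression counters, per the comment above */
    	unsigned long long nr_extents   = 1000;
    	unsigned long long uncompressed = 819200;	/* sectors */
    	unsigned long long compressed   = 204800;	/* sectors */

    	printf("avg extent size: %llu sectors\n", uncompressed / nr_extents);
    	printf("compression ratio: %.2fx\n",
    	       (double) uncompressed / (double) compressed);
    	return 0;
    }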
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d2a5e76e6479..f2b9225fe0bc 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -20,6 +20,7 @@
 #include "io_read.h"
 #include "io_write.h"
 #include "keylist.h"
+#include "lru.h"
 #include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
@@ -104,6 +105,7 @@ struct ec_bio {
 	struct bch_dev		*ca;
 	struct ec_stripe_buf	*buf;
 	size_t			idx;
+	u64			submit_time;
 	struct bio		bio;
 };
 
@@ -298,10 +300,22 @@ static int mark_stripe_bucket(struct btree_trans *trans,
 	struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
 
 	if (flags & BTREE_TRIGGER_transactional) {
+		struct extent_ptr_decoded p = {
+			.ptr = *ptr,
+			.crc = bch2_extent_crc_unpack(s.k, NULL),
+		};
+		struct bkey_i_backpointer bp;
+		bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
+				      (const union bch_extent_entry *) ptr, &bp);
+
 		struct bkey_i_alloc_v4 *a =
 			bch2_trans_start_alloc_update(trans, bucket, 0);
-		ret = PTR_ERR_OR_ZERO(a) ?:
-			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+		ret   = PTR_ERR_OR_ZERO(a) ?:
+			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
+			bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
+						    !(flags & BTREE_TRIGGER_overwrite));
+		if (ret)
+			goto err;
 	}
 
 	if (flags & BTREE_TRIGGER_gc) {
@@ -366,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
 	return 0;
 }
 
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
-{
-	m->sectors	= le16_to_cpu(s->sectors);
-	m->algorithm	= s->algorithm;
-	m->nr_blocks	= s->nr_blocks;
-	m->nr_redundant	= s->nr_redundant;
-	m->disk_label	= s->disk_label;
-	m->blocks_nonempty = 0;
-
-	for (unsigned i = 0; i < s->nr_blocks; i++)
-		m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-}
-
 int bch2_trigger_stripe(struct btree_trans *trans,
			enum btree_id btree, unsigned level,
			struct bkey_s_c old, struct bkey_s _new,
@@ -399,6 +400,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 	    (new_s->nr_blocks	!= old_s->nr_blocks ||
 	     new_s->nr_redundant	!= old_s->nr_redundant));
 
+	if (flags & BTREE_TRIGGER_transactional) {
+		int ret = bch2_lru_change(trans,
+					  BCH_LRU_STRIPE_FRAGMENTATION,
+					  idx,
+					  stripe_lru_pos(old_s),
+					  stripe_lru_pos(new_s));
+		if (ret)
+			return ret;
+	}
 
 	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
 		/*
@@ -472,38 +482,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 		return ret;
 	}
 
-	if (flags & BTREE_TRIGGER_atomic) {
-		struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-		if (!m) {
-			struct printbuf buf1 = PRINTBUF;
-			struct printbuf buf2 = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf1, c, old);
-			bch2_bkey_val_to_text(&buf2, c, new);
-			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
-					    "old %s\n"
-					    "new %s", idx, buf1.buf, buf2.buf);
-			printbuf_exit(&buf2);
-			printbuf_exit(&buf1);
-			bch2_inconsistent_error(c);
-			return -1;
-		}
-
-		if (!new_s) {
-			bch2_stripes_heap_del(c, m, idx);
-
-			memset(m, 0, sizeof(*m));
-		} else {
-			stripe_to_mem(m, new_s);
-
-			if (!old_s)
-				bch2_stripes_heap_insert(c, m, idx);
-			else
-				bch2_stripes_heap_update(c, m, idx);
-		}
-	}
-
 	return 0;
 }
 
@@ -726,14 +704,15 @@ static void ec_block_endio(struct bio *bio)
 	struct bch_dev *ca = ec_bio->ca;
 	struct closure *cl = bio->bi_private;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca,
-			       bio_data_dir(bio)
-			       ? BCH_MEMBER_ERROR_write
-			       : BCH_MEMBER_ERROR_read,
-			       "erasure coding %s error: %s",
+	bch2_account_io_completion(ca, bio_data_dir(bio),
+				   ec_bio->submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
			       str_write_read(bio_data_dir(bio)),
-			       bch2_blk_status_to_str(bio->bi_status)))
+			       bch2_blk_status_to_str(bio->bi_status));
 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+	}
 
 	int stale = dev_ptr_stale(ca, ptr);
 	if (stale) {
@@ -796,6 +775,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 		ec_bio->ca			= ca;
 		ec_bio->buf			= buf;
 		ec_bio->idx			= idx;
+		ec_bio->submit_time		= local_clock();
 
 		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
 		ec_bio->bio.bi_end_io		= ec_block_endio;
@@ -917,26 +897,6 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
 
 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
 {
-	ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
-	if (idx >= h->size) {
-		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
-			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
-		mutex_lock(&c->ec_stripes_heap_lock);
-		if (n.size > h->size) {
-			memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
-			n.nr = h->nr;
-			swap(*h, n);
-		}
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		free_heap(&n);
-	}
-
-	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
-		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
 	if (c->gc_pos.phase != GC_PHASE_not_running &&
 	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
 		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1009,180 +969,50 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
 	s->idx = 0;
 }
 
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-
-	lockdep_assert_held(&c->ec_stripes_heap_lock);
-
-	if (h->nr &&
-	    h->data[0].blocks_nonempty == 0 &&
-	    !bch2_stripe_is_open(c, h->data[0].idx))
-		return h->data[0].idx;
-
-	return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
-						   size_t i)
-{
-	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
-	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
-	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
-	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
-	return ((_l->blocks_nonempty > _r->blocks_nonempty) <
-		(_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
-	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
-	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-	ec_stripes_heap *_h = (ec_stripes_heap *)h;
-	size_t i = _l - _h->data;
-	size_t j = _r - _h->data;
-
-	swap(*_l, *_r);
-
-	ec_stripes_heap_set_backpointer(_h, i);
-	ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
-	.less = ec_stripes_heap_cmp,
-	.swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-	BUG_ON(m->heap_idx >= h->nr);
-	BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
-			   struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
-	genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
-	min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
-			.idx = idx,
-			.blocks_nonempty = m->blocks_nonempty,
-		}),
-		&callbacks,
-		&c->ec_stripes_heap);
-
-	heap_verify_backpointer(c, idx);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	bool do_deletes;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
-	i = m->heap_idx;
-	min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
-	min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
-	heap_verify_backpointer(c, idx);
-
-	do_deletes = stripe_idx_to_delete(c) != 0;
-	mutex_unlock(&c->ec_stripes_heap_lock);
-
-	if (do_deletes)
-		bch2_do_stripe_deletes(c);
-}
-
 /* stripe deletion */
 
 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
 {
-	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_s_c_stripe s;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
-			       BTREE_ITER_intent);
-	ret = bkey_err(k);
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+				BTREE_ID_stripes, POS(0, idx),
+				BTREE_ITER_intent);
+	int ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	if (k.k->type != KEY_TYPE_stripe) {
-		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	s = bkey_s_c_to_stripe(k);
-	for (unsigned i = 0; i < s.v->nr_blocks; i++)
-		if (stripe_blockcount_get(s.v, i)) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -EINVAL;
-			goto err;
-		}
-
-	ret = bch2_btree_delete_at(trans, &iter, 0);
+	/*
+	 * We expect write buffer races here
+	 * Important: check stripe_is_open with stripe key locked:
	 */
+	if (k.k->type == KEY_TYPE_stripe &&
+	    !bch2_stripe_is_open(trans->c, idx) &&
+	    stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+		ret = bch2_btree_delete_at(trans, &iter, 0);
 err:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
 
+/*
+ * XXX
+ * can we kill this and delete stripes from the trigger?
+ */
 static void ec_stripe_delete_work(struct work_struct *work)
 {
 	struct bch_fs *c =
 		container_of(work, struct bch_fs, ec_stripe_delete_work);
 
-	while (1) {
-		mutex_lock(&c->ec_stripes_heap_lock);
-		u64 idx = stripe_idx_to_delete(c);
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		if (!idx)
-			break;
-
-		int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				ec_stripe_delete(trans, idx));
-		bch_err_fn(c, ret);
-		if (ret)
-			break;
-	}
-
+	bch2_trans_run(c,
+		bch2_btree_write_buffer_tryflush(trans) ?:
+		for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+				0, lru_k,
+				NULL, NULL,
+				BCH_TRANS_COMMIT_no_enospc, ({
+			ec_stripe_delete(trans, lru_k.k->p.offset);
+		})));
 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
 
@@ -1294,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,

		bch2_fs_inconsistent(c, "%s", buf.buf);
		printbuf_exit(&buf);
-		return -EIO;
+		return -BCH_ERR_erasure_coding_found_btree_node;
	}

	k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1360,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b

	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
	if (!ca)
-		return -EIO;
+		return -BCH_ERR_ENOENT_dev_not_found;

	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);

@@ -1380,8 +1210,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
		if (bp_k.k->type != KEY_TYPE_backpointer)
			continue;

+		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+		if (bp.v->btree_id == BTREE_ID_stripes)
+			continue;
+
		ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
-					bkey_s_c_to_backpointer(bp_k), &last_flushed);
+					bp, &last_flushed);
	}));

	bch2_bkey_buf_exit(&last_flushed, c);
@@ -1393,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
-	int ret = 0;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;

-	ret = bch2_btree_write_buffer_flush_sync(trans);
+	int ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

-	for (i = 0; i < nr_data; i++) {
+	for (unsigned i = 0; i < nr_data; i++) {
		ret = ec_stripe_update_bucket(trans, s, i);
		if (ret)
			break;
	}
err:
	bch2_trans_put(trans);
-
	return ret;
}

@@ -1473,6 +1305,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets");
+		ret = s->err;
		goto err;
	}

@@ -1481,6 +1314,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)

		if (ec_do_recov(c, &s->existing_stripe)) {
			bch_err(c, "error creating stripe: error reading existing stripe");
+			ret = -BCH_ERR_ec_block_read;
			goto err;
		}

@@ -1506,6 +1340,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)

	if (ec_nr_failed(&s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets");
+		ret = -BCH_ERR_ec_block_write;
		goto err;
	}

@@ -1527,6 +1362,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
	if (ret)
		goto err;
err:
+	trace_stripe_create(c, s->idx, ret);
+
	bch2_disk_reservation_put(c, &s->res);

	for (i = 0; i < v->nr_blocks; i++)
@@ -1612,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int
	ec_stripe_new_set_pending(c, h);
}

-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
{
	struct ec_stripe_new *s = ob->ec;

-	s->err = -EIO;
+	s->err = err;
}

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
@@ -1968,39 +1805,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
	return 0;
}

-static s64 get_existing_stripe(struct bch_fs *c,
-			       struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+				 struct ec_stripe_head *head,
+				 struct ec_stripe_buf *stripe,
+				 u64 idx)
{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t heap_idx;
-	u64 stripe_idx;
-	s64 ret = -1;
-
-	if (may_create_new_stripe(c))
-		return -1;
+	struct bch_fs *c = trans->c;

-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
-		/* No blocks worth reusing, stripe will just be deleted: */
-		if (!h->data[heap_idx].blocks_nonempty)
-			continue;
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+				BTREE_ID_stripes, POS(0, idx), 0);
+	int ret = bkey_err(k);
+	if (ret)
+		goto err;

-		stripe_idx = h->data[heap_idx].idx;
+	/* We expect write buffer races here */
+	if (k.k->type != KEY_TYPE_stripe)
+		goto out;

-		m = genradix_ptr(&c->stripes, stripe_idx);
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	if (stripe_lru_pos(s.v) <= 1)
+		goto out;

-		if (m->disk_label == head->disk_label &&
-		    m->algorithm == head->algo &&
-		    m->nr_redundant == head->redundancy &&
-		    m->sectors == head->blocksize &&
-		    m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
-		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
-			ret = stripe_idx;
-			break;
-		}
+	if (s.v->disk_label == head->disk_label &&
+	    s.v->algorithm == head->algo &&
+	    s.v->nr_redundant == head->redundancy &&
+	    le16_to_cpu(s.v->sectors) == head->blocksize &&
+	    bch2_try_open_stripe(c, head->s, idx)) {
+		bkey_reassemble(&stripe->key, k);
+		ret = 1;
	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+	bch2_set_btree_iter_dontneed(&iter);
+err:
+	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

@@ -2052,24 +1890,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
				   struct ec_stripe_new *s)
{
	struct bch_fs *c = trans->c;
-	s64 idx;
-	int ret;

	/*
	 * If we can't allocate a new stripe, and there's no stripes with empty
	 * blocks for us to reuse, that means we have to wait on copygc:
	 */
-	idx = get_existing_stripe(c, h);
-	if (idx < 0)
-		return -BCH_ERR_stripe_alloc_blocked;
+	if (may_create_new_stripe(c))
+		return -1;

-	ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
-	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
-			     "reading stripe key: %s", bch2_err_str(ret));
-	if (ret) {
-		bch2_stripe_close(c, s);
-		return ret;
+	struct btree_iter lru_iter;
+	struct bkey_s_c lru_k;
+	int ret = 0;
+
+	for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+			0, lru_k, ret) {
+		ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+		if (ret)
+			break;
	}
+	bch2_trans_iter_exit(trans, &lru_iter);
+	if (!ret)
+		ret = -BCH_ERR_stripe_alloc_blocked;
+	if (ret == 1)
+		ret = 0;
+	if (ret)
+		return ret;

	return init_new_stripe_from_existing(c, s);
}
@@ -2367,46 +2214,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)

int bch2_stripes_read(struct bch_fs *c)
{
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			if (k.k->type != KEY_TYPE_stripe)
-				continue;
-
-			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
-			if (ret)
-				break;
-
-			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
-			stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
-			bch2_stripes_heap_insert(c, m, k.k->p.offset);
-			0;
-		})));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
-		m = genradix_ptr(&c->stripes, h->data[i].idx);
-
-		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
-		       h->data[i].blocks_nonempty,
-		       m->nr_blocks - m->nr_redundant,
-		       m->nr_redundant);
-		if (bch2_stripe_is_open(c, h->data[i].idx))
-			prt_str(out, " open");
-		prt_newline(out);
-	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
+	return 0;
}

static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2477,15 +2285,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)

	BUG_ON(!list_empty(&c->ec_stripe_new_list));

-	free_heap(&c->ec_stripes_heap);
-	genradix_free(&c->stripes);
	bioset_exit(&c->ec_bioset);
}

void bch2_fs_ec_init_early(struct bch_fs *c)
{
	spin_lock_init(&c->ec_stripes_new_lock);
-	mutex_init(&c->ec_stripes_heap_lock);

	INIT_LIST_HEAD(&c->ec_stripe_head_list);
	mutex_init(&c->ec_stripe_head_lock);
@@ -2503,3 +2308,40 @@ int bch2_fs_ec_init(struct bch_fs *c)
	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}
+
+static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
+					struct bkey_s_c k,
+					struct bkey_buf *last_flushed)
+{
+	if (k.k->type != KEY_TYPE_stripe)
+		return 0;
+
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+	u64 lru_idx = stripe_lru_pos(s.v);
+	if (lru_idx) {
+		int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
+					     k.k->p.offset, lru_idx, k, last_flushed);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
+{
+	struct bkey_buf last_flushed;
+
+	bch2_bkey_buf_init(&last_flushed);
+	bkey_init(&last_flushed.k->k);
+
+	int ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
+				POS_MIN, BTREE_ITER_prefetch, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
+
+	bch2_bkey_buf_exit(&last_flushed, c);
+	bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 583ca6a226da..62d27e04d763 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s,
	memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
}

+#define STRIPE_LRU_POS_EMPTY	1
+
+static inline u64 stripe_lru_pos(const struct bch_stripe *s)
+{
+	if (!s)
+		return 0;
+
+	unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
+
+	for (unsigned i = 0; i < nr_data; i++)
+		blocks_empty += !stripe_blockcount_get(s, i);
+
+	/* Will be picked up by the stripe_delete worker */
+	if (blocks_empty == nr_data)
+		return STRIPE_LRU_POS_EMPTY;
+
+	if (!blocks_empty)
+		return 0;
+
+	/* invert: more blocks empty = reuse first */
+	return LRU_TIME_MAX - blocks_empty;
+}
+
static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
					     const struct bch_extent_ptr *data_ptr,
					     unsigned sectors)
@@ -132,6 +155,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
			    m->sectors);
}

+static inline void gc_stripe_unlock(struct gc_stripe *s)
+{
+	BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
+	wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
+}
+
+static inline void gc_stripe_lock(struct gc_stripe *s)
+{
+	wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
+			 TASK_UNINTERRUPTIBLE);
+}
+
struct bch_read_bio;

struct ec_stripe_buf {
@@ -212,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey

void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);

-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);

int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);

@@ -221,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
					       unsigned, unsigned, unsigned,
					       enum bch_watermark, struct closure *);

-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
@@ -261,11 +294,12 @@ void bch2_fs_ec_flush(struct bch_fs *);

int bch2_stripes_read(struct bch_fs *);

-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);

void bch2_fs_ec_exit(struct bch_fs *);
void bch2_fs_ec_init_early(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);

+int bch2_check_stripe_to_lru_refs(struct bch_fs *);
+
#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 8d1e70e830ac..06144bfd9c19 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -20,23 +20,15 @@ struct stripe {
};

struct gc_stripe {
+	u8			lock;
+	unsigned		alive:1; /* does a corresponding key exist in stripes btree? */
	u16			sectors;
-
	u8			nr_blocks;
	u8			nr_redundant;
-
-	unsigned		alive:1; /* does a corresponding key exist in stripes btree? */
	u16			block_sectors[BCH_BKEY_PTRS_MAX];
	struct bch_extent_ptr	ptrs[BCH_BKEY_PTRS_MAX];

	struct bch_replicas_padded r;
};

-struct ec_stripe_heap_entry {
-	size_t			idx;
-	unsigned		blocks_nonempty;
-};
-
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
-
#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 4590cd0c7c90..101806d7ebe1 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -116,9 +116,11 @@
	x(ENOENT,			ENOENT_snapshot_tree)			\
	x(ENOENT,			ENOENT_dirent_doesnt_match_inode)	\
	x(ENOENT,			ENOENT_dev_not_found)			\
+	x(ENOENT,			ENOENT_dev_bucket_not_found)		\
	x(ENOENT,			ENOENT_dev_idx_not_found)		\
	x(ENOENT,			ENOENT_inode_no_backpointer)		\
	x(ENOENT,			ENOENT_no_snapshot_tree_subvol)		\
+	x(ENOENT,			btree_node_dying)			\
	x(ENOTEMPTY,			ENOTEMPTY_dir_not_empty)		\
	x(ENOTEMPTY,			ENOTEMPTY_subvol_not_empty)		\
	x(EEXIST,			EEXIST_str_hash_set)			\
@@ -180,6 +182,12 @@
	x(EINVAL,			not_in_recovery)			\
	x(EINVAL,			cannot_rewind_recovery)			\
	x(0,				data_update_done)			\
+	x(BCH_ERR_data_update_done,	data_update_done_would_block)		\
+	x(BCH_ERR_data_update_done,	data_update_done_unwritten)		\
+	x(BCH_ERR_data_update_done,	data_update_done_no_writes_needed)	\
+	x(BCH_ERR_data_update_done,	data_update_done_no_snapshot)		\
+	x(BCH_ERR_data_update_done,	data_update_done_no_dev_refs)		\
+	x(BCH_ERR_data_update_done,	data_update_done_no_rw_devs)		\
	x(EINVAL,			device_state_not_allowed)		\
	x(EINVAL,			member_info_missing)			\
	x(EINVAL,			mismatched_block_size)			\
@@ -200,6 +208,8 @@
	x(EINVAL,			no_resize_with_buckets_nouse)		\
	x(EINVAL,			inode_unpack_error)			\
	x(EINVAL,			varint_decode_error)			\
+	x(EINVAL,			erasure_coding_found_btree_node)	\
+	x(EOPNOTSUPP,			may_not_use_incompat_feature)		\
	x(EROFS,			erofs_trans_commit)			\
	x(EROFS,			erofs_no_writes)			\
	x(EROFS,			erofs_journal_err)			\
@@ -210,10 +220,18 @@
	x(EROFS,			insufficient_devices)			\
	x(0,				operation_blocked)			\
	x(BCH_ERR_operation_blocked,	btree_cache_cannibalize_lock_blocked)	\
-	x(BCH_ERR_operation_blocked,	journal_res_get_blocked)		\
-	x(BCH_ERR_operation_blocked,	journal_preres_get_blocked)		\
-	x(BCH_ERR_operation_blocked,	bucket_alloc_blocked)			\
-	x(BCH_ERR_operation_blocked,	stripe_alloc_blocked)			\
+	x(BCH_ERR_operation_blocked,	journal_res_blocked)			\
+	x(BCH_ERR_journal_res_blocked,	journal_blocked)			\
+	x(BCH_ERR_journal_res_blocked,	journal_max_in_flight)			\
+	x(BCH_ERR_journal_res_blocked,	journal_max_open)			\
+	x(BCH_ERR_journal_res_blocked,	journal_full)				\
+	x(BCH_ERR_journal_res_blocked,	journal_pin_full)			\
+	x(BCH_ERR_journal_res_blocked,	journal_buf_enomem)			\
+	x(BCH_ERR_journal_res_blocked,	journal_stuck)				\
+	x(BCH_ERR_journal_res_blocked,	journal_retry_open)			\
+	x(BCH_ERR_journal_res_blocked,	journal_preres_get_blocked)		\
+	x(BCH_ERR_journal_res_blocked,	bucket_alloc_blocked)			\
+	x(BCH_ERR_journal_res_blocked,	stripe_alloc_blocked)			\
	x(BCH_ERR_invalid,		invalid_sb)				\
	x(BCH_ERR_invalid_sb,		invalid_sb_magic)			\
	x(BCH_ERR_invalid_sb,		invalid_sb_version)			\
@@ -223,6 +241,7 @@
	x(BCH_ERR_invalid_sb,		invalid_sb_csum)			\
	x(BCH_ERR_invalid_sb,		invalid_sb_block_size)			\
	x(BCH_ERR_invalid_sb,		invalid_sb_uuid)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_offset)			\
	x(BCH_ERR_invalid_sb,		invalid_sb_too_many_members)		\
	x(BCH_ERR_invalid_sb,		invalid_sb_dev_idx)			\
	x(BCH_ERR_invalid_sb,		invalid_sb_time_precision)		\
@@ -250,6 +269,7 @@
	x(BCH_ERR_operation_blocked,	nocow_lock_blocked)			\
	x(EIO,				journal_shutdown)			\
	x(EIO,				journal_flush_err)			\
+	x(EIO,				journal_write_err)			\
	x(EIO,				btree_node_read_err)			\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_cached)		\
	x(EIO,				sb_not_downgraded)			\
@@ -258,17 +278,52 @@
	x(EIO,				btree_node_read_validate_error)		\
	x(EIO,				btree_need_topology_repair)		\
	x(EIO,				bucket_ref_update)			\
+	x(EIO,				trigger_alloc)				\
	x(EIO,				trigger_pointer)			\
	x(EIO,				trigger_stripe_pointer)			\
	x(EIO,				metadata_bucket_inconsistency)		\
	x(EIO,				mark_stripe)				\
	x(EIO,				stripe_reconstruct)			\
	x(EIO,				key_type_error)				\
-	x(EIO,				no_device_to_read_from)			\
+	x(EIO,				extent_poisened)			\
	x(EIO,				missing_indirect_extent)		\
	x(EIO,				invalidate_stripe_to_dev)		\
	x(EIO,				no_encryption_key)			\
	x(EIO,				insufficient_journal_devices)		\
+	x(EIO,				device_offline)				\
+	x(EIO,				EIO_fault_injected)			\
+	x(EIO,				ec_block_read)				\
+	x(EIO,				ec_block_write)				\
+	x(EIO,				recompute_checksum)			\
+	x(EIO,				decompress)				\
+	x(BCH_ERR_decompress,		decompress_exceeded_max_encoded_extent)	\
+	x(BCH_ERR_decompress,		decompress_lz4)				\
+	x(BCH_ERR_decompress,		decompress_gzip)			\
+	x(BCH_ERR_decompress,		decompress_zstd_src_len_bad)		\
+	x(BCH_ERR_decompress,		decompress_zstd)			\
+	x(EIO,				data_write)				\
+	x(BCH_ERR_data_write,		data_write_io)				\
+	x(BCH_ERR_data_write,		data_write_csum)			\
+	x(BCH_ERR_data_write,		data_write_invalid_ptr)			\
+	x(BCH_ERR_data_write,		data_write_misaligned)			\
+	x(BCH_ERR_decompress,		data_read)				\
+	x(BCH_ERR_data_read,		no_device_to_read_from)			\
+	x(BCH_ERR_data_read,		data_read_io_err)			\
+	x(BCH_ERR_data_read,		data_read_csum_err)			\
+	x(BCH_ERR_data_read,		data_read_retry)			\
+	x(BCH_ERR_data_read_retry,	data_read_retry_avoid)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline)		\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err)	\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err)		\
+	x(BCH_ERR_data_read_retry,	data_read_retry_csum_err_maybe_userspace)\
+	x(BCH_ERR_data_read,		data_read_decompress_err)		\
+	x(BCH_ERR_data_read,		data_read_decrypt_err)			\
+	x(BCH_ERR_data_read,		data_read_ptr_stale_race)		\
+	x(BCH_ERR_data_read_retry,	data_read_ptr_stale_retry)		\
+	x(BCH_ERR_data_read,		data_read_no_encryption_key)		\
+	x(BCH_ERR_data_read,		data_read_buffer_too_small)		\
+	x(BCH_ERR_data_read,		data_read_key_overwritten)		\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 038da6a61f6b..207f35d3cce2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -3,8 +3,8 @@
 #include "btree_cache.h"
 #include "btree_iter.h"
 #include "error.h"
-#include "fs-common.h"
 #include "journal.h"
+#include "namei.h"
 #include "recovery_passes.h"
 #include "super.h"
 #include "thread_with_file.h"
@@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
	struct bch_fs *c = ca->fs;
-	bool dev;
+
+	/* XXX: if it's reads or checksums that are failing, set it to failed */

	down_write(&c->state_lock);
-	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
-				    BCH_FORCE_IF_DEGRADED);
-	if (dev
-	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
-				   BCH_FORCE_IF_DEGRADED)
-	    : bch2_fs_emergency_read_only(c))
+	unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+	if (write_errors_start &&
+	    time_after(jiffies,
+		       write_errors_start + c->opts.write_error_timeout * HZ)) {
+		if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+			goto out;
+
+		bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+						 BCH_FORCE_IF_DEGRADED);
+
		bch_err(ca,
-			"too many IO errors, setting %s RO",
+			"writes erroring for %u seconds, setting %s ro",
+			c->opts.write_error_timeout,
			dev ? "device" : "filesystem");
+		if (!dev)
+			bch2_fs_emergency_read_only(c);
+
+	}
+out:
	up_write(&c->state_lock);
}

void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
	atomic64_inc(&ca->errors[type]);
-	//queue_work(system_long_wq, &ca->io_error_work);
+
+	if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+		ca->write_errors_start = jiffies;
+
+	queue_work(system_long_wq, &ca->io_error_work);
}

enum ask_yn {
@@ -530,35 +546,59 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
	mutex_unlock(&c->fsck_error_msgs_lock);
}

-int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+				   subvol_inum inum, u64 offset)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

-	/* XXX: we don't yet attempt to print paths when we don't know the subvol */
-	if (inum.subvol)
-		ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
+	if (inum.subvol) {
+		ret = bch2_inum_to_path(trans, inum, out);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			return ret;
+	}
	if (!inum.subvol || ret)
		prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
+	prt_printf(out, " offset %llu: ", offset);

	return trans_was_restarted(trans, restart_count);
}

-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
-				   subvol_inum inum, u64 offset)
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+			      subvol_inum inum, u64 offset)
{
-	int ret = bch2_inum_err_msg_trans(trans, out, inum);
-	prt_printf(out, " offset %llu: ", offset);
-	return ret;
+	bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
}

-void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+					struct bpos pos)
{
-	bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
+	struct bch_fs *c = trans->c;
+	int ret = 0;
+
+	if (!bch2_snapshot_is_leaf(c, pos.snapshot))
+		prt_str(out, "(multiple snapshots) ");
+
+	subvol_inum inum = {
+		.subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot),
+		.inum	= pos.inode,
+	};
+
+	if (inum.subvol) {
+		ret = bch2_inum_to_path(trans, inum, out);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			return ret;
+	}
+
+	if (!inum.subvol || ret)
+		prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot);
+
+	prt_printf(out, " offset %llu: ", pos.offset << 8);
+	return 0;
}

-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
-			      subvol_inum inum, u64 offset)
+void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+				   struct bpos pos)
{
-	bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
+	bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 7acf2a27ca28..7d3f0e2a5fd6 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);

-#define bch2_dev_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_dev_ratelimited(ca, __VA_ARGS__);		\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
-
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_inum_offset_ratelimited(ca, __VA_ARGS__);	\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+static inline void bch2_account_io_success_fail(struct bch_dev *ca,
+						enum bch_member_error_type type,
+						bool success)
+{
+	if (likely(success)) {
+		if (type == BCH_MEMBER_ERROR_write &&
+		    ca->write_errors_start)
+			ca->write_errors_start = 0;
+	} else {
+		bch2_io_error(ca, type);
+	}
+}
+
+static inline void bch2_account_io_completion(struct bch_dev *ca,
+					      enum bch_member_error_type type,
+					      u64 submit_time, bool success)
+{
+	if (unlikely(!ca))
+		return;
+
+	if (type != BCH_MEMBER_ERROR_checksum)
+		bch2_latency_acct(ca, submit_time, type);
+
+	bch2_account_io_success_fail(ca, type, success);
+}

-int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);

-void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);

+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
+void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
+
#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 2d8042f853dc..ca2073db472b 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -28,6 +28,13 @@
 #include "trace.h"
 #include "util.h"

+static const char * const bch2_extent_flags_strs[] = {
+#define x(n, v)  [BCH_EXTENT_FLAG_##n] = #n,
+	BCH_EXTENT_FLAGS()
+#undef x
+	NULL,
+};
+
static unsigned bch2_crc_field_size_max[] = {
	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}

void bch2_mark_io_failure(struct bch_io_failures *failed,
-			  struct extent_ptr_decoded *p)
+			  struct extent_ptr_decoded *p,
+			  bool csum_error)
{
	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);

@@ -59,53 +67,57 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));

		f = &failed->devs[failed->nr++];
-		f->dev		= p->ptr.dev;
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else if (p->idx != f->idx) {
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else {
-		f->nr_failed++;
+		memset(f, 0, sizeof(*f));
+		f->dev = p->ptr.dev;
	}
+
+	if (p->do_ec_reconstruct)
+		f->failed_ec = true;
+	else if (!csum_error)
+		f->failed_io = true;
+	else
+		f->failed_csum_nr++;
}

-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+static inline u64 dev_latency(struct bch_dev *ca)
{
-	struct bch_dev *ca = bch2_dev_rcu(c, dev);
	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}

+static inline int dev_failed(struct bch_dev *ca)
+{
+	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+}
+
/*
 * returns true if p1 is better than p2:
 */
static inline bool ptr_better(struct bch_fs *c,
			      const struct extent_ptr_decoded p1,
-			      const struct extent_ptr_decoded p2)
+			      u64 p1_latency,
+			      struct bch_dev *ca1,
+			      const struct extent_ptr_decoded p2,
+			      u64 p2_latency)
{
-	if (likely(!p1.idx && !p2.idx)) {
-		u64 l1 = dev_latency(c, p1.ptr.dev);
-		u64 l2 = dev_latency(c, p2.ptr.dev);
+	struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);

-		/*
-		 * Square the latencies, to bias more in favor of the faster
-		 * device - we never want to stop issuing reads to the slower
-		 * device altogether, so that we can update our latency numbers:
-		 */
-		l1 *= l1;
-		l2 *= l2;
+	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+	if (unlikely(failed_delta))
+		return failed_delta < 0;

-		/* Pick at random, biased in favor of the faster device: */
+	if (unlikely(bch2_force_reconstruct_read))
+		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;

-		return bch2_get_random_u64_below(l1 + l2) > l1;
-	}
+	if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
+		return p1.do_ec_reconstruct < p2.do_ec_reconstruct;

-	if (bch2_force_reconstruct_read)
-		return p1.idx > p2.idx;
+	int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
+	if (unlikely(crc_retry_delta))
+		return crc_retry_delta < 0;

-	return p1.idx < p2.idx;
+	/* Pick at random, biased in favor of the faster device: */
+
+	return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}

/*
@@ -115,64 +127,104 @@ static inline bool ptr_better(struct bch_fs *c,
 */
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_failures *failed,
-			       struct extent_ptr_decoded *pick)
+			       struct extent_ptr_decoded *pick,
+			       int dev)
{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	struct bch_dev_io_failures *f;
-	int ret = 0;
+	bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
+	bool have_dirty_ptrs = false, have_pick = false;

	if (k.k->type == KEY_TYPE_error)
		return -BCH_ERR_key_type_error;

	rcu_read_lock();
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	u64 pick_latency;
+
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		have_dirty_ptrs |= !p.ptr.cached;
+
		/*
		 * Unwritten extent: no need to actually read, treat it as a
		 * hole and return 0s:
		 */
		if (p.ptr.unwritten) {
-			ret = 0;
-			break;
+			rcu_read_unlock();
+			return 0;
		}

-		/*
-		 * If there are any dirty pointers it's an error if we can't
-		 * read:
-		 */
-		if (!ret && !p.ptr.cached)
-			ret = -BCH_ERR_no_device_to_read_from;
+		/* Are we being asked to read from a specific device? */
+		if (dev >= 0 && p.ptr.dev != dev)
+			continue;

		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);

		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
			continue;

-		f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
-		if (f)
-			p.idx = f->nr_failed < f->nr_retries
-				? f->idx
-				: f->idx + 1;
+		struct bch_dev_io_failures *f =
+			unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+		if (unlikely(f)) {
+			p.crc_retry_nr	= f->failed_csum_nr;
+			p.has_ec	&= ~f->failed_ec;

-		if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
-			p.idx++;
+			if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
+				have_io_errors	|= f->failed_io;
+				have_io_errors	|= f->failed_ec;
+			}
+			have_csum_errors |= !!f->failed_csum_nr;

-		if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
-			p.idx++;
+			if (p.has_ec && (f->failed_io || f->failed_csum_nr))
+				p.do_ec_reconstruct = true;
+			else if (f->failed_io ||
+				 f->failed_csum_nr > c->opts.checksum_err_retry_nr)
+				continue;
+		}

-		if (p.idx > (unsigned) p.has_ec)
-			continue;
+		have_missing_devs |= ca && !bch2_dev_is_online(ca);

-		if (ret > 0 && !ptr_better(c, p, *pick))
-			continue;
+		if (!ca || !bch2_dev_is_online(ca)) {
+			if (!p.has_ec)
+				continue;
+			p.do_ec_reconstruct = true;
+		}
+
+		if (bch2_force_reconstruct_read && p.has_ec)
+			p.do_ec_reconstruct = true;

-		*pick = p;
-		ret = 1;
+		u64 p_latency = dev_latency(ca);
+		/*
+		 * Square the latencies, to bias more in favor of the faster
+		 * device - we never want to stop issuing reads to the slower
+		 * device altogether, so that we can update our latency numbers:
+		 */
+		p_latency *= p_latency;
+
+		if (!have_pick ||
+		    ptr_better(c,
+			       p, p_latency, ca,
+			       *pick, pick_latency)) {
+			*pick = p;
+			pick_latency = p_latency;
+			have_pick = true;
+		}
	}
	rcu_read_unlock();

-	return ret;
+	if (have_pick)
+		return 1;
+	if (!have_dirty_ptrs)
+		return 0;
+	if (have_missing_devs)
+		return -BCH_ERR_no_device_to_read_from;
+	if (have_csum_errors)
+		return -BCH_ERR_data_read_csum_err;
+	if (have_io_errors)
+		return -BCH_ERR_data_read_io_err;
+
+	WARN_ONCE(1, "unhandled error case in %s\n", __func__);
+	return -EINVAL;
}

/* KEY_TYPE_btree_ptr: */
@@ -536,29 +588,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
				 struct bch_extent_crc_unpacked src,
				 enum bch_extent_entry_type type)
{
-#define set_common_fields(_dst, _src)					\
-		_dst.type		= 1 << type;			\
-		_dst.csum_type		= _src.csum_type,		\
-		_dst.compression_type	= _src.compression_type,	\
-		_dst._compressed_size	= _src.compressed_size - 1,	\
-		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
-		_dst.offset		= _src.offset
+#define common_fields(_src)						\
+		.type			= BIT(type),			\
+		.csum_type		= _src.csum_type,		\
+		.compression_type	= _src.compression_type,	\
+		._compressed_size	= _src.compressed_size - 1,	\
+		._uncompressed_size	= _src.uncompressed_size - 1,	\
+		.offset			= _src.offset

	switch (type) {
	case BCH_EXTENT_ENTRY_crc32:
-		set_common_fields(dst->crc32, src);
-		dst->crc32.csum		= (u32 __force) *((__le32 *) &src.csum.lo);
+		dst->crc32 = (struct bch_extent_crc32) {
+			common_fields(src),
+			.csum		= (u32 __force) *((__le32 *) &src.csum.lo),
+		};
		break;
	case BCH_EXTENT_ENTRY_crc64:
-		set_common_fields(dst->crc64, src);
-		dst->crc64.nonce	= src.nonce;
-		dst->crc64.csum_lo	= (u64 __force) src.csum.lo;
-		dst->crc64.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi);
+		dst->crc64 = (struct bch_extent_crc64) {
+			common_fields(src),
+			.nonce		= src.nonce,
+			.csum_lo	= (u64 __force) src.csum.lo,
+			.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi),
+		};
		break;
	case BCH_EXTENT_ENTRY_crc128:
-		set_common_fields(dst->crc128, src);
-		dst->crc128.nonce	= src.nonce;
-		dst->crc128.csum	= src.csum;
+		dst->crc128 = (struct bch_extent_crc128) {
+			common_fields(src),
+			.nonce		= src.nonce,
+			.csum		= src.csum,
+		};
		break;
	default:
		BUG();
@@ -997,7 +1055,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,

	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);

-	return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+	return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}

void bch2_extent_ptr_set_cached(struct bch_fs *c,
@@ -1220,6 +1278,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
			break;

+		case BCH_EXTENT_ENTRY_flags:
+			prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
+			break;
+
		default:
			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
			return;
@@ -1381,6 +1443,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
#endif
			break;
		}
+		case BCH_EXTENT_ENTRY_flags:
+			bkey_fsck_err_on(entry != ptrs.start,
+					 c, extent_flags_not_at_start,
+					 "extent flags entry not at start");
+			break;
		}
	}

@@ -1447,6 +1514,28 @@ void bch2_ptr_swab(struct bkey_s k)
	}
}

+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
+{
+	int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
+	if (ret)
+		return ret;
+
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+	if (ptrs.start != ptrs.end &&
+	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
+		ptrs.start->flags.flags = flags;
+	} else {
+		struct bch_extent_flags f = {
+			.type	= BIT(BCH_EXTENT_ENTRY_flags),
+			.flags	= flags,
+		};
+		__extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
+	}
+
+	return 0;
+}
+
/* Generic extent code: */

int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1492,8 +1581,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
				entry->crc128.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_stripe_ptr:
-				break;
			case BCH_EXTENT_ENTRY_rebalance:
+			case BCH_EXTENT_ENTRY_flags:
				break;
			}

diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 204d765dd74c..e78a39e7e18f 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({									\
	__label__ out;							\
									\
-	(_ptr).idx	= 0;						\
-	(_ptr).has_ec	= false;					\
+	(_ptr).has_ec		= false;				\
+	(_ptr).do_ec_reconstruct = false;				\
+	(_ptr).crc_retry_nr	= 0;					\
									\
	__bkey_extent_entry_for_each_from(_entry, _end, _entry)	\
		switch (__extent_entry_type(_entry)) {			\
@@ -401,10 +402,10 @@ out:									\
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
						 unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
-			  struct extent_ptr_decoded *);
+			  struct extent_ptr_decoded *, bool);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
			       struct bch_io_failures *,
-			       struct extent_ptr_decoded *);
+			       struct extent_ptr_decoded *, int);

/* KEY_TYPE_btree_ptr: */

@@ -753,4 +754,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
	k->size = new_size;
}

+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
+{
+	if (ptrs.start != ptrs.end &&
+	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
+		return ptrs.start->flags.flags;
+	return 0;
+}
+
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
+{
+	return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
+}
+
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
index c198dfc376d6..74c0252cbd98 100644
--- a/fs/bcachefs/extents_format.h
+++ b/fs/bcachefs/extents_format.h
@@ -79,8 +79,9 @@
	x(crc64,		2)			\
	x(crc128,		3)			\
	x(stripe_ptr,		4)			\
-	x(rebalance,		5)
-#define BCH_EXTENT_ENTRY_MAX	6
+	x(rebalance,		5)			\
+	x(flags,		6)
+#define BCH_EXTENT_ENTRY_MAX	7

enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
#endif
};

+#define BCH_EXTENT_FLAGS()		\
+	x(poisoned,	0)
+
+enum bch_extent_flags_e {
+#define x(n, v)	BCH_EXTENT_FLAG_##n = v,
+	BCH_EXTENT_FLAGS()
+#undef x
+};
+
+struct bch_extent_flags {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:7,
+				flags:57;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			flags:57,
+				type:7;
+#endif
+};
+
/* bch_extent_rebalance: */
#include "rebalance_format.h"

diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index 43d6c341ecca..e51529dca4c2 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
};

struct extent_ptr_decoded {
-	unsigned		idx;
	bool			has_ec;
+	bool			do_ec_reconstruct;
+	u8			crc_retry_nr;
	struct bch_extent_crc_unpacked crc;
	struct bch_extent_ptr	ptr;
	struct bch_extent_stripe_ptr ec;
@@ -31,10 +32,10 @@ struct bch_io_failures {
	u8			nr;
	struct bch_dev_io_failures {
		u8		dev;
-		u8		idx;
-		u8		nr_failed;
-		u8		nr_retries;
-	} devs[BCH_REPLICAS_MAX];
+		unsigned	failed_csum_nr:6,
+				failed_io:1,
+				failed_ec:1;
+	} devs[BCH_REPLICAS_MAX + 1];
};

#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
index 2eaffe37b5e7..0e742555cb0a 100644
--- a/fs/bcachefs/eytzinger.c
+++ b/fs/bcachefs/eytzinger.c
@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr
	return cmp(a, b, priv);
}

-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
				    cmp_r_func_t cmp_func, const void *priv,
				    size_t l, size_t r)
{
-	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
-		      base + inorder_to_eytzinger0(r, n) * size,
+	return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
+		      base1 + inorder_to_eytzinger1(r, n) * size,
		      cmp_func, priv);
}

-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
				      swap_r_func_t swap_func, const void *priv,
				      size_t l, size_t r)
{
-	do_swap(base + inorder_to_eytzinger0(l, n) * size,
-		base + inorder_to_eytzinger0(r, n) * size,
+	do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
+		base1 + inorder_to_eytzinger1(r, n) * size,
		size, swap_func, priv);
}

-void eytzinger0_sort_r(void *base, size_t n, size_t size,
-		       cmp_r_func_t cmp_func,
-		       swap_r_func_t swap_func,
-		       const void *priv)
+static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
+			      cmp_r_func_t cmp_func,
+			      swap_r_func_t swap_func,
+			      const void *priv)
{
-	int i, j, k;
+	unsigned i, j, k;

	/* called from 'sort' without swap function, let's pick the default */
	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
		swap_func = NULL;

	if (!swap_func) {
-		if (is_aligned(base, size, 8))
+		if (is_aligned(base1, size, 8))
			swap_func = SWAP_WORDS_64;
-		else if (is_aligned(base, size, 4))
+		else if (is_aligned(base1, size, 4))
			swap_func = SWAP_WORDS_32;
		else
			swap_func = SWAP_BYTES;
	}

	/* heapify */
-	for (i = n / 2 - 1; i >= 0; --i) {
+	for (i = n / 2; i >= 1; --i) {
		/* Find the sift-down path all the way to the leaves. */
-		for (j = i; k = j * 2 + 1, k + 1 < n;)
-			j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+		for (j = i; k = j * 2, k < n;)
+			j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;

		/* Special case for the last leaf with no sibling. */
-		if (j * 2 + 2 == n)
-			j = j * 2 + 1;
+		if (j * 2 == n)
+			j *= 2;

		/* Backtrack to the correct location. */
-		while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
-			j = (j - 1) / 2;
+		while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
+			j /= 2;

		/* Shift the element into its correct place. */
		for (k = j; j != i;) {
-			j = (j - 1) / 2;
-			eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+			j /= 2;
+			eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
		}
	}

	/* sort */
-	for (i = n - 1; i > 0; --i) {
-		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+	for (i = n; i > 1; --i) {
+		eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);

		/* Find the sift-down path all the way to the leaves. */
-		for (j = 0; k = j * 2 + 1, k + 1 < i;)
-			j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+		for (j = 1; k = j * 2, k + 1 < i;)
+			j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;

		/* Special case for the last leaf with no sibling. */
-		if (j * 2 + 2 == i)
-			j = j * 2 + 1;
+		if (j * 2 + 1 == i)
+			j *= 2;

		/* Backtrack to the correct location. */
-		while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
-			j = (j - 1) / 2;
+		while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
+			j /= 2;

		/* Shift the element into its correct place. */
-		for (k = j; j;) {
-			j = (j - 1) / 2;
-			eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+		for (k = j; j > 1;) {
+			j /= 2;
+			eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
		}
	}
}

+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+		       cmp_r_func_t cmp_func,
+		       swap_r_func_t swap_func,
+		       const void *priv)
+{
+	void *base1 = base - size;
+
+	return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
+}
+
void eytzinger0_sort(void *base, size_t n, size_t size,
		     cmp_func_t cmp_func,
		     swap_func_t swap_func)
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 0541192d7bc0..643c1f716061 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -6,6 +6,7 @@
 #include <linux/log2.h>

 #ifdef EYTZINGER_DEBUG
+#include <linux/bug.h>
 #define EYTZINGER_BUG_ON(cond)		BUG_ON(cond)
 #else
 #define EYTZINGER_BUG_ON(cond)
@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size)
	return rounddown_pow_of_two(size + 1) - 1;
}

-/*
- * eytzinger1_next() and eytzinger1_prev() have the nice properties that
- *
- * eytzinger1_next(0) == eytzinger1_first())
- * eytzinger1_prev(0) == eytzinger1_last())
- *
- * eytzinger1_prev(eytzinger1_first()) == 0
- * eytzinger1_next(eytzinger1_last()) == 0
- */
-
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
-	EYTZINGER_BUG_ON(i > size);
+	EYTZINGER_BUG_ON(i == 0 || i > size);

	if (eytzinger1_right_child(i) <= size) {
		i = eytzinger1_right_child(i);

-		i <<= __fls(size + 1) - __fls(i);
+		i <<= __fls(size) - __fls(i);
		i >>= i > size;
	} else {
		i >>= ffz(i) + 1;
@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)

static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
-	EYTZINGER_BUG_ON(i > size);
+	EYTZINGER_BUG_ON(i == 0 || i > size);

	if (eytzinger1_left_child(i) <= size) {
		i = eytzinger1_left_child(i) + 1;

-		i <<= __fls(size + 1) - __fls(i);
+		i <<= __fls(size) - __fls(i);
		i -= 1;
		i >>= i > size;
	} else {
@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
	     (_i) != -1;				\
	     (_i) = eytzinger0_next((_i), (_size)))

+#define eytzinger0_for_each_prev(_i, _size)			\
+	for (unsigned (_i) = eytzinger0_last((_size));	\
+	     (_i) != -1;					\
+	     (_i) = eytzinger0_prev((_i), (_size)))
+
/* return greatest node <= @search, or -1 if not found */
static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
				     cmp_func_t cmp, const void *search)
{
-	unsigned i, n = 0;
-
-	if (!nr)
-		return -1;
-
-	do {
-		i = n;
-		n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
-	} while (n < nr);
-
-	if (n & 1) {
-		/*
-		 * @i was greater than @search, return previous node:
-		 *
-		 * if @i was leftmost/smallest element,
-		 * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
-		 */
-		return eytzinger0_prev(i, nr);
-	} else {
-		return i;
-	}
+	void *base1 = base - size;
+	unsigned n = 1;
+
+	while (n <= nr)
+		n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
+	n >>= __ffs(n) + 1;
+	return n - 1;
}

+/* return smallest node > @search, or -1 if not found */
static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
				     cmp_func_t cmp, const void *search)
{
-	ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+	void *base1 = base - size;
+	unsigned n = 1;

-	/*
-	 * if eytitzinger0_find_le() returned -1 - no element was <= search - we
-	 * want to return the first element; next/prev identities mean this work
-	 * as expected
-	 *
-	 * similarly if find_le() returns last element, we should return -1;
-	 * identities mean this all works out:
-	 */
-	return eytzinger0_next(idx, nr);
+	while (n <= nr)
+		n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
+	n >>= __ffs(n + 1) + 1;
+	return n - 1;
}

+/* return smallest node >= @search, or -1 if not found */
static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
				     cmp_func_t cmp, const void *search)
{
-	ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
-
-	if (idx < nr && !cmp(base + idx * size, search))
-		return idx;
+	void *base1 = base - size;
+	unsigned n = 1;

-	return eytzinger0_next(idx, nr);
+	while (n <= nr)
+		n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
+	n >>= __ffs(n + 1) + 1;
+	return n - 1;
}

#define eytzinger0_find(base, nr, size, _cmp, search)			\
({									\
-	void *_base		= (base);				\
+	size_t _size		= (size);				\
+	void *_base1		= (void *)(base) - _size;		\
	const void *_search	= (search);				\
	size_t _nr		= (nr);					\
-	size_t _size		= (size);				\
-	size_t _i		= 0;					\
+	size_t _i		= 1;					\
	int _res;							\
									\
-	while (_i < _nr &&						\
-	       (_res = _cmp(_search, _base + _i * _size)))		\
-		_i = eytzinger0_child(_i, _res > 0);			\
-	_i;								\
+	while (_i <= _nr &&						\
+	       (_res = _cmp(_search, _base1 + _i * _size)))		\
+		_i = eytzinger1_child(_i, _res > 0);			\
+	_i - 1;								\
})

void eytzinger0_sort_r(void *, size_t, size_t,
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index ab1d5db2fa56..a03e2c780cba 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans,
 		if (!get_more)
 			break;
 
+		unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
+
+		if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
+			break;
+
+		unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
+
+		/* ensure proper alignment */
+		order = min(order, __ffs(folio_offset|BIT(31)));
+
 		folio = xa_load(&iter->mapping->i_pages, folio_offset);
 		if (folio && !xa_is_value(folio))
 			break;
 
-		folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+		folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
 		if (!folio)
 			break;
 
@@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_buf sk;
-	int flags = BCH_READ_RETRY_IF_STALE|
-		BCH_READ_MAY_PROMOTE;
+	int flags = BCH_READ_retry_if_stale|
+		BCH_READ_may_promote;
 	int ret = 0;
 
-	rbio->c = c;
-	rbio->start_time = local_clock();
 	rbio->subvol = inum.subvol;
 
 	bch2_bkey_buf_init(&sk);
@@ -211,17 +219,17 @@ static void bchfs_read(struct btree_trans *trans,
 		swap(rbio->bio.bi_iter.bi_size, bytes);
 
 		if (rbio->bio.bi_iter.bi_size == bytes)
-			flags |= BCH_READ_LAST_FRAGMENT;
+			flags |= BCH_READ_last_fragment;
 
 		bch2_bio_page_state_set(&rbio->bio, k);
 
 		bch2_read_extent(trans, rbio, iter.pos,
				 data_btree, k, offset_into_extent, flags);
+		swap(rbio->bio.bi_iter.bi_size, bytes);
 
-		if (flags & BCH_READ_LAST_FRAGMENT)
+		if (flags & BCH_READ_last_fragment)
 			break;
 
-		swap(rbio->bio.bi_iter.bi_size, bytes);
 		bio_advance(&rbio->bio, bytes);
 err:
 		if (ret &&
@@ -232,7 +240,8 @@ static void bchfs_read(struct btree_trans *trans,
 
 	if (ret) {
 		struct printbuf buf = PRINTBUF;
-		bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
+		lockrestart_do(trans,
+			bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
 		prt_printf(&buf, "read error %i from btree lookup", ret);
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
@@ -280,12 +289,13 @@ void bch2_readahead(struct readahead_control *ractl)
 		struct bch_read_bio *rbio =
 			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
						   GFP_KERNEL, &c->bio_read),
-				  opts);
+				  c,
+				  opts,
+				  bch2_readpages_end_io);
 
 		readpage_iter_advance(&readpages_iter);
 
 		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-		rbio->bio.bi_end_io = bch2_readpages_end_io;
 		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 
 		bchfs_read(trans, rbio, inode_inum(inode),
@@ -323,10 +333,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 
 	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
-			 opts);
+			 c,
+			 opts,
+			 bch2_read_single_folio_end_io);
 	rbio->bio.bi_private = &done;
-	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
 	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
 	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
 	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
@@ -420,7 +430,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
 		}
 	}
 
-	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+	if (io->op.flags & BCH_WRITE_wrote_data_inline) {
 		bio_for_each_folio_all(fi, bio) {
 			struct bch_folio *s;
 
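Reviewer note, not part of the patch: the readpage_bio_extend hunk sizes large folios from the sectors left in the extent, clamped so the folio stays naturally aligned at its index. A userspace sketch of the same arithmetic (PAGE_SECTORS assumed 8, i.e. 4K pages over 512-byte sectors; __builtin_ctz stands in for __ffs):

#include <stdio.h>

#define PAGE_SECTORS 8

static unsigned ilog2_u(unsigned v) { return 31 - __builtin_clz(v); }
static unsigned rounddown_pow2(unsigned v) { return 1U << ilog2_u(v); }

static unsigned folio_order(unsigned sectors_remaining, unsigned folio_offset)
{
	unsigned order = ilog2_u(rounddown_pow2(sectors_remaining) / PAGE_SECTORS);
	/* a folio of order N must start at an index that's a multiple of 2^N */
	unsigned align = __builtin_ctz(folio_offset | (1U << 31));

	return order < align ? order : align;
}

int main(void)
{
	/* 100 sectors left => 12 pages => order 3; index 4 only allows order 2 */
	printf("%u\n", folio_order(100, 4));	/* prints 2 */
	return 0;
}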
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 2089c36b5866..535bc5fcbcc0 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 	struct blk_plug plug;
 	loff_t offset = req->ki_pos;
 	bool sync = is_sync_kiocb(req);
+	bool split = false;
 	size_t shorten;
 	ssize_t ret;
 
@@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
			       GFP_KERNEL,
			       &c->dio_read_bioset);
 
-	bio->bi_end_io = bch2_direct_IO_read_endio;
-
 	dio = container_of(bio, struct dio_read, rbio.bio);
 	closure_init(&dio->cl, NULL);
 
@@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 
 	goto start;
 	while (iter->count) {
+		split = true;
+
 		bio = bio_alloc_bioset(NULL,
				       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
				       REQ_OP_READ,
				       GFP_KERNEL,
				       &c->bio_read);
-		bio->bi_end_io = bch2_direct_IO_read_split_endio;
 start:
 		bio->bi_opf = REQ_OP_READ|REQ_SYNC;
 		bio->bi_iter.bi_sector = offset >> 9;
@@ -160,7 +160,15 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 		if (iter->count)
 			closure_get(&dio->cl);
 
-		bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+		struct bch_read_bio *rbio =
+			rbio_init(bio,
+				  c,
+				  opts,
+				  split
+				  ? bch2_direct_IO_read_split_endio
+				  : bch2_direct_IO_read_endio);
+
+		bch2_read(c, rbio, inode_inum(inode));
 	}
 
 	blk_finish_plug(&plug);
@@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
 	dio->op.devs_need_flush = &inode->ei_devs_need_flush;
 
 	if (sync)
-		dio->op.flags |= BCH_WRITE_SYNC;
-	dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+		dio->op.flags |= BCH_WRITE_sync;
+	dio->op.flags |= BCH_WRITE_check_enospc;
 
 	ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
					 bio_sectors(bio), true);
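Reviewer note, not part of the patch: the dio read hunk now picks the completion callback once at rbio_init() time instead of patching bio->bi_end_io afterwards. A minimal sketch of that pattern, with hypothetical types (struct bio only needs a forward declaration since we only store pointers):

struct bio;
typedef void (*endio_fn)(struct bio *);

struct rbio_cfg {
	struct bio	*bio;
	endio_fn	end_io;
};

static struct rbio_cfg rbio_setup(struct bio *bio, int split,
				  endio_fn whole, endio_fn part)
{
	return (struct rbio_cfg) {
		.bio	= bio,
		/* only an unsplit request completes the whole kiocb directly */
		.end_io	= split ? part : whole,
	};
}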
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 15725b4ce393..e3a3230fc652 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -5,8 +5,8 @@
 #include "chardev.h"
 #include "dirent.h"
 #include "fs.h"
-#include "fs-common.h"
 #include "fs-ioctl.h"
+#include "namei.h"
 #include "quota.h"
 
 #include <linux/compat.h>
@@ -54,6 +54,32 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
	    (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
 		return -EINVAL;
 
+	if ((newflags ^ oldflags) & BCH_INODE_casefolded) {
+#ifdef CONFIG_UNICODE
+		int ret = 0;
+		/* Not supported on individual files. */
+		if (!S_ISDIR(bi->bi_mode))
+			return -EOPNOTSUPP;
+
+		/*
+		 * Make sure the dir is empty, as otherwise we'd need to
+		 * rehash everything and update the dirent keys.
+		 */
+		ret = bch2_empty_dir_trans(trans, inode_inum(inode));
+		if (ret < 0)
+			return ret;
+
+		ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
+		if (ret)
+			return ret;
+
+		bch2_check_set_feature(c, BCH_FEATURE_casefolding);
+#else
+		printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
+		return -EOPNOTSUPP;
+#endif
+	}
+
 	if (s->set_projinherit) {
 		bi->bi_fields_set &= ~(1 << Inode_opt_project);
 		bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
@@ -218,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
 	int ret = 0;
 	subvol_inum inum;
 
-	kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+	kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
 	if (!kname)
 		return -ENOMEM;
 
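Reviewer note, not part of the patch: with this hook in place, casefolding is enabled from userspace the same way as on ext4/f2fs, by setting FS_CASEFOLD_FL on an empty directory. A small illustration using the standard attribute ioctls:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	int flags;

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags)) {
		perror("open/getflags");
		return 1;
	}

	flags |= FS_CASEFOLD_FL;

	/* per the hunk above: EOPNOTSUPP on files, an error on non-empty dirs */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags)) {
		perror("setflags");
		return 1;
	}

	close(fd);
	return 0;
}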
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
index d30f9bb056fd..ecd3bfdcde21 100644
--- a/fs/bcachefs/fs-ioctl.h
+++ b/fs/bcachefs/fs-ioctl.h
@@ -6,19 +6,21 @@
 
 /* bcachefs inode flags -> vfs inode flags: */
 static const __maybe_unused unsigned bch_flags_to_vfs[] = {
-	[__BCH_INODE_sync]	= S_SYNC,
-	[__BCH_INODE_immutable]	= S_IMMUTABLE,
-	[__BCH_INODE_append]	= S_APPEND,
-	[__BCH_INODE_noatime]	= S_NOATIME,
+	[__BCH_INODE_sync]		= S_SYNC,
+	[__BCH_INODE_immutable]		= S_IMMUTABLE,
+	[__BCH_INODE_append]		= S_APPEND,
+	[__BCH_INODE_noatime]		= S_NOATIME,
+	[__BCH_INODE_casefolded]	= S_CASEFOLD,
 };
 
 /* bcachefs inode flags -> FS_IOC_GETFLAGS: */
 static const __maybe_unused unsigned bch_flags_to_uflags[] = {
-	[__BCH_INODE_sync]	= FS_SYNC_FL,
-	[__BCH_INODE_immutable]	= FS_IMMUTABLE_FL,
-	[__BCH_INODE_append]	= FS_APPEND_FL,
-	[__BCH_INODE_nodump]	= FS_NODUMP_FL,
-	[__BCH_INODE_noatime]	= FS_NOATIME_FL,
+	[__BCH_INODE_sync]		= FS_SYNC_FL,
+	[__BCH_INODE_immutable]		= FS_IMMUTABLE_FL,
+	[__BCH_INODE_append]		= FS_APPEND_FL,
+	[__BCH_INODE_nodump]		= FS_NODUMP_FL,
+	[__BCH_INODE_noatime]		= FS_NOATIME_FL,
+	[__BCH_INODE_casefolded]	= FS_CASEFOLD_FL,
 };
 
 /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
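Reviewer note, not part of the patch: these designated-initializer tables map bcachefs flag bit numbers to VFS/uflag bits. A sketch of how such a table is typically consumed (hypothetical helper name):

static unsigned map_flags(const unsigned *table, unsigned n, unsigned bch_flags)
{
	unsigned bit, ret = 0;

	for (bit = 0; bit < n; bit++)
		if (bch_flags & (1U << bit))
			ret |= table[bit];	/* unmapped bits contribute 0 */
	return ret;
}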
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 90ade8f648d9..fbca200f7636 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -11,7 +11,6 @@
 #include "errcode.h"
 #include "extents.h"
 #include "fs.h"
-#include "fs-common.h"
 #include "fs-io.h"
 #include "fs-ioctl.h"
 #include "fs-io-buffered.h"
@@ -22,6 +21,7 @@
 #include "io_read.h"
 #include "journal.h"
 #include "keylist.h"
+#include "namei.h"
 #include "quota.h"
 #include "rebalance.h"
 #include "snapshot.h"
@@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+	ret = bch2_dirent_read_target(trans, dir, d, &inum);
 	if (ret > 0)
 		ret = -ENOENT;
 	if (ret)
@@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
 	if (inode)
 		goto out;
 
+	/*
+	 * Note: if check/repair needs it, we commit before
+	 * bch2_inode_hash_init_insert(), as after that point we can't take a
+	 * restart - not in the top level loop with a commit_do(), like we
+	 * usually do:
+	 */
+
 	struct bch_subvolume subvol;
 	struct bch_inode_unpacked inode_u;
 	ret =   bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
 		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+		bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
+		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
 		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
 
+	/*
+	 * don't remove it: check_inodes might find another inode that points
+	 * back to this dirent
+	 */
 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
				c, "dirent to missing inode:\n  %s",
-				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+				(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
 	if (ret)
 		goto err;
-
-	/* regular files may have hardlinks: */
-	if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
-				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
-				    c,
-				    "dirent points to inode that does not point back:\n  %s",
-				    (bch2_bkey_val_to_text(&buf, c, k),
-				     prt_printf(&buf, "\n  "),
-				     bch2_inode_unpacked_to_text(&buf, &inode_u),
-				     buf.buf))) {
-		ret = -ENOENT;
-		goto err;
-	}
 out:
 	bch2_trans_iter_exit(trans, &dirent_iter);
 	printbuf_exit(&buf);
@@ -698,6 +700,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		inode = NULL;
 
+#ifdef CONFIG_UNICODE
+	if (!inode && IS_CASEFOLDED(vdir)) {
+		/*
+		 * Do not cache a negative dentry in casefolded directories
+		 * as it would need to be invalidated in the following situation:
+		 * - Lookup file "blAH" in a casefolded directory
+		 * - Creation of file "BLAH" in a casefolded directory
+		 * - Lookup file "blAH" in a casefolded directory
+		 * which would fail if we had a negative dentry.
+		 *
+		 * We should come back to this when VFS has a method to handle
+		 * this edgecase.
+		 */
+		return NULL;
+	}
+#endif
+
 	return d_splice_alias(&inode->v, dentry);
 }
 
@@ -1802,7 +1821,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans,
 		break;
 	}
 
-	mapping_set_large_folios(inode->v.i_mapping);
+	mapping_set_folio_min_order(inode->v.i_mapping,
				    get_order(trans->c->opts.block_size));
 }
 
 static void bch2_free_inode(struct inode *vinode)
@@ -2008,44 +2028,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
 	return c ?: ERR_PTR(-ENOENT);
 }
 
-static int bch2_remount(struct super_block *sb, int *flags,
-			struct bch_opts opts)
-{
-	struct bch_fs *c = sb->s_fs_info;
-	int ret = 0;
-
-	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
-	if (opts.read_only != c->opts.read_only) {
-		down_write(&c->state_lock);
-
-		if (opts.read_only) {
-			bch2_fs_read_only(c);
-
-			sb->s_flags |= SB_RDONLY;
-		} else {
-			ret = bch2_fs_read_write(c);
-			if (ret) {
-				bch_err(c, "error going rw: %i", ret);
-				up_write(&c->state_lock);
-				ret = -EINVAL;
-				goto err;
-			}
-
-			sb->s_flags &= ~SB_RDONLY;
-		}
-
-		c->opts.read_only = opts.read_only;
-
-		up_write(&c->state_lock);
-	}
-
-	if (opt_defined(opts, errors))
-		c->opts.errors = opts.errors;
-err:
-	return bch2_err_class(ret);
-}
-
 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 {
 	struct bch_fs *c = root->d_sb->s_fs_info;
@@ -2192,6 +2174,9 @@ static int bch2_fs_get_tree(struct fs_context *fc)
 	if (ret)
 		goto err;
 
+	if (opt_defined(opts, discard))
+		set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
+
 	/* Some options can't be parsed until after the fs is started: */
 	opts = bch2_opts_empty();
 	ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
@@ -2200,9 +2185,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
 
 	bch2_opts_apply(&c->opts, opts);
 
-	ret = bch2_fs_start(c);
-	if (ret)
-		goto err_stop_fs;
+	/*
+	 * need to initialise sb and set c->vfs_sb _before_ starting fs,
+	 * for blk_holder_ops
+	 */
 
 	sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
 	ret = PTR_ERR_OR_ZERO(sb);
@@ -2264,6 +2250,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
 
 	sb->s_shrink->seeks = 0;
 
+	ret = bch2_fs_start(c);
+	if (ret)
+		goto err_put_super;
+
 	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
 	ret = PTR_ERR_OR_ZERO(vinode);
 	bch_err_msg(c, ret, "mounting: error getting root inode");
@@ -2351,8 +2341,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
 {
 	struct super_block *sb = fc->root->d_sb;
 	struct bch2_opts_parse *opts = fc->fs_private;
+	struct bch_fs *c = sb->s_fs_info;
+	int ret = 0;
+
+	opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
 
-	return bch2_remount(sb, &fc->sb_flags, opts->opts);
+	if (opts->opts.read_only != c->opts.read_only) {
+		down_write(&c->state_lock);
+
+		if (opts->opts.read_only) {
+			bch2_fs_read_only(c);
+
+			sb->s_flags |= SB_RDONLY;
+		} else {
+			ret = bch2_fs_read_write(c);
+			if (ret) {
+				bch_err(c, "error going rw: %i", ret);
+				up_write(&c->state_lock);
+				ret = -EINVAL;
+				goto err;
+			}
+
+			sb->s_flags &= ~SB_RDONLY;
+		}
+
+		c->opts.read_only = opts->opts.read_only;
+
+		up_write(&c->state_lock);
+	}
+
+	if (opt_defined(opts->opts, errors))
+		c->opts.errors = opts->opts.errors;
err:
+	return bch2_err_class(ret);
 }
 
 static const struct fs_context_operations bch2_context_ops = {
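Reviewer note, not part of the patch: folding bch2_remount() into bch2_fs_reconfigure() matches the new mount API, where remount is expressed as the .reconfigure hook of fs_context_operations. A self-contained sketch of that wiring (all example_* handlers are hypothetical stubs; the field names follow the upstream API):

#include <linux/fs_context.h>

static int example_parse_param(struct fs_context *fc, struct fs_parameter *p) { return 0; }
static int example_get_tree(struct fs_context *fc) { return 0; }
static void example_free(struct fs_context *fc) { }

static int example_reconfigure(struct fs_context *fc)
{
	/* fc->sb_flags carries SB_RDONLY etc.; fc->root->d_sb is the superblock */
	return 0;
}

static const struct fs_context_operations example_context_ops = {
	.parse_param	= example_parse_param,
	.get_tree	= example_get_tree,
	.reconfigure	= example_reconfigure,
	.free		= example_free,
};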
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 0e85131d0af8..091057023fc5 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -10,10 +10,10 @@
 #include "dirent.h"
 #include "error.h"
 #include "fs.h"
-#include "fs-common.h"
 #include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
+#include "namei.h"
 #include "recovery_passes.h"
 #include "snapshot.h"
 #include "super.h"
@@ -23,13 +23,6 @@
 #include <linux/bsearch.h>
 #include <linux/dcache.h> /* struct qstr */
 
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
-				   struct bkey_s_c_dirent d)
-{
-	return inode->bi_dir == d.k->p.inode &&
-		inode->bi_dir_offset == d.k->p.offset;
-}
-
 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
					 struct bch_inode_unpacked *inode)
 {
@@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
 	return ret;
 }
 
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
-			      struct bch_inode_unpacked *inode)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
-				     BTREE_ITER_all_snapshots, k, ret) {
-		if (k.k->p.offset != inode_nr)
-			break;
-		if (!bkey_is_inode(k.k))
-			continue;
-		ret = bch2_inode_unpack(k, inode);
-		goto found;
-	}
-	ret = -BCH_ERR_ENOENT_inode;
-found:
-	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
			struct bch_inode_unpacked *inode)
 {
@@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
 	return 0;
 }
 
-static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bch_inode_unpacked dir_inode;
-	struct bch_hash_info dir_hash_info;
-	int ret;
-
-	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
-	if (ret)
-		goto err;
-
-	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
-	ret =   bch2_btree_iter_traverse(&iter) ?:
-		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-				    &dir_hash_info, &iter,
-				    BTREE_UPDATE_internal_snapshot_node);
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
 /*
  * Find any subvolume associated with a tree of snapshots
  * We can't rely on master_subvol - it might have been deleted.
@@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans,
			     SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
 	int ret = bkey_err(d) ?:
 		dirent_points_to_inode(c, d, inode) ?:
-		__remove_dirent(trans, d.k->p);
+		bch2_fsck_remove_dirent(trans, d.k->p);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa
		trans_was_restarted(trans, restart_count);
 }
 
-noinline_for_stack
-static int check_dirent_inode_dirent(struct btree_trans *trans,
-				     struct btree_iter *iter,
-				     struct bkey_s_c_dirent d,
-				     struct bch_inode_unpacked *target)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-	struct btree_iter bp_iter = { NULL };
-	int ret = 0;
-
-	if (inode_points_to_dirent(target, d))
-		return 0;
-
-	if (!target->bi_dir &&
-	    !target->bi_dir_offset) {
-		fsck_err_on(S_ISDIR(target->bi_mode),
-			    trans, inode_dir_missing_backpointer,
-			    "directory with missing backpointer\n%s",
-			    (printbuf_reset(&buf),
-			     bch2_bkey_val_to_text(&buf, c, d.s_c),
-			     prt_printf(&buf, "\n"),
-			     bch2_inode_unpacked_to_text(&buf, target),
-			     buf.buf));
-
-		fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
-			    trans, inode_unlinked_but_has_dirent,
-			    "inode unlinked but has dirent\n%s",
-			    (printbuf_reset(&buf),
-			     bch2_bkey_val_to_text(&buf, c, d.s_c),
-			     prt_printf(&buf, "\n"),
-			     bch2_inode_unpacked_to_text(&buf, target),
-			     buf.buf));
-
-		target->bi_flags &= ~BCH_INODE_unlinked;
-		target->bi_dir		= d.k->p.inode;
-		target->bi_dir_offset	= d.k->p.offset;
-		return __bch2_fsck_write_inode(trans, target);
-	}
-
-	if (bch2_inode_should_have_single_bp(target) &&
-	    !fsck_err(trans, inode_wrong_backpointer,
-		      "dirent points to inode that does not point back:\n  %s",
-		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
-		       prt_printf(&buf, "\n  "),
-		       bch2_inode_unpacked_to_text(&buf, target),
-		       buf.buf)))
-		goto err;
-
-	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
-			      SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
-	ret = bkey_err(bp_dirent);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		goto err;
-
-	bool backpointer_exists = !ret;
-	ret = 0;
-
-	if (fsck_err_on(!backpointer_exists,
-			trans, inode_wrong_backpointer,
-			"inode %llu:%u has wrong backpointer:\n"
-			"got       %llu:%llu\n"
-			"should be %llu:%llu",
-			target->bi_inum, target->bi_snapshot,
-			target->bi_dir,
-			target->bi_dir_offset,
-			d.k->p.inode,
-			d.k->p.offset)) {
-		target->bi_dir		= d.k->p.inode;
-		target->bi_dir_offset	= d.k->p.offset;
-		ret = __bch2_fsck_write_inode(trans, target);
-		goto out;
-	}
-
-	bch2_bkey_val_to_text(&buf, c, d.s_c);
-	prt_newline(&buf);
-	if (backpointer_exists)
-		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
-	if (fsck_err_on(backpointer_exists &&
-			(S_ISDIR(target->bi_mode) ||
-			 target->bi_subvol),
-			trans, inode_dir_multiple_links,
-			"%s %llu:%u with multiple links\n%s",
-			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
-			target->bi_inum, target->bi_snapshot, buf.buf)) {
-		ret = __remove_dirent(trans, d.k->p);
-		goto out;
-	}
-
-	/*
-	 * hardlinked file with nlink 0:
-	 * We're just adjusting nlink here so check_nlinks() will pick
-	 * it up, it ignores inodes with nlink 0
-	 */
-	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
-			trans, inode_multiple_links_but_nlink_0,
-			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
-			target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
-		target->bi_nlink++;
-		target->bi_flags &= ~BCH_INODE_unlinked;
-		ret = __bch2_fsck_write_inode(trans, target);
-		if (ret)
-			goto err;
-	}
-out:
-err:
-fsck_err:
-	bch2_trans_iter_exit(trans, &bp_iter);
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-noinline_for_stack
-static int check_dirent_target(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       struct bkey_s_c_dirent d,
-			       struct bch_inode_unpacked *target)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i_dirent *n;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	ret = check_dirent_inode_dirent(trans, iter, d, target);
-	if (ret)
-		goto err;
-
-	if (fsck_err_on(d.v->d_type != inode_d_type(target),
-			trans, dirent_d_type_wrong,
-			"incorrect d_type: got %s, should be %s:\n%s",
-			bch2_d_type_str(d.v->d_type),
-			bch2_d_type_str(inode_d_type(target)),
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
-		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
-		ret = PTR_ERR_OR_ZERO(n);
-		if (ret)
-			goto err;
-
-		bkey_reassemble(&n->k_i, d.s_c);
-		n->v.d_type = inode_d_type(target);
-		if (n->v.d_type == DT_SUBVOL) {
-			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
-			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
-		} else {
-			n->v.d_inum = cpu_to_le64(target->bi_inum);
-		}
-
-		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
-		if (ret)
-			goto err;
-
-		d = dirent_i_to_s_c(n);
-	}
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
 /* find a subvolume that's a descendent of @snapshot: */
 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
 {
@@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 		if (fsck_err(trans, dirent_to_missing_subvol,
			     "dirent points to missing subvolume\n%s",
			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
-			return __remove_dirent(trans, d.k->p);
+			return bch2_fsck_remove_dirent(trans, d.k->p);
 		ret = 0;
 		goto out;
 	}
@@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 		goto err;
 	}
 
-	ret = check_dirent_target(trans, iter, d, &subvol_root);
+	ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
 	if (ret)
 		goto err;
 out:
@@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
-		ret = __remove_dirent(trans, d.k->p);
+		ret = bch2_fsck_remove_dirent(trans, d.k->p);
 		if (ret)
 			goto err;
 	}
 
 	darray_for_each(target->inodes, i) {
-		ret = check_dirent_target(trans, iter, d, &i->inode);
+		ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
 		if (ret)
 			goto err;
 	}
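Reviewer note, not part of the patch: the helpers removed here now live behind namei.h as shared bch2_check_dirent_target()/bch2_fsck_remove_dirent(). The invariant they enforce, restated as a minimal standalone sketch (hypothetical simplified types):

struct ino  { unsigned long long bi_dir, bi_dir_offset; };
struct dpos { unsigned long long inode, offset; };

/* an inode's backpointer must name the dirent that points at it */
static _Bool inode_points_to_dirent(const struct ino *i, const struct dpos *d)
{
	return i->bi_dir == d->inode && i->bi_dir_offset == d->offset;
}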
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 339b80770f1d..80051073f613 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans,
			bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
 	}
 
-	s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
-	if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
-		struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
-		int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+	s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
+	if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
+		int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
 		if (ret)
 			return ret;
 	}
@@ -868,19 +867,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
		      uid, gid, mode, rdev, parent);
 }
 
-static inline u32 bkey_generation(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_inode:
-	case KEY_TYPE_inode_v2:
-		BUG();
-	case KEY_TYPE_inode_generation:
-		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
-	default:
-		return 0;
-	}
-}
-
 static struct bkey_i_inode_alloc_cursor *
 bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
 {
@@ -1092,7 +1078,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum.inum, snapshot);
-		ret = -EIO;
+		ret = -BCH_ERR_ENOENT_inode;
 		goto err;
 	}
 
@@ -1256,7 +1242,7 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum, snapshot);
-		ret = -EIO;
+		ret = -BCH_ERR_ENOENT_inode;
 		goto err;
 	}
 
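Reviewer note, not part of the patch: the -EIO to -BCH_ERR_ENOENT_inode change follows the bcachefs pattern of keeping private error codes distinguishable internally and mapping them to a standard errno class at the boundary. A generic sketch of that idea (names hypothetical):

#include <errno.h>

enum {
	MYFS_ERR_START = 2048,		/* above any standard errno */
	MYFS_ERR_enoent_inode,		/* class: ENOENT */
	MYFS_ERR_enoent_dirent,		/* class: ENOENT */
};

static int err_class(int err)
{
	switch (-err) {
	case MYFS_ERR_enoent_inode:
	case MYFS_ERR_enoent_dirent:
		return -ENOENT;		/* what callers outside the fs see */
	default:
		return err;
	}
}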
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 428b9be6af34..f82cfbf460d0 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i
 	bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
 
 	return S_ISDIR(inode->bi_mode) ||
+		inode->bi_subvol ||
		(!inode->bi_nlink && inode_has_bp);
 }
 
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index b99a5bf1a75e..117110af1e3f 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -137,7 +137,8 @@ enum inode_opt_id {
	x(i_sectors_dirty,		6)	\
	x(unlinked,			7)	\
	x(backptr_untrusted,		8)	\
-	x(has_child_snapshot,		9)
+	x(has_child_snapshot,		9)	\
+	x(casefolded,			10)
 
 /* bits 20+ reserved for packed fields below: */
 
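Reviewer note, not part of the patch: the flag list above is an x-macro, so adding casefolded here generates both the bit number and the mask everywhere the list is expanded. A generic sketch of how such lists expand (names hypothetical, not the real bcachefs macros):

#define MYFS_INODE_FLAGS()	\
	x(sync,		0)	\
	x(unlinked,	7)	\
	x(casefolded,	10)

enum {	/* bit numbers: __MYFS_INODE_sync = 0, ... */
#define x(name, nr)	__MYFS_INODE_##name = nr,
	MYFS_INODE_FLAGS()
#undef x
};

enum {	/* masks: MYFS_INODE_casefolded = 1 << 10, ... */
#define x(name, nr)	MYFS_INODE_##name = 1U << nr,
	MYFS_INODE_FLAGS()
#undef x
};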
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 5353979117b0..6b842c8d21be 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -115,7 +115,8 @@ int bch2_extent_fallocate(struct btree_trans *trans,
 	bch2_increment_clock(c, sectors_allocated, WRITE);
 	if (should_print_err(ret)) {
 		struct printbuf buf = PRINTBUF;
-		bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
+		lockrestart_do(trans,
+			bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
 		prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
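Reviewer note, not part of the patch: several hunks in this series wrap error-message helpers in lockrestart_do(), because those helpers run btree lookups that can fail with a transaction-restart code. A simplified sketch of the idiom (hypothetical names; trans_begin() is an assumed helper that resets the transaction):

#define ERR_TRANSACTION_RESTART	(-2000)	/* stand-in private errcode */

#define lockrestart_do_sketch(trans, expr)		\
({							\
	int _ret;					\
	do {						\
		trans_begin(trans);			\
		_ret = (expr);				\
	} while (_ret == ERR_TRANSACTION_RESTART);	\
	_ret;						\
})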
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index aa91fcf51eec..a04dffa45f5f 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -25,8 +25,15 @@
 #include "subvolume.h"
 #include "trace.h"
 
+#include <linux/random.h>
 #include <linux/sched/mm.h>
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "");
+#endif
+
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -80,6 +87,7 @@ struct promote_op {
 	struct rhash_head	hash;
 	struct bpos		pos;
 
+	struct work_struct	work;
 	struct data_update	write;
 	struct bio_vec		bi_inline_vecs[]; /* must be last */
 };
@@ -96,6 +104,33 @@ static inline bool have_io_error(struct bch_io_failures *failed)
 	return failed && failed->nr;
 }
 
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
+{
+	EBUG_ON(rbio->split);
+
+	return rbio->data_update
+		? container_of(rbio, struct data_update, rbio)
+		: NULL;
+}
+
+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
+{
+	struct data_update *u = rbio_data_update(orig);
+	if (!u)
+		return false;
+
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
+	unsigned i = 0;
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == dev &&
+		    u->data_opts.rewrite_ptrs & BIT(i))
+			return true;
+		i++;
+	}
+
+	return false;
+}
+
 static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
@@ -105,7 +140,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
 	if (!have_io_error(failed)) {
 		BUG_ON(!opts.promote_target);
 
-		if (!(flags & BCH_READ_MAY_PROMOTE))
+		if (!(flags & BCH_READ_may_promote))
 			return -BCH_ERR_nopromote_may_not;
 
 		if (bch2_bkey_has_target(c, k, opts.promote_target))
@@ -125,98 +160,93 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
 	return 0;
 }
 
-static void promote_free(struct bch_fs *c, struct promote_op *op)
+static noinline void promote_free(struct bch_read_bio *rbio)
 {
-	int ret;
+	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
+	struct bch_fs *c = rbio->c;
+
+	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
+	BUG_ON(ret);
 
 	bch2_data_update_exit(&op->write);
 
-	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-				     bch_promote_params);
-	BUG_ON(ret);
 	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
 	kfree_rcu(op, rcu);
 }
 
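Reviewer note, not part of the patch: the new read_corrupt_ratio module parameter lets debug builds inject corruption into completed reads so the checksum-error, retry, and poisoning paths get exercised. A userspace-style sketch of what such a knob does (hypothetical helper; the kernel uses get_random bytes rather than rand()):

#include <stdlib.h>

static void maybe_corrupt_buf(unsigned char *buf, unsigned long len, unsigned ratio)
{
	if (!ratio || !len || (rand() % ratio))
		return;			/* corrupt roughly 1 in `ratio` reads */

	buf[rand() % len] ^= 1 << (rand() % 8);	/* single bit flip */
}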
 static void promote_done(struct bch_write_op *wop)
 {
-	struct promote_op *op =
-		container_of(wop, struct promote_op, write.op);
-	struct bch_fs *c = op->write.op.c;
+	struct promote_op *op = container_of(wop, struct promote_op, write.op);
+	struct bch_fs *c = op->write.rbio.c;
 
-	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
-			       op->start_time);
-	promote_free(c, op);
+	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
+	promote_free(&op->write.rbio);
 }
 
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+static void promote_start_work(struct work_struct *work)
 {
-	struct bio *bio = &op->write.op.wbio.bio;
+	struct promote_op *op = container_of(work, struct promote_op, work);
 
-	trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+	bch2_data_update_read_done(&op->write);
+}
 
-	/* we now own pages: */
-	BUG_ON(!rbio->bounce);
-	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+static noinline void promote_start(struct bch_read_bio *rbio)
+{
+	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
 
-	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
-	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
 
-	bch2_data_update_read_done(&op->write, rbio->pick.crc);
+	INIT_WORK(&op->work, promote_start_work);
+	queue_work(rbio->c->write_ref_wq, &op->work);
 }
 
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
-					  enum btree_id btree_id,
-					  struct bkey_s_c k,
-					  struct bpos pos,
-					  struct extent_ptr_decoded *pick,
-					  struct bch_io_opts opts,
-					  unsigned sectors,
-					  struct bch_read_bio **rbio,
-					  struct bch_io_failures *failed)
+static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
+					    enum btree_id btree_id,
+					    struct bkey_s_c k,
+					    struct bpos pos,
+					    struct extent_ptr_decoded *pick,
+					    unsigned sectors,
+					    struct bch_read_bio *orig,
+					    struct bch_io_failures *failed)
 {
 	struct bch_fs *c = trans->c;
-	struct promote_op *op = NULL;
-	struct bio *bio;
-	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
 	int ret;
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
-		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
+	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
 
-	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
-	if (!op) {
-		ret = -BCH_ERR_nopromote_enomem;
-		goto err;
-	}
+	if (!have_io_error(failed)) {
+		update_opts.target = orig->opts.promote_target;
+		update_opts.extra_replicas = 1;
+		update_opts.write_flags |= BCH_WRITE_cached;
+		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
+	} else {
+		update_opts.target = orig->opts.foreground_target;
 
-	op->start_time = local_clock();
-	op->pos = pos;
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		unsigned ptr_bit = 1;
+		bkey_for_each_ptr(ptrs, ptr) {
+			if (bch2_dev_io_failures(failed, ptr->dev) &&
+			    !ptr_being_rewritten(orig, ptr->dev))
+				update_opts.rewrite_ptrs |= ptr_bit;
+			ptr_bit <<= 1;
+		}
 
-	/*
-	 * We don't use the mempool here because extents that aren't
-	 * checksummed or compressed can be too big for the mempool:
-	 */
-	*rbio = kzalloc(sizeof(struct bch_read_bio) +
-			sizeof(struct bio_vec) * pages,
-			GFP_KERNEL);
-	if (!*rbio) {
-		ret = -BCH_ERR_nopromote_enomem;
-		goto err;
+		if (!update_opts.rewrite_ptrs)
+			return NULL;
 	}
 
-	rbio_init(&(*rbio)->bio, opts);
-	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
 
-	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
+	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op) {
 		ret = -BCH_ERR_nopromote_enomem;
-		goto err;
+		goto err_put;
 	}
 
-	(*rbio)->bounce		= true;
-	(*rbio)->split		= true;
-	(*rbio)->kmalloc	= true;
+	op->start_time = local_clock();
+	op->pos = pos;
 
 	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
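Reviewer note, not part of the patch: the reworked promote path repeatedly uses container_of() to walk from an embedded member (the rbio inside the data_update inside the promote_op) back to its enclosing struct. A standalone sketch of the mechanism:

#include <stddef.h>

#define container_of_sketch(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct outer {
	int id;
	struct { int x; } inner;	/* like promote_op::write.rbio */
};

static struct outer *outer_of(void *inner_ptr)
{
	/* subtracting the member offset recovers the enclosing object */
	return container_of_sketch(inner_ptr, struct outer, inner);
}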
@@ -224,68 +254,54 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 		goto err;
 	}
 
-	bio = &op->write.op.wbio.bio;
-	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
-	struct data_update_opts update_opts = {};
-
-	if (!have_io_error(failed)) {
-		update_opts.target = opts.promote_target;
-		update_opts.extra_replicas = 1;
-		update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
-	} else {
-		update_opts.target = opts.foreground_target;
-
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-		unsigned ptr_bit = 1;
-		bkey_for_each_ptr(ptrs, ptr) {
-			if (bch2_dev_io_failures(failed, ptr->dev))
-				update_opts.rewrite_ptrs |= ptr_bit;
-			ptr_bit <<= 1;
-		}
-	}
-
 	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
				    writepoint_hashed((unsigned long) current),
-				    opts,
+				    &orig->opts,
				    update_opts,
				    btree_id, k);
 	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
-	if (ret) {
-		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
-					      bch_promote_params));
-		goto err;
-	}
+	if (ret)
+		goto err_remove_hash;
 
+	rbio_init_fragment(&op->write.rbio.bio, orig);
+	op->write.rbio.bounce	= true;
+	op->write.rbio.promote	= true;
 	op->write.op.end_io = promote_done;
 
-	return op;
+	return &op->write.rbio;
+err_remove_hash:
+	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
 err:
-	if (*rbio)
-		bio_free_pages(&(*rbio)->bio);
-	kfree(*rbio);
-	*rbio = NULL;
+	bio_free_pages(&op->write.op.wbio.bio);
 	/* We may have added to the rhashtable and thus need rcu freeing: */
 	kfree_rcu(op, rcu);
+err_put:
 	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
 	return ERR_PTR(ret);
 }
 
 noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
+static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
-					struct bch_io_opts opts,
					unsigned flags,
-					struct bch_read_bio **rbio,
+					struct bch_read_bio *orig,
					bool *bounce,
					bool *read_full,
					struct bch_io_failures *failed)
 {
+	/*
+	 * We're in the retry path, but we don't know what to repair yet, and we
+	 * don't want to do a promote here:
+	 */
+	if (failed && !failed->nr)
+		return NULL;
+
 	struct bch_fs *c = trans->c;
 	/*
	 * if failed != NULL we're not actually doing a promote, we're
@@ -301,18 +317,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
 	struct bpos pos = promote_full
 		? bkey_start_pos(k.k)
 		: POS(k.k->p.inode, iter.bi_sector);
-	struct promote_op *promote;
 	int ret;
 
-	ret = should_promote(c, k, pos, opts, flags, failed);
+	ret = should_promote(c, k, pos, orig->opts, flags, failed);
 	if (ret)
 		goto nopromote;
 
-	promote = __promote_alloc(trans,
-				  k.k->type == KEY_TYPE_reflink_v
-				  ? BTREE_ID_reflink
-				  : BTREE_ID_extents,
-				  k, pos, pick, opts, sectors, rbio, failed);
+	struct bch_read_bio *promote =
+		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
+	if (!promote)
+		return NULL;
+
 	ret = PTR_ERR_OR_ZERO(promote);
 	if (ret)
 		goto nopromote;
@@ -321,7 +340,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
 	*read_full = promote_full;
 	return promote;
 nopromote:
-	trace_read_nopromote(c, ret);
+	trace_io_read_nopromote(c, ret);
 	return NULL;
 }
 
@@ -330,9 +349,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
 static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
 {
-	return bch2_inum_offset_err_msg_trans(trans, out,
-					      (subvol_inum) { rbio->subvol, read_pos.inode },
-					      read_pos.offset << 9);
+	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
+	if (ret)
+		return ret;
+
+	if (rbio->data_update)
+		prt_str(out, "(internal move) ");
+
+	return 0;
 }
 
 static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@@ -341,10 +368,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
 	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
 }
 
-#define READ_RETRY_AVOID	1
-#define READ_RETRY		2
-#define READ_ERR		3
-
 enum rbio_context {
 	RBIO_CONTEXT_NULL,
 	RBIO_CONTEXT_HIGHPRI,
@@ -375,20 +398,25 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 {
 	BUG_ON(rbio->bounce && !rbio->split);
 
-	if (rbio->promote)
-		promote_free(rbio->c, rbio->promote);
-	rbio->promote = NULL;
-
-	if (rbio->bounce)
-		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+	if (rbio->have_ioref) {
+		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
+		percpu_ref_put(&ca->io_ref);
+	}
 
 	if (rbio->split) {
 		struct bch_read_bio *parent = rbio->parent;
 
-		if (rbio->kmalloc)
-			kfree(rbio);
-		else
+		if (unlikely(rbio->promote)) {
+			if (!rbio->bio.bi_status)
+				promote_start(rbio);
+			else
+				promote_free(rbio);
+		} else {
+			if (rbio->bounce)
+				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
			bio_put(&rbio->bio);
+		}
 
		rbio = parent;
 	}
@@ -408,61 +436,115 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
	bio_endio(&rbio->bio);
 }
 
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-				     struct bvec_iter bvec_iter,
-				     struct bch_io_failures *failed,
-				     unsigned flags)
+static void get_rbio_extent(struct btree_trans *trans,
+			    struct bch_read_bio *rbio,
+			    struct bkey_buf *sk)
 {
-	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
-	struct bkey_buf sk;
 	struct bkey_s_c k;
-	int ret;
+	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				rbio->data_btree, rbio->data_pos, 0)));
+	if (ret)
+		return;
+
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	bkey_for_each_ptr(ptrs, ptr)
+		if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
+			bch2_bkey_buf_reassemble(sk, trans->c, k);
+			break;
+		}
 
-	flags &= ~BCH_READ_LAST_FRAGMENT;
-	flags |= BCH_READ_MUST_CLONE;
+	bch2_trans_iter_exit(trans, &iter);
+}
 
-	bch2_bkey_buf_init(&sk);
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
					enum btree_id btree, struct bkey_s_c read_k)
+{
+	struct bch_fs *c = trans->c;
+
+	struct data_update *u = rbio_data_update(rbio);
+	if (u)
+		read_k = bkey_i_to_s_c(u->k.k);
+
+	u64 flags = bch2_bkey_extent_flags(read_k);
+	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+		return 0;
+
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
					       BTREE_ITER_intent);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (!bkey_and_val_eq(k, read_k))
+		goto out;
+
+	struct bkey_i *new = bch2_trans_kmalloc(trans,
					bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
+	ret =   PTR_ERR_OR_ZERO(new) ?:
		(bkey_reassemble(new, k), 0) ?:
		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
		bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
+
+	/*
	 * Propagate key change back to data update path, in particular so it
	 * knows the extent has been poisoned and it's safe to change the
	 * checksum
	 */
+	if (u && !ret)
+		bch2_bkey_buf_copy(&u->k, c, new);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
 
-	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
-			     rbio->read_pos, BTREE_ITER_slots);
+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
+{
+	struct data_update *u = container_of(rbio, struct data_update, rbio);
 retry:
 	bch2_trans_begin(trans);
-	rbio->bio.bi_status = 0;
 
-	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
 	if (ret)
 		goto err;
 
-	bch2_bkey_buf_reassemble(&sk, c, k);
-	k = bkey_i_to_s_c(sk.k);
-
-	if (!bch2_bkey_matches_ptr(c, k,
-				   rbio->pick.ptr,
-				   rbio->data_pos.offset -
-				   rbio->pick.crc.offset)) {
+	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
 		/* extent we wanted to read no longer exists: */
-		rbio->hole = true;
-		goto out;
+		rbio->ret = -BCH_ERR_data_read_key_overwritten;
+		goto err;
 	}
 
 	ret = __bch2_read_extent(trans, rbio, bvec_iter,
-				 rbio->read_pos,
-				 rbio->data_btree,
-				 k, 0, failed, flags);
-	if (ret == READ_RETRY)
-		goto retry;
-	if (ret)
-		goto err;
-out:
-	bch2_rbio_done(rbio);
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
-	bch2_bkey_buf_exit(&sk, c);
-	return;
+				 bkey_start_pos(&u->k.k->k),
+				 u->btree_id,
+				 bkey_i_to_s_c(u->k.k),
+				 0, failed, flags, -1);
 err:
-	rbio->bio.bi_status = BLK_STS_IOERR;
-	goto out;
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+	    bch2_err_matches(ret, BCH_ERR_data_read_retry))
+		goto retry;
+
+	if (ret) {
+		rbio->bio.bi_status = BLK_STS_IOERR;
+		rbio->ret = ret;
+	}
+
+	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
+	return ret;
 }
 
 static void bch2_rbio_retry(struct work_struct *work)
@@ -478,44 +560,88 @@ static void bch2_rbio_retry(struct work_struct *work)
 	};
 	struct bch_io_failures failed = { .nr = 0 };
 
-	trace_and_count(c, read_retry, &rbio->bio);
+	struct btree_trans *trans = bch2_trans_get(c);
+
+	struct bkey_buf sk;
+	bch2_bkey_buf_init(&sk);
+	bkey_init(&sk.k->k);
 
-	if (rbio->retry == READ_RETRY_AVOID)
-		bch2_mark_io_failure(&failed, &rbio->pick);
+	trace_io_read_retry(&rbio->bio);
+	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));
 
-	rbio->bio.bi_status = 0;
+	get_rbio_extent(trans, rbio, &sk);
+
+	if (!bkey_deleted(&sk.k->k) &&
+	    bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);
+
+	if (!rbio->split) {
+		rbio->bio.bi_status = 0;
+		rbio->ret = 0;
+	}
+
+	unsigned subvol = rbio->subvol;
+	struct bpos read_pos = rbio->read_pos;
 
 	rbio = bch2_rbio_free(rbio);
 
-	flags |= BCH_READ_IN_RETRY;
-	flags &= ~BCH_READ_MAY_PROMOTE;
+	flags |= BCH_READ_in_retry;
+	flags &= ~BCH_READ_may_promote;
+	flags &= ~BCH_READ_last_fragment;
+	flags |= BCH_READ_must_clone;
 
-	if (flags & BCH_READ_NODECODE) {
-		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
+
+	if (ret) {
+		rbio->ret = ret;
+		rbio->bio.bi_status = BLK_STS_IOERR;
 	} else {
-		flags &= ~BCH_READ_LAST_FRAGMENT;
-		flags |= BCH_READ_MUST_CLONE;
+		struct printbuf buf = PRINTBUF;
 
-		__bch2_read(c, rbio, iter, inum, &failed, flags);
+		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
+		if (rbio->data_update)
+			prt_str(&buf, "(internal move) ");
+		prt_str(&buf, "successful retry");
+
+		bch_err_ratelimited(c, "%s", buf.buf);
+		printbuf_exit(&buf);
 	}
+
+	bch2_rbio_done(rbio);
+	bch2_bkey_buf_exit(&sk, c);
+	bch2_trans_put(trans);
 }
 
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
-			    blk_status_t error)
+static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
 {
-	rbio->retry = retry;
+	BUG_ON(ret >= 0);
+
+	rbio->ret		= ret;
+	rbio->bio.bi_status	= blk_error;
 
-	if (rbio->flags & BCH_READ_IN_RETRY)
+	bch2_rbio_parent(rbio)->saw_error = true;
+
+	if (rbio->flags & BCH_READ_in_retry)
		return;
 
-	if (retry == READ_ERR) {
+	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
+		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+	} else {
		rbio = bch2_rbio_free(rbio);
 
-		rbio->bio.bi_status = error;
+		rbio->ret		= ret;
+		rbio->bio.bi_status	= blk_error;
+
		bch2_rbio_done(rbio);
-	} else {
-		bch2_rbio_punt(rbio, bch2_rbio_retry,
-			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
 }
 
@@ -531,15 +657,13 @@ static void bch2_read_io_err(struct work_struct *work)
	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
 
-	if (ca) {
-		bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
-	} else {
+	else
		bch_err_ratelimited(c, "%s", buf.buf);
-	}
 
	printbuf_exit(&buf);
-	bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
 }
 
 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -621,14 +745,12 @@ static void bch2_read_csum_err(struct work_struct *work)
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
 
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-	if (ca) {
-		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
-	} else {
+	else
		bch_err_ratelimited(c, "%s", buf.buf);
-	}
 
-	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
 }
 
@@ -648,7 +770,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
	else
		bch_err_ratelimited(c, "%s", buf.buf);
 
-	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
 }
 
@@ -668,7 +790,7 @@ static void bch2_read_decrypt_err(struct work_struct *work)
	else
		bch_err_ratelimited(c, "%s", buf.buf);
 
-	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
 }
 
@@ -678,9 +800,11 @@ static void __bch2_read_endio(struct work_struct *work)
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
-	struct bio *src = &rbio->bio;
-	struct bio *dst = &bch2_rbio_parent(rbio)->bio;
-	struct bvec_iter dst_iter = rbio->bvec_iter;
+	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+	struct bch_read_bio *parent = bch2_rbio_parent(rbio);
+	struct bio *src = &rbio->bio;
+	struct bio *dst = &parent->bio;
+	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
@@ -698,8 +822,26 @@ static void __bch2_read_endio(struct work_struct *work)
		src->bi_iter = rbio->bvec_iter;
	}
 
+	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
+
	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
+
+	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
+	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
+		rbio->flags |= BCH_READ_must_bounce;
+		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
+		goto out;
+	}
+
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+	if (!csum_good)
		goto csum_err;
 
	/*
@@ -712,32 +854,40 @@ static void __bch2_read_endio(struct work_struct *work)
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);
 
-	if (rbio->flags & BCH_READ_NODECODE)
-		goto nodecode;
+	if (likely(!parent->data_update)) {
+		/* Adjust crc to point to subset of data we want: */
+		crc.offset += rbio->offset_into_extent;
+		crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
 
-	/* Adjust crc to point to subset of data we want: */
-	crc.offset += rbio->offset_into_extent;
-	crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+		if (crc_is_compressed(crc)) {
+			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+			if (ret)
+				goto decrypt_err;
 
-	if (crc_is_compressed(crc)) {
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
+			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
+				goto decompression_err;
+		} else {
+			/* don't need to decrypt the entire bio: */
+			nonce = nonce_add(nonce, crc.offset << 9);
+			bio_advance(src, crc.offset << 9);
 
-		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-		    !c->opts.no_data_io)
-			goto decompression_err;
-	} else {
-		/* don't need to decrypt the entire bio: */
-		nonce = nonce_add(nonce, crc.offset << 9);
-		bio_advance(src, crc.offset << 9);
+			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+			src->bi_iter.bi_size = dst_iter.bi_size;
 
-		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-		src->bi_iter.bi_size = dst_iter.bi_size;
+			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+			if (ret)
+				goto decrypt_err;
 
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
+			if (rbio->bounce) {
+				struct bvec_iter src_iter = src->bi_iter;
+
+				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+			}
+		}
+	} else {
+		if (rbio->split)
+			rbio->parent->pick = rbio->pick;
 
		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;
@@ -754,12 +904,9 @@ static void __bch2_read_endio(struct work_struct *work)
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
-
-		promote_start(rbio->promote, rbio);
-		rbio->promote = NULL;
	}
-nodecode:
-	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+
+	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
@@ -767,17 +914,6 @@ static void __bch2_read_endio(struct work_struct *work)
	memalloc_nofs_restore(nofs_flags);
	return;
 csum_err:
-	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
-	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
-		rbio->flags |= BCH_READ_MUST_BOUNCE;
-		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-		goto out;
-	}
-
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
 decompression_err:
@@ -797,10 +933,8 @@ static void bch2_read_endio(struct bio *bio)
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;
 
-	if (rbio->have_ioref) {
-		bch2_latency_acct(ca, rbio->submit_time, READ);
-		percpu_ref_put(&ca->io_ref);
-	}
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);
 
	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;
@@ -810,14 +944,14 @@ static void bch2_read_endio(struct bio *bio)
		return;
	}
 
-	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
-		trace_and_count(c, read_reuse_race, &rbio->bio);
+		trace_and_count(c, io_read_reuse_race, &rbio->bio);
 
-		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+		if (rbio->flags & BCH_READ_retry_if_stale)
+			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
-			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}
 
@@ -883,15 +1017,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
-		       struct bch_io_failures *failed, unsigned flags)
+		       struct bch_io_failures *failed, unsigned flags, int dev)
 {
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
-	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
-	int pick_ret;
+	struct data_update *u = rbio_data_update(orig);
+	int ret = 0;
 
	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
@@ -902,19 +1036,35 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
+		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
+
+	if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
+	    !orig->data_update)
+		return -BCH_ERR_extent_poisened;
 retry_pick:
-	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
 
	/* hole or reservation - just zero fill: */
-	if (!pick_ret)
+	if (!ret)
		goto hole;
 
-	if (unlikely(pick_ret < 0)) {
+	if (unlikely(ret < 0)) {
+		if (ret == -BCH_ERR_data_read_csum_err) {
+			int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
+			if (ret2) {
+				ret = ret2;
+				goto err;
+			}
+
+			trace_and_count(c, io_read_fail_and_poison, &orig->bio);
+		}
+
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
-		prt_printf(&buf, "no device to read from: %s\n  ", bch2_err_str(pick_ret));
+		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);
 
		bch_err_ratelimited(c, "%s", buf.buf);
 
@@ -930,6 +1080,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
+ ret = -BCH_ERR_data_read_no_encryption_key;
|
|
goto err;
|
|
}
|
|
|
|
@@ -941,56 +1092,57 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
* retry path, don't check here, it'll be caught in bch2_read_endio()
|
|
* and we'll end up in the retry path:
|
|
*/
|
|
- if ((flags & BCH_READ_IN_RETRY) &&
|
|
+ if ((flags & BCH_READ_in_retry) &&
|
|
!pick.ptr.cached &&
|
|
ca &&
|
|
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
|
|
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
|
|
- bch2_mark_io_failure(failed, &pick);
|
|
+ bch2_mark_io_failure(failed, &pick, false);
|
|
percpu_ref_put(&ca->io_ref);
|
|
goto retry_pick;
|
|
}
|
|
|
|
- if (flags & BCH_READ_NODECODE) {
|
|
+ if (likely(!u)) {
|
|
+ if (!(flags & BCH_READ_last_fragment) ||
|
|
+ bio_flagged(&orig->bio, BIO_CHAIN))
|
|
+ flags |= BCH_READ_must_clone;
|
|
+
|
|
+ narrow_crcs = !(flags & BCH_READ_in_retry) &&
|
|
+ bch2_can_narrow_extent_crcs(k, pick.crc);
|
|
+
|
|
+ if (narrow_crcs && (flags & BCH_READ_user_mapped))
|
|
+ flags |= BCH_READ_must_bounce;
|
|
+
|
|
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
|
|
+
|
|
+ if (crc_is_compressed(pick.crc) ||
|
|
+ (pick.crc.csum_type != BCH_CSUM_none &&
|
|
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
|
|
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
|
|
+ (flags & BCH_READ_user_mapped)) ||
|
|
+ (flags & BCH_READ_must_bounce)))) {
|
|
+ read_full = true;
|
|
+ bounce = true;
|
|
+ }
|
|
+ } else {
|
|
/*
|
|
* can happen if we retry, and the extent we were going to read
|
|
* has been merged in the meantime:
|
|
*/
|
|
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
|
|
+ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
|
|
if (ca)
|
|
percpu_ref_put(&ca->io_ref);
|
|
- goto hole;
|
|
+ rbio->ret = -BCH_ERR_data_read_buffer_too_small;
|
|
+ goto out_read_done;
|
|
}
|
|
|
|
iter.bi_size = pick.crc.compressed_size << 9;
|
|
- goto get_bio;
|
|
- }
|
|
-
|
|
- if (!(flags & BCH_READ_LAST_FRAGMENT) ||
|
|
- bio_flagged(&orig->bio, BIO_CHAIN))
|
|
- flags |= BCH_READ_MUST_CLONE;
|
|
-
|
|
- narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
|
|
- bch2_can_narrow_extent_crcs(k, pick.crc);
|
|
-
|
|
- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
|
|
- flags |= BCH_READ_MUST_BOUNCE;
|
|
-
|
|
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
|
|
-
|
|
- if (crc_is_compressed(pick.crc) ||
|
|
- (pick.crc.csum_type != BCH_CSUM_none &&
|
|
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
|
|
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
|
|
- (flags & BCH_READ_USER_MAPPED)) ||
|
|
- (flags & BCH_READ_MUST_BOUNCE)))) {
|
|
read_full = true;
|
|
- bounce = true;
|
|
}
|
|
|
|
if (orig->opts.promote_target || have_io_error(failed))
|
|
- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
|
|
- &rbio, &bounce, &read_full, failed);
|
|
+ rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
|
|
+ &bounce, &read_full, failed);
|
|
|
|
if (!read_full) {
|
|
EBUG_ON(crc_is_compressed(pick.crc));
|
|
@@ -1009,7 +1161,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
pick.crc.offset = 0;
|
|
pick.crc.live_size = bvec_iter_sectors(iter);
|
|
}
|
|
-get_bio:
|
|
+
|
|
if (rbio) {
|
|
/*
|
|
* promote already allocated bounce rbio:
|
|
@@ -1024,17 +1176,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
} else if (bounce) {
|
|
unsigned sectors = pick.crc.compressed_size;
|
|
|
|
- rbio = rbio_init(bio_alloc_bioset(NULL,
|
|
+ rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
|
|
DIV_ROUND_UP(sectors, PAGE_SECTORS),
|
|
0,
|
|
GFP_NOFS,
|
|
&c->bio_read_split),
|
|
- orig->opts);
|
|
+ orig);
|
|
|
|
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
|
|
rbio->bounce = true;
|
|
- rbio->split = true;
|
|
- } else if (flags & BCH_READ_MUST_CLONE) {
|
|
+ } else if (flags & BCH_READ_must_clone) {
|
|
/*
|
|
* Have to clone if there were any splits, due to error
|
|
* reporting issues (if a split errored, and retrying didn't
|
|
@@ -1043,11 +1194,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
* from the whole bio, in which case we don't want to retry and
|
|
* lose the error)
|
|
*/
|
|
- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
|
|
+ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
|
|
&c->bio_read_split),
|
|
- orig->opts);
|
|
+ orig);
|
|
rbio->bio.bi_iter = iter;
|
|
- rbio->split = true;
|
|
} else {
|
|
rbio = orig;
|
|
rbio->bio.bi_iter = iter;
|
|
@@ -1056,67 +1206,64 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
|
|
EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
|
|
|
|
- rbio->c = c;
|
|
rbio->submit_time = local_clock();
|
|
- if (rbio->split)
|
|
- rbio->parent = orig;
|
|
- else
|
|
+ if (!rbio->split)
|
|
rbio->end_io = orig->bio.bi_end_io;
|
|
rbio->bvec_iter = iter;
|
|
rbio->offset_into_extent= offset_into_extent;
|
|
rbio->flags = flags;
|
|
rbio->have_ioref = ca != NULL;
|
|
rbio->narrow_crcs = narrow_crcs;
|
|
- rbio->hole = 0;
|
|
- rbio->retry = 0;
|
|
+ rbio->ret = 0;
|
|
rbio->context = 0;
|
|
- /* XXX: only initialize this if needed */
|
|
- rbio->devs_have = bch2_bkey_devs(k);
|
|
rbio->pick = pick;
|
|
rbio->subvol = orig->subvol;
|
|
rbio->read_pos = read_pos;
|
|
rbio->data_btree = data_btree;
|
|
rbio->data_pos = data_pos;
|
|
rbio->version = k.k->bversion;
|
|
- rbio->promote = promote;
|
|
INIT_WORK(&rbio->work, NULL);
|
|
|
|
- if (flags & BCH_READ_NODECODE)
|
|
- orig->pick = pick;
|
|
-
|
|
rbio->bio.bi_opf = orig->bio.bi_opf;
|
|
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
|
|
rbio->bio.bi_end_io = bch2_read_endio;
|
|
|
|
+ /* XXX: also nvme read recovery level */
|
|
+ if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
|
|
+ rbio->bio.bi_opf |= REQ_FUA;
|
|
+
|
|
if (rbio->bounce)
|
|
- trace_and_count(c, read_bounce, &rbio->bio);
|
|
+ trace_and_count(c, io_read_bounce, &rbio->bio);
|
|
|
|
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
|
|
+ if (!u)
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
|
|
+ else
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
|
|
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
|
|
|
|
/*
|
|
* If it's being moved internally, we don't want to flag it as a cache
|
|
* hit:
|
|
*/
|
|
- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
|
|
+ if (ca && pick.ptr.cached && !u)
|
|
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
|
|
PTR_BUCKET_NR(ca, &pick.ptr), READ);
|
|
|
|
- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
|
|
+ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
|
|
bio_inc_remaining(&orig->bio);
|
|
- trace_and_count(c, read_split, &orig->bio);
|
|
+ trace_and_count(c, io_read_split, &orig->bio);
|
|
}
|
|
|
|
/*
|
|
* Unlock the iterator while the btree node's lock is still in
|
|
* cache, before doing the IO:
|
|
*/
|
|
- if (!(flags & BCH_READ_IN_RETRY))
|
|
+ if (!(flags & BCH_READ_in_retry))
|
|
bch2_trans_unlock(trans);
|
|
else
|
|
bch2_trans_unlock_long(trans);
|
|
|
|
- if (!rbio->pick.idx) {
|
|
+ if (likely(!rbio->pick.do_ec_reconstruct)) {
|
|
if (unlikely(!rbio->have_ioref)) {
|
|
struct printbuf buf = PRINTBUF;
|
|
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
|
|
@@ -1126,7 +1273,9 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
|
|
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ bch2_rbio_error(rbio,
|
|
+ -BCH_ERR_data_read_retry_device_offline,
|
|
+ BLK_STS_IOERR);
|
|
goto out;
|
|
}
|
|
|
|
@@ -1135,10 +1284,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
|
|
|
|
if (unlikely(c->opts.no_data_io)) {
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ if (likely(!(flags & BCH_READ_in_retry)))
|
|
bio_endio(&rbio->bio);
|
|
} else {
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ if (likely(!(flags & BCH_READ_in_retry)))
|
|
submit_bio(&rbio->bio);
|
|
else
|
|
submit_bio_wait(&rbio->bio);
|
|
@@ -1152,15 +1301,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
} else {
|
|
/* Attempting reconstruct read: */
|
|
if (bch2_ec_read_extent(trans, rbio, k)) {
|
|
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
|
|
+ BLK_STS_IOERR);
|
|
goto out;
|
|
}
|
|
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ if (likely(!(flags & BCH_READ_in_retry)))
|
|
bio_endio(&rbio->bio);
|
|
}
|
|
out:
|
|
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
|
|
+ if (likely(!(flags & BCH_READ_in_retry))) {
|
|
return 0;
|
|
} else {
|
|
bch2_trans_unlock(trans);
|
|
@@ -1170,54 +1320,57 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
rbio->context = RBIO_CONTEXT_UNBOUND;
|
|
bch2_read_endio(&rbio->bio);
|
|
|
|
- ret = rbio->retry;
|
|
+ ret = rbio->ret;
|
|
rbio = bch2_rbio_free(rbio);
|
|
|
|
- if (ret == READ_RETRY_AVOID) {
|
|
- bch2_mark_io_failure(failed, &pick);
|
|
- ret = READ_RETRY;
|
|
- }
|
|
-
|
|
- if (!ret)
|
|
- goto out_read_done;
|
|
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
|
|
+ bch2_mark_io_failure(failed, &pick,
|
|
+ ret == -BCH_ERR_data_read_retry_csum_err);
|
|
|
|
return ret;
|
|
}
|
|
|
|
err:
|
|
- if (flags & BCH_READ_IN_RETRY)
|
|
- return READ_ERR;
|
|
+ if (flags & BCH_READ_in_retry)
|
|
+ return ret;
|
|
|
|
- orig->bio.bi_status = BLK_STS_IOERR;
|
|
+ orig->bio.bi_status = BLK_STS_IOERR;
|
|
+ orig->ret = ret;
|
|
goto out_read_done;
|
|
|
|
hole:
|
|
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
|
|
+ bvec_iter_sectors(iter));
|
|
/*
|
|
- * won't normally happen in the BCH_READ_NODECODE
|
|
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
|
|
- * to read no longer exists we have to signal that:
|
|
+ * won't normally happen in the data update (bch2_move_extent()) path,
|
|
+ * but if we retry and the extent we wanted to read no longer exists we
|
|
+ * have to signal that:
|
|
*/
|
|
- if (flags & BCH_READ_NODECODE)
|
|
- orig->hole = true;
|
|
+ if (u)
|
|
+ orig->ret = -BCH_ERR_data_read_key_overwritten;
|
|
|
|
zero_fill_bio_iter(&orig->bio, iter);
|
|
out_read_done:
|
|
- if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ if ((flags & BCH_READ_last_fragment) &&
|
|
+ !(flags & BCH_READ_in_retry))
|
|
bch2_rbio_done(orig);
|
|
return 0;
|
|
}
|
|
|
|
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
- struct bvec_iter bvec_iter, subvol_inum inum,
|
|
- struct bch_io_failures *failed, unsigned flags)
|
|
+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
|
|
+ struct bvec_iter bvec_iter, subvol_inum inum,
|
|
+ struct bch_io_failures *failed,
|
|
+ struct bkey_buf *prev_read,
|
|
+ unsigned flags)
|
|
{
|
|
- struct btree_trans *trans = bch2_trans_get(c);
|
|
+ struct bch_fs *c = trans->c;
|
|
struct btree_iter iter;
|
|
struct bkey_buf sk;
|
|
struct bkey_s_c k;
|
|
+ enum btree_id data_btree;
|
|
int ret;
|
|
|
|
- BUG_ON(flags & BCH_READ_NODECODE);
|
|
+ EBUG_ON(rbio->data_update);
|
|
|
|
bch2_bkey_buf_init(&sk);
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
|
|
@@ -1225,7 +1378,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
BTREE_ITER_slots);
|
|
|
|
while (1) {
|
|
- enum btree_id data_btree = BTREE_ID_extents;
|
|
+ data_btree = BTREE_ID_extents;
|
|
|
|
bch2_trans_begin(trans);
|
|
|
|
@@ -1257,6 +1410,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
|
|
k = bkey_i_to_s_c(sk.k);
|
|
|
|
+ if (unlikely(flags & BCH_READ_in_retry)) {
|
|
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
|
|
+ failed->nr = 0;
|
|
+ bch2_bkey_buf_copy(prev_read, c, sk.k);
|
|
+ }
|
|
+
|
|
/*
|
|
* With indirect extents, the amount of data to read is the min
|
|
* of the original extent and the indirect extent:
|
|
@@ -1267,42 +1426,49 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
swap(bvec_iter.bi_size, bytes);
|
|
|
|
if (bvec_iter.bi_size == bytes)
|
|
- flags |= BCH_READ_LAST_FRAGMENT;
|
|
+ flags |= BCH_READ_last_fragment;
|
|
|
|
ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
|
|
data_btree, k,
|
|
- offset_into_extent, failed, flags);
|
|
+ offset_into_extent, failed, flags, -1);
|
|
+ swap(bvec_iter.bi_size, bytes);
|
|
+
|
|
if (ret)
|
|
goto err;
|
|
|
|
- if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ if (flags & BCH_READ_last_fragment)
|
|
break;
|
|
|
|
- swap(bvec_iter.bi_size, bytes);
|
|
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
|
|
err:
|
|
+ if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
|
|
+ flags |= BCH_READ_must_bounce;
|
|
+
|
|
if (ret &&
|
|
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
|
|
- ret != READ_RETRY &&
|
|
- ret != READ_RETRY_AVOID)
|
|
+ !bch2_err_matches(ret, BCH_ERR_data_read_retry))
|
|
break;
|
|
}
|
|
|
|
- bch2_trans_iter_exit(trans, &iter);
|
|
-
|
|
- if (ret) {
|
|
+ if (unlikely(ret)) {
|
|
struct printbuf buf = PRINTBUF;
|
|
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
|
|
- prt_printf(&buf, "read error %i from btree lookup", ret);
|
|
+ lockrestart_do(trans,
|
|
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum,
|
|
+ bvec_iter.bi_sector << 9));
|
|
+ prt_printf(&buf, "read error: %s", bch2_err_str(ret));
|
|
bch_err_ratelimited(c, "%s", buf.buf);
|
|
printbuf_exit(&buf);
|
|
|
|
- rbio->bio.bi_status = BLK_STS_IOERR;
|
|
- bch2_rbio_done(rbio);
|
|
+ rbio->bio.bi_status = BLK_STS_IOERR;
|
|
+ rbio->ret = ret;
|
|
+
|
|
+ if (!(flags & BCH_READ_in_retry))
|
|
+ bch2_rbio_done(rbio);
|
|
}
|
|
|
|
- bch2_trans_put(trans);
|
|
+ bch2_trans_iter_exit(trans, &iter);
|
|
bch2_bkey_buf_exit(&sk, c);
|
|
+ return ret;
|
|
}
|
|
|
|
void bch2_fs_io_read_exit(struct bch_fs *c)
|
|
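[Editor's note: the io_read.c changes above replace the READ_RETRY/READ_RETRY_AVOID magic values with negative error codes that can be tested by class, e.g. bch2_err_matches(ret, BCH_ERR_data_read_retry). Below is a minimal standalone sketch of that kind of class-matched error code; every name in it (err_parent, ERR_data_read_retry, ...) is a hypothetical illustration, not bcachefs's actual errcode machinery.]

/* Illustrative sketch only: error codes grouped under a parent class so a
 * single err_matches() test covers every retryable read error. */
#include <stdio.h>

enum {
	ERR_START = 2048,
	ERR_data_read_retry,			/* the class itself */
	ERR_data_read_retry_device_offline,	/* member of the class */
	ERR_data_read_retry_csum_err,		/* member of the class */
	ERR_data_read_key_overwritten,		/* not retryable */
	ERR_MAX,
};

/* err_parent[code - ERR_START] gives the enclosing class, 0 at the root: */
static const unsigned err_parent[] = {
	[ERR_data_read_retry - ERR_START]		 = 0,
	[ERR_data_read_retry_device_offline - ERR_START] = ERR_data_read_retry,
	[ERR_data_read_retry_csum_err - ERR_START]	 = ERR_data_read_retry,
	[ERR_data_read_key_overwritten - ERR_START]	 = 0,
};

static int err_matches(int ret, unsigned class)
{
	unsigned code = -ret;	/* codes are returned negated */

	while (code >= ERR_START && code < ERR_MAX) {
		if (code == class)
			return 1;
		code = err_parent[code - ERR_START];
	}
	return 0;
}

int main(void)
{
	/* retryable: device offline is a member of the retry class */
	printf("%d\n", err_matches(-ERR_data_read_retry_device_offline,
				   ERR_data_read_retry));	/* prints 1 */
	/* not retryable: key overwritten sits outside the class */
	printf("%d\n", err_matches(-ERR_data_read_key_overwritten,
				   ERR_data_read_retry));	/* prints 0 */
	return 0;
}

The payoff is visible in the retry loop above: one class test replaces an ever-growing list of per-code comparisons, and new retryable errors only need a parent assignment.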
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index a82e8a94ccb6..1a85b092fd1d 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H

#include "bkey_buf.h"
+#include "btree_iter.h"
#include "reflink.h"

struct bch_read_bio {
@@ -35,19 +36,18 @@ struct bch_read_bio {
u16 flags;
union {
struct {
- u16 bounce:1,
+ u16 data_update:1,
+ promote:1,
+ bounce:1,
split:1,
- kmalloc:1,
have_ioref:1,
narrow_crcs:1,
- hole:1,
- retry:2,
+ saw_error:1,
context:2;
};
u16 _state;
};
-
- struct bch_devs_list devs_have;
+ s16 ret;

struct extent_ptr_decoded pick;

@@ -65,8 +65,6 @@ struct bch_read_bio {
struct bpos data_pos;
struct bversion version;

- struct promote_op *promote;
-
struct bch_io_opts opts;

struct work_struct work;
@@ -108,61 +106,90 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
return 0;
}

+#define BCH_READ_FLAGS() \
+ x(retry_if_stale) \
+ x(may_promote) \
+ x(user_mapped) \
+ x(last_fragment) \
+ x(must_bounce) \
+ x(must_clone) \
+ x(in_retry)
+
+enum __bch_read_flags {
+#define x(n) __BCH_READ_##n,
+ BCH_READ_FLAGS()
+#undef x
+};
+
enum bch_read_flags {
- BCH_READ_RETRY_IF_STALE = 1 << 0,
- BCH_READ_MAY_PROMOTE = 1 << 1,
- BCH_READ_USER_MAPPED = 1 << 2,
- BCH_READ_NODECODE = 1 << 3,
- BCH_READ_LAST_FRAGMENT = 1 << 4,
-
- /* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 5,
- BCH_READ_MUST_CLONE = 1 << 6,
- BCH_READ_IN_RETRY = 1 << 7,
+#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
+ BCH_READ_FLAGS()
+#undef x
};

int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
struct bvec_iter, struct bpos, enum btree_id,
struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned);
+ struct bch_io_failures *, unsigned, int);

static inline void bch2_read_extent(struct btree_trans *trans,
struct bch_read_bio *rbio, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent, unsigned flags)
{
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags);
+ int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags, -1);
+ /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
+ WARN(ret, "unhandled error from __bch2_read_extent()");
}

-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum,
+ struct bch_io_failures *, struct bkey_buf *, unsigned flags);

static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
{
- struct bch_io_failures failed = { .nr = 0 };
-
BUG_ON(rbio->_state);

- rbio->c = c;
- rbio->start_time = local_clock();
rbio->subvol = inum.subvol;

- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED);
+ bch2_trans_run(c,
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
+ BCH_READ_retry_if_stale|
+ BCH_READ_may_promote|
+ BCH_READ_user_mapped));
+}
+
+static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
+ struct bch_read_bio *orig)
+{
+ struct bch_read_bio *rbio = to_rbio(bio);
+
+ rbio->c = orig->c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->split = true;
+ rbio->parent = orig;
+ rbio->opts = orig->opts;
+ return rbio;
}

static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_io_opts opts)
+ struct bch_fs *c,
+ struct bch_io_opts opts,
+ bio_end_io_t end_io)
{
struct bch_read_bio *rbio = to_rbio(bio);

- rbio->_state = 0;
- rbio->promote = NULL;
- rbio->opts = opts;
+ rbio->start_time = local_clock();
+ rbio->c = c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->opts = opts;
+ rbio->bio.bi_end_io = end_io;
return rbio;
}

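[Editor's note: the new BCH_READ_FLAGS() list above is an x-macro: one list of names is expanded once into bit indices and once into masks, so the two can never drift apart. Here is a minimal standalone sketch of the same pattern, with a third expansion for a debugging name table; all names are hypothetical demo names.]

/* Sketch of the x-macro pattern: one flag list, three expansions. */
#include <stdio.h>

#define DEMO_READ_FLAGS()	\
	x(retry_if_stale)	\
	x(may_promote)		\
	x(user_mapped)

enum __demo_read_flags {	/* bit indices: 0, 1, 2, ... */
#define x(n) __DEMO_READ_##n,
	DEMO_READ_FLAGS()
#undef x
};

enum demo_read_flags {		/* masks: 1 << index */
#define x(n) DEMO_READ_##n = 1U << __DEMO_READ_##n,
	DEMO_READ_FLAGS()
#undef x
};

static const char * const demo_read_flag_names[] = {	/* for *_to_text() */
#define x(n) #n,
	DEMO_READ_FLAGS()
#undef x
	NULL
};

int main(void)
{
	unsigned flags = DEMO_READ_retry_if_stale|DEMO_READ_user_mapped;

	for (unsigned i = 0; demo_read_flag_names[i]; i++)
		if (flags & (1U << i))
			printf("%s\n", demo_read_flag_names[i]);
	return 0;
}

Adding a flag is then a one-line change to the list; the index enum, the mask enum, and the name table all update together.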
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 03892388832b..07b55839768e 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -34,6 +34,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>

+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "");
+#endif
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -374,7 +380,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ op->flags & BCH_WRITE_check_enospc);
bch2_trans_iter_exit(trans, &iter);

if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -396,29 +402,36 @@ static int bch2_write_index_default(struct bch_write_op *op)

/* Writes */

-static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
- u64 offset)
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
- bch2_inum_offset_err_msg(op->c, out,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- prt_printf(out, "write error%s: ",
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
-}
+ struct printbuf buf = PRINTBUF;

-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
-{
- __bch2_write_op_error(out, op, op->pos.offset);
-}
+ if (op->subvol) {
+ bch2_inum_offset_err_msg(op->c, &buf,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ } else {
+ struct bpos pos = op->pos;
+ pos.offset = offset;
+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
+ }

-static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_write_op *op, u64 offset)
-{
- bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- prt_printf(out, "write error%s: ",
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+ prt_str(&buf, "write error: ");
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ if (op->flags & BCH_WRITE_move) {
+ struct data_update *u = container_of(op, struct data_update, op);
+
+ prt_printf(&buf, "\n from internal move ");
+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
+ }
+
+ bch_err_ratelimited(op->c, "%s", buf.buf);
+ printbuf_exit(&buf);
}

void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -493,7 +506,7 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);

- if (!(op->flags & BCH_WRITE_MOVE))
+ if (!(op->flags & BCH_WRITE_move))
bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);

@@ -516,7 +529,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));

if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -EIO;
+ return -BCH_ERR_data_write_io;
}

if (dst != src)
@@ -539,7 +552,7 @@ static void __bch2_write_index(struct bch_write_op *op)
unsigned dev;
int ret = 0;

- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
ret = bch2_write_drop_io_error_ptrs(op);
if (ret)
goto err;
@@ -548,7 +561,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);

- ret = !(op->flags & BCH_WRITE_MOVE)
+ ret = !(op->flags & BCH_WRITE_move)
? bch2_write_index_default(op)
: bch2_data_update_index_update(op);

@@ -560,11 +573,8 @@ static void __bch2_write_index(struct bch_write_op *op)
if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

- struct printbuf buf = PRINTBUF;
- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
}

if (ret)
@@ -573,21 +583,29 @@ static void __bch2_write_index(struct bch_write_op *op)
out:
/* If some a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);

bch2_open_buckets_put(c, &op->open_buckets);
return;
err:
keys->top = keys->keys;
op->error = ret;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
goto out;
}

static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
if (state != wp->state) {
+ struct task_struct *p = current;
u64 now = ktime_get_ns();
+ u64 runtime = p->se.sum_exec_runtime +
+ (now - p->se.exec_start);
+
+ if (state == WRITE_POINT_runnable)
+ wp->last_runtime = runtime;
+ else if (wp->state == WRITE_POINT_runnable)
+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;

if (wp->last_state_change &&
time_after64(now, wp->last_state_change))
@@ -601,7 +619,7 @@ static inline void wp_update_state(struct write_point *wp, bool running)
{
enum write_point_state state;

- state = running ? WRITE_POINT_running :
+ state = running ? WRITE_POINT_runnable:
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
: WRITE_POINT_stopped;

@@ -615,8 +633,8 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;

- if ((op->flags & BCH_WRITE_SUBMITTED) &&
- (op->flags & BCH_WRITE_MOVE))
+ if ((op->flags & BCH_WRITE_submitted) &&
+ (op->flags & BCH_WRITE_move))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

spin_lock_irqsave(&wp->writes_lock, flags);
@@ -654,11 +672,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
if (!op)
break;

- op->flags |= BCH_WRITE_IN_WORKER;
+ op->flags |= BCH_WRITE_in_worker;

__bch2_write_index(op);

- if (!(op->flags & BCH_WRITE_SUBMITTED))
+ if (!(op->flags & BCH_WRITE_submitted))
__bch2_write(op);
else
bch2_write_done(&op->cl);
@@ -676,13 +694,17 @@ static void bch2_write_endio(struct bio *bio)
? bch2_dev_have_ref(c, wbio->dev)
: NULL;

- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_inum_offset_ratelimited(ca,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_IO_ERROR;
+ op->flags |= BCH_WRITE_io_error;
}

if (wbio->nocow) {
@@ -692,10 +714,8 @@ static void bch2_write_endio(struct bio *bio)
set_bit(wbio->dev, op->devs_need_flush->d);
}

- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
- }

if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -729,7 +749,7 @@ static void init_append_extent(struct bch_write_op *op,
bch2_extent_crc_append(&e->k_i, crc);

bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_CACHED);
+ op->flags & BCH_WRITE_cached);

bch2_keylist_push(&op->insert_keys);
}
@@ -789,7 +809,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
{
struct bio *bio = &op->wbio.bio;
struct bch_extent_crc_unpacked new_crc;
- int ret;

/* bch2_rechecksum_bio() can't encrypt or decrypt data: */

@@ -797,10 +816,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
bch2_csum_type_is_encryption(new_csum_type))
new_csum_type = op->crc.csum_type;

- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
if (ret)
return ret;

@@ -810,44 +829,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
return 0;
}

-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
- int ret;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return -EIO;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return ret;
-}
-
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
- return PREP_ENCODED_OK;
+ struct bch_csum csum;
+ int ret = 0;

BUG_ON(bio_sectors(bio) != op->crc.compressed_size);

@@ -858,12 +845,13 @@ static enum prep_encoded_ret {
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ op->csum_type != op->crc.csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }

- return PREP_ENCODED_DO_WRITE;
+ return 1;
}

/*
@@ -871,20 +859,24 @@ static enum prep_encoded_ret {
* is, we have to decompress it:
*/
if (crc_is_compressed(op->crc)) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
/* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ goto csum_err;
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
+
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }

- if (bch2_bio_uncompress_inplace(op, bio))
- return PREP_ENCODED_ERR;
+ ret = bch2_bio_uncompress_inplace(op, bio);
+ if (ret)
+ return ret;
}

/*
@@ -896,22 +888,44 @@ static enum prep_encoded_ret {
* If the data is checksummed and we're only writing a subset,
* rechecksum and adjust bio to point to currently live data:
*/
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }

/*
* If we want to compress the data, it has to be decrypted:
*/
- if ((op->compression_opt ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
+ (op->compression_opt || op->crc.csum_type != op->csum_type)) {
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
+
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }

- return PREP_ENCODED_OK;
+ return 0;
+csum_err:
+ bch2_write_op_error(op, op->pos.offset,
+ "error verifying existing checksum while moving existing data (memory corruption?)\n"
+ " expected %0llx:%0llx got %0llx:%0llx type %s",
+ op->crc.csum.hi,
+ op->crc.csum.lo,
+ csum.hi,
+ csum.lo,
+ op->crc.csum_type < BCH_CSUM_NR
+ ? __bch2_csum_types[op->crc.csum_type]
+ : "(unknown)");
+ return -BCH_ERR_data_write_csum;
}

static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -926,43 +940,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bool page_alloc_failed = false;
int ret, more = 0;

+ if (op->incompressible)
+ op->compression_opt = 0;
+
BUG_ON(!bio_sectors(src));

ec_buf = bch2_writepoint_ec_buf(c, wp);

- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- /* XXX look for bug here */
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
+ ret = bch2_write_prep_encoded_data(op, wp);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
}
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
}

if (ec_buf ||
op->compression_opt ||
(op->csum_type &&
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ !(op->flags & BCH_WRITE_pages_stable)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ !(op->flags & BCH_WRITE_pages_owned))) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}

+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+ if (!bounce && write_corrupt_ratio) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+#endif
saved_iter = dst->bi_iter;

do {
@@ -976,7 +998,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
break;

BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ (op->flags & BCH_WRITE_data_encoded) &&
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_opt && !bounce);

@@ -1014,7 +1036,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
}
}

- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ if ((op->flags & BCH_WRITE_data_encoded) &&
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
@@ -1032,12 +1054,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
* data can't be modified (by userspace) while it's in
* flight.
*/
- if (bch2_rechecksum_bio(c, src, version, op->crc,
+ ret = bch2_rechecksum_bio(c, src, version, op->crc,
&crc, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
+ op->csum_type);
+ if (ret)
+ goto err;
/*
* rchecksum_bio sets compression_type on crc from op->crc,
* this isn't always correct as sometimes we're changing
@@ -1046,13 +1069,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.compression_type = compression_type;
crc.nonce = nonce;
} else {
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
+ if ((op->flags & BCH_WRITE_data_encoded) &&
+ (ret = bch2_rechecksum_bio(c, src, version, op->crc,
NULL, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
+ op->crc.csum_type)))
+ goto err;

crc.compressed_size = dst_len >> 9;
crc.uncompressed_size = src_len >> 9;
@@ -1072,6 +1095,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,

init_append_extent(op, wp, version, crc);

+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (write_corrupt_ratio) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+#endif
+
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
@@ -1103,16 +1134,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
do_write:
*_dst = dst;
return more;
-csum_err:
- {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = -EIO;
err:
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
@@ -1190,39 +1211,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;

for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (ret) {
- op->error = ret;
+ if (ret)
break;
- }
}

bch2_trans_put(trans);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
+ }
+
+ if (ret)
+ op->error = ret;
}

static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
- op->error = -EIO;
- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
+ op->error = -BCH_ERR_data_write_io;
+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}

@@ -1251,7 +1269,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct bucket_to_lock *stale_at;
int stale, ret;

- if (op->flags & BCH_WRITE_MOVE)
+ if (op->flags & BCH_WRITE_move)
return;

darray_init(&buckets);
@@ -1309,7 +1327,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
}), GFP_KERNEL|__GFP_NOFAIL);

if (ptr->unwritten)
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ op->flags |= BCH_WRITE_convert_unwritten;
}

/* Unlock before taking nocow locks, doing IO: */
@@ -1317,7 +1335,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
bch2_trans_unlock(trans);

bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ if (op->flags & BCH_WRITE_convert_unwritten)
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

darray_for_each(buckets, i) {
@@ -1342,7 +1360,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
}

op->pos.offset += bio_sectors(bio);
@@ -1352,11 +1370,12 @@ static void bch2_nocow_write(struct bch_write_op *op)
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl);
+
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true);

bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_SUBMITTED)
+ if (op->flags & BCH_WRITE_submitted)
break;
bch2_btree_iter_advance(&iter);
}
@@ -1370,21 +1389,18 @@ static void bch2_nocow_write(struct bch_write_op *op)
darray_exit(&buckets);

if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
op->error = ret;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
}

/* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_SUBMITTED)) {
+ if (!(op->flags & BCH_WRITE_submitted)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_SYNC) {
+ } else if (op->flags & BCH_WRITE_sync) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
@@ -1414,7 +1430,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
"pointer to invalid bucket in nocow path on device %llu\n %s",
stale_at->b.inode,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_data_write_invalid_ptr;
} else {
/* We can retry this: */
ret = -BCH_ERR_transaction_restart;
@@ -1436,7 +1452,7 @@ static void __bch2_write(struct bch_write_op *op)

if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_SUBMITTED)
+ if (op->flags & BCH_WRITE_submitted)
goto out_nofs_restore;
}
again:
@@ -1466,7 +1482,7 @@ static void __bch2_write(struct bch_write_op *op)
ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
op->write_point,
&op->devs_have,
op->nr_replicas,
@@ -1489,16 +1505,12 @@ static void __bch2_write(struct bch_write_op *op)
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;

if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ if (!(op->flags & BCH_WRITE_alloc_nowait))
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1524,14 +1536,14 @@ static void __bch2_write(struct bch_write_op *op)
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
- if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_SUBMITTED) &&
- !(op->flags & BCH_WRITE_IN_WORKER))) {
+ if ((op->flags & BCH_WRITE_sync) ||
+ (!(op->flags & BCH_WRITE_submitted) &&
+ !(op->flags & BCH_WRITE_in_worker))) {
bch2_wait_on_allocator(c, &op->cl);

__bch2_write_index(op);

- if (!(op->flags & BCH_WRITE_SUBMITTED))
+ if (!(op->flags & BCH_WRITE_submitted))
goto again;
bch2_write_done(&op->cl);
} else {
@@ -1552,8 +1564,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)

memset(&op->failed, 0, sizeof(op->failed));

- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_wrote_data_inline;
+ op->flags |= BCH_WRITE_submitted;

bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

@@ -1616,8 +1628,8 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));

- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+ if (op->flags & BCH_WRITE_only_specified_devs)
+ op->flags |= BCH_WRITE_alloc_nowait;

op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
@@ -1625,11 +1637,8 @@ CLOSURE_CALLBACK(bch2_write)
wbio_init(bio)->put_bio = false;

if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "misaligned write");
- printbuf_exit(&buf);
- op->error = -EIO;
+ bch2_write_op_error(op, op->pos.offset, "misaligned write");
+ op->error = -BCH_ERR_data_write_misaligned;
goto err;
}

@@ -1638,13 +1647,14 @@ CLOSURE_CALLBACK(bch2_write)
goto err;
}

- if (!(op->flags & BCH_WRITE_MOVE) &&
+ if (!(op->flags & BCH_WRITE_move) &&
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;
}

- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ if (!(op->flags & BCH_WRITE_move))
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);

data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -1675,20 +1685,26 @@ static const char * const bch2_write_flags[] = {

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
- prt_str(out, "pos: ");
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos);
prt_newline(out);
printbuf_indent_add(out, 2);

- prt_str(out, "started: ");
+ prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out);

- prt_str(out, "flags: ");
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);

- prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
+
+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));

printbuf_indent_sub(out, 2);
}
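[Editor's note: the write_corrupt_ratio debug knob added above injects corruption into roughly one write in `ratio` before submission, to exercise the read path's checksum-error handling. The following standalone sketch shows the same ratio-based fault-injection idea on a flat buffer; it is an illustration only, not the kernel's bch2_maybe_corrupt_bio(), which operates on a bio and uses kernel randomness.]

/* Sketch: on average one call in `write_corrupt_ratio` flips a bit. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned write_corrupt_ratio = 3;	/* 0 disables injection */

static int maybe_corrupt_write(unsigned char *buf, size_t len)
{
	if (!write_corrupt_ratio || !len ||
	    (unsigned) rand() % write_corrupt_ratio)
		return 0;

	buf[(size_t) rand() % len] ^= 0x40;	/* flip one bit of one byte */
	return 1;
}

int main(void)
{
	unsigned char data[64];
	unsigned corrupted = 0;

	srand(1);
	for (int i = 0; i < 1000; i++) {
		memset(data, 0xaa, sizeof(data));
		corrupted += maybe_corrupt_write(data, sizeof(data));
	}
	/* expect roughly 1000 / write_corrupt_ratio corruptions */
	printf("%u of 1000 writes corrupted\n", corrupted);
	return 0;
}

Note how the patch forces a bounce buffer when injection is enabled: corruption must happen in a private copy, never in pages the caller still owns.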
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index b4626013abc8..b8ab19a1e1da 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -11,33 +11,27 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);

-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);

-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
+__printf(3, 4)
+void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);

#define BCH_WRITE_FLAGS() \
- x(ALLOC_NOWAIT) \
- x(CACHED) \
- x(DATA_ENCODED) \
- x(PAGES_STABLE) \
- x(PAGES_OWNED) \
- x(ONLY_SPECIFIED_DEVS) \
- x(WROTE_DATA_INLINE) \
- x(FROM_INTERNAL) \
- x(CHECK_ENOSPC) \
- x(SYNC) \
- x(MOVE) \
- x(IN_WORKER) \
- x(SUBMITTED) \
- x(IO_ERROR) \
- x(CONVERT_UNWRITTEN)
+ x(alloc_nowait) \
+ x(cached) \
+ x(data_encoded) \
+ x(pages_stable) \
+ x(pages_owned) \
+ x(only_specified_devs) \
+ x(wrote_data_inline) \
+ x(check_enospc) \
+ x(sync) \
+ x(move) \
+ x(in_worker) \
+ x(submitted) \
+ x(io_error) \
+ x(convert_unwritten)

enum __bch_write_flags {
#define x(f) __BCH_WRITE_##f,
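[Editor's note: bch2_write_op_error() above becomes a printf-style varargs helper, with __printf(3, 4) letting the compiler type-check format arguments at every call site. A minimal standalone sketch of the same pattern follows; report_write_error and its context prefix are hypothetical stand-ins, not the bcachefs helper.]

/* Sketch: varargs error reporting with compile-time format checking. */
#include <stdarg.h>
#include <stdio.h>

__attribute__((format(printf, 2, 3)))
static void report_write_error(unsigned long long offset, const char *fmt, ...)
{
	char msg[256];
	va_list args;

	va_start(args, fmt);
	vsnprintf(msg, sizeof(msg), fmt, args);
	va_end(args);

	/* context prefix + caller-supplied detail, emitted as one line */
	fprintf(stderr, "write error at offset %llu: %s\n", offset, msg);
}

int main(void)
{
	report_write_error(4096, "btree update error: %s", "ENOSPC");
	return 0;
}

The design win mirrors the patch: callers no longer build and tear down their own printbuf at every error site; the helper owns the buffer, the context prefix, and the ratelimiting in one place.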
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index 6e878a6f2f0b..3ef6df9145ef 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -64,7 +64,7 @@ struct bch_write_op {
struct bpos pos;
struct bversion version;

- /* For BCH_WRITE_DATA_ENCODED: */
+ /* For BCH_WRITE_data_encoded: */
struct bch_extent_crc_unpacked crc;

struct write_point_specifier write_point;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
|
|
index 05b1250619ec..ecb97d435f6a 100644
|
|
--- a/fs/bcachefs/journal.c
|
|
+++ b/fs/bcachefs/journal.c
|
|
@@ -20,13 +20,6 @@
|
|
#include "journal_seq_blacklist.h"
|
|
#include "trace.h"

-static const char * const bch2_journal_errors[] = {
-#define x(n) #n,
- JOURNAL_ERRORS()
-#undef x
- NULL
-};
-
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -56,14 +49,20 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);

- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
+ if (!buf->write_started)
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));

- prt_printf(out, "size:\t");
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
- prt_newline(out);
+ struct closure *cl = &buf->io;
+ int r = atomic_read(&cl->remaining);
+ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
+
+ if (buf->data) {
+ prt_printf(out, "size:\t");
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+ }

- prt_printf(out, "expires:\t");
- prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
+ prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);

prt_printf(out, "flags:\t");
if (buf->noflush)
@@ -87,6 +86,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6

static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
+ lockdep_assert_held(&j->lock);
+ out->atomic++;
+
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);

@@ -95,6 +97,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq++)
bch2_journal_buf_to_text(out, j, seq);
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
+
+ --out->atomic;
}

static inline struct journal_buf *
@@ -104,10 +108,8 @@ journal_seq_to_buf(struct journal *j, u64 seq)

EBUG_ON(seq > journal_cur_seq(j));

- if (journal_seq_unwritten(j, seq)) {
+ if (journal_seq_unwritten(j, seq))
buf = j->buf + (seq & JOURNAL_BUF_MASK);
- EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
- }
return buf;
}

@@ -139,8 +141,10 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
bool stuck = false;
struct printbuf buf = PRINTBUF;

- if (!(error == JOURNAL_ERR_journal_full ||
- error == JOURNAL_ERR_journal_pin_full) ||
+ buf.atomic++;
+
+ if (!(error == -BCH_ERR_journal_full ||
+ error == -BCH_ERR_journal_pin_full) ||
nr_unwritten_journal_entries(j) ||
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
return stuck;
@@ -167,9 +171,9 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
spin_unlock(&j->lock);

bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
- bch2_journal_errors[error]);
+ bch2_err_str(error));
bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "%s", buf.buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);

printbuf_reset(&buf);
bch2_journal_pins_to_text(&buf, j);
@@ -195,7 +199,8 @@ void bch2_journal_do_writes(struct journal *j)
if (w->write_started)
continue;

- if (!journal_state_count(j->reservations, idx)) {
+ if (!journal_state_seq_count(j, j->reservations, seq)) {
+ j->seq_write_started = seq;
w->write_started = true;
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
@@ -306,7 +311,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t

bch2_journal_space_available(j);

- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
}

void bch2_journal_halt(struct journal *j)
@@ -377,29 +382,41 @@ static int journal_entry_open(struct journal *j)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

if (j->blocked)
- return JOURNAL_ERR_blocked;
+ return -BCH_ERR_journal_blocked;

if (j->cur_entry_error)
return j->cur_entry_error;

- if (bch2_journal_error(j))
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ int ret = bch2_journal_error(j);
+ if (unlikely(ret))
+ return ret;

if (!fifo_free(&j->pin))
- return JOURNAL_ERR_journal_pin_full;
+ return -BCH_ERR_journal_pin_full;

if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return JOURNAL_ERR_max_in_flight;
+ return -BCH_ERR_journal_max_in_flight;
+
+ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
+ return -BCH_ERR_journal_max_open;

if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
if (bch2_fs_emergency_read_only_locked(c))
bch_err(c, "fatal error - emergency read only");
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ return -BCH_ERR_journal_shutdown;
}

+ if (!j->free_buf && !buf->data)
+ return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
+
BUG_ON(!j->cur_entry_sectors);

+ if (!buf->data) {
+ swap(buf->data, j->free_buf);
+ swap(buf->buf_size, j->free_buf_size);
+ }
+
buf->expires =
(journal_cur_seq(j) == j->flushed_seq_ondisk
? jiffies
@@ -415,7 +432,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);

if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return JOURNAL_ERR_journal_full;
+ return -BCH_ERR_journal_full;

if (fifo_empty(&j->pin) && j->reclaim_thread)
wake_up_process(j->reclaim_thread);
@@ -464,7 +481,7 @@ static int journal_entry_open(struct journal *j)

new.idx++;
BUG_ON(journal_state_count(new, new.idx));
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));

journal_state_inc(&new);

@@ -514,6 +531,33 @@ static void journal_write_work(struct work_struct *work)
spin_unlock(&j->lock);
}

+static void journal_buf_prealloc(struct journal *j)
+{
+ if (j->free_buf &&
+ j->free_buf_size >= j->buf_size_want)
+ return;
+
+ unsigned buf_size = j->buf_size_want;
+
+ spin_unlock(&j->lock);
+ void *buf = kvmalloc(buf_size, GFP_NOFS);
+ spin_lock(&j->lock);
+
+ if (buf &&
+ (!j->free_buf ||
+ buf_size > j->free_buf_size)) {
+ swap(buf, j->free_buf);
+ swap(buf_size, j->free_buf_size);
+ }
+
+ if (unlikely(buf)) {
+ spin_unlock(&j->lock);
+ /* kvfree can sleep */
+ kvfree(buf);
+ spin_lock(&j->lock);
+ }
+}
+
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
@@ -525,25 +569,28 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
if (journal_res_get_fast(j, res, flags))
return 0;

- if (bch2_journal_error(j))
- return -BCH_ERR_erofs_journal_err;
+ ret = bch2_journal_error(j);
+ if (unlikely(ret))
+ return ret;

if (j->blocked)
- return -BCH_ERR_journal_res_get_blocked;
+ return -BCH_ERR_journal_blocked;

if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = JOURNAL_ERR_journal_full;
+ ret = -BCH_ERR_journal_full;
can_discard = j->can_discard;
goto out;
}

if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = JOURNAL_ERR_max_in_flight;
+ ret = -BCH_ERR_journal_max_in_flight;
goto out;
}

spin_lock(&j->lock);

+ journal_buf_prealloc(j);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
@@ -566,25 +613,48 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);

__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
+ ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
out:
- if (ret == JOURNAL_ERR_retry)
- goto retry;
- if (!ret)
+ if (likely(!ret))
return 0;
+ if (ret == -BCH_ERR_journal_retry_open)
+ goto retry;

if (journal_error_check_stuck(j, ret, flags))
- ret = -BCH_ERR_journal_res_get_blocked;
+ ret = -BCH_ERR_journal_stuck;
+
+ if (ret == -BCH_ERR_journal_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
+ trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_printbuf_make_room(&buf, 4096);
+
+ spin_lock(&j->lock);
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ spin_unlock(&j->lock);

- if (ret == JOURNAL_ERR_max_in_flight &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }

+ if (ret == -BCH_ERR_journal_max_open &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
+ trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
+
+ bch2_printbuf_make_room(&buf, 4096);
+
+ spin_lock(&j->lock);
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
+ spin_unlock(&j->lock);
+
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
@@ -594,8 +664,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
- if ((ret == JOURNAL_ERR_journal_full ||
- ret == JOURNAL_ERR_journal_pin_full) &&
+ if ((ret == -BCH_ERR_journal_full ||
+ ret == -BCH_ERR_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
@@ -608,9 +678,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
}
}

- return ret == JOURNAL_ERR_insufficient_devices
- ? -BCH_ERR_erofs_journal_err
- : -BCH_ERR_journal_res_get_blocked;
+ return ret;
}

static unsigned max_dev_latency(struct bch_fs *c)
@@ -640,7 +708,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
int ret;

if (closure_wait_event_timeout(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK),
HZ))
return ret;
@@ -654,19 +722,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
remaining_wait = max(0, remaining_wait - HZ);

if (closure_wait_event_timeout(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK),
remaining_wait))
return ret;

+ bch_err(c, "Journal stuck? Waited for 10 seconds, err %s", bch2_err_str(ret));
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
- buf.buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);

closure_wait_event(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
@@ -687,7 +755,6 @@ void bch2_journal_entry_res_resize(struct journal *j,
goto out;

j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
- smp_mb();
state = READ_ONCE(j->reservations);

if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
@@ -907,7 +974,7 @@ int bch2_journal_meta(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);

if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
- return -EROFS;
+ return -BCH_ERR_erofs_no_writes;

int ret = __bch2_journal_meta(j);
bch2_write_ref_put(c, BCH_WRITE_REF_journal);
@@ -951,7 +1018,8 @@ static void __bch2_journal_block(struct journal *j)
new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
} while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));

- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
}
}

@@ -992,7 +1060,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
*blocked = true;
}

- ret = journal_state_count(s, idx) > open
+ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
? ERR_PTR(-EAGAIN)
: buf;
break;
@@ -1349,6 +1417,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_write_started = cur_seq - 1;
j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
@@ -1389,8 +1458,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;

- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
- j->reservations.unwritten_idx++;
+ j->reservations.idx = journal_cur_seq(j);

c->last_bucket_seq_cleanup = journal_cur_seq(j);

@@ -1443,7 +1511,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);

for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
return -BCH_ERR_ENOMEM_dev_journal_init;
@@ -1482,6 +1550,7 @@ void bch2_fs_journal_exit(struct journal *j)

for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
kvfree(j->buf[i].data);
+ kvfree(j->free_buf);
free_fifo(&j->pin);
}

@@ -1508,13 +1577,13 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;

- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
- if (!j->buf[i].data)
- return -BCH_ERR_ENOMEM_journal_buf;
+ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
+ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
+ if (!j->free_buf)
+ return -BCH_ERR_ENOMEM_journal_buf;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
j->buf[i].idx = i;
- }

j->pin.front = j->pin.back = 1;

@@ -1564,6 +1633,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "average write size:\t");
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
prt_newline(out);
+ prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
@@ -1571,7 +1641,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
prt_printf(out, "blocked:\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
prt_printf(out, "current entry:\t");

switch (s.cur_entry_offset) {
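
The journal_buf_prealloc() helper added above follows the standard drop-the-lock-to-allocate pattern: kvmalloc() and kvfree() can both sleep, so they run outside j->lock, and only the pointer swap happens under it. A minimal standalone sketch of the same pattern, with a hypothetical try_install_buf() that is not part of the patch:

/* Sketch: allocate unlocked, install under the lock, free the loser unlocked. */
static void try_install_buf(spinlock_t *lock, void **slot, size_t size)
{
	void *new = kvmalloc(size, GFP_NOFS);	/* may sleep: lock not held */

	spin_lock(lock);
	if (new && !*slot)
		swap(new, *slot);	/* install; 'new' now holds the loser (or NULL) */
	spin_unlock(lock);

	kvfree(new);			/* kvfree() may sleep, so do it unlocked */
}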
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 107f7f901cd9..47828771f9c2 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j)
closure_wake_up(&j->async_wait);
}

-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- return j->buf + j->reservations.idx;
-}
-
/* Sequence number of oldest dirty journal entry */

static inline u64 journal_last_seq(struct journal *j)
@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j)
return j->seq_ondisk + 1;
}

+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ unsigned idx = (journal_cur_seq(j) &
+ JOURNAL_BUF_MASK &
+ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
+
+ return j->buf + idx;
+}
+
static inline int journal_state_count(union journal_res_state s, int idx)
{
switch (idx) {
@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx)
BUG();
}

+static inline int journal_state_seq_count(struct journal *j,
+ union journal_res_state s, u64 seq)
+{
+ if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
+ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
+ else
+ return 0;
+}
+
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
static inline struct jset_entry *
journal_res_entry(struct journal *j, struct journal_res *res)
{
- return vstruct_idx(j->buf[res->idx].data, res->offset);
+ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
}

static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *);
void bch2_journal_do_writes(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64);

-static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
{
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;

s = journal_state_buf_put(j, idx);
@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
bch2_journal_buf_put_final(j, seq);
}

-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
{
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;

s = journal_state_buf_put(j, idx);
@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j,
BCH_JSET_ENTRY_btree_keys,
0, 0, 0);

- bch2_journal_buf_put(j, res->idx, res->seq);
+ bch2_journal_buf_put(j, res->seq);

res->ref = 0;
}
@@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j,

/*
* Check if there is still room in the current journal
- * entry:
+ * entry, smp_rmb() guarantees that reads from reservations.counter
+ * occur before accessing cur_entry_u64s:
*/
+ smp_rmb();
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;

@@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j,
&old.v, new.v));

res->ref = true;
- res->idx = old.idx;
res->offset = old.cur_entry_offset;
- res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
+ res->seq = journal_cur_seq(j);
+ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
return 1;
}

@@ -390,6 +407,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,
NULL, _THIS_IP_);
EBUG_ON(!res->ref);
+ BUG_ON(!res->seq);
}
return 0;
}
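
With these helpers a journal_res no longer carries a buffer index; everything is derived from the sequence number. Roughly, given the constants from journal_types.h below: the low JOURNAL_STATE_BUF_BITS of a seq select one of the four refcounted state slots, while the low JOURNAL_BUF_BITS select one of the sixteen journal_bufs, so many entries can be in flight with only a few of them open. An illustrative fragment (not part of the patch):

u64 seq = 1234;
unsigned state_slot = seq & JOURNAL_STATE_BUF_MASK;	/* 0..3: refcount slot */
unsigned buf_slot   = seq & JOURNAL_BUF_MASK;		/* 0..15: journal_buf */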
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 11c39e0c34f4..4ed6137f0439 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1041,13 +1041,19 @@ static int journal_read_bucket(struct bch_dev *ca,
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);

+ u64 submit_time = local_clock();
ret = submit_bio_wait(bio);
kfree(bio);

- if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
- "journal read error: sector %llu",
- offset) ||
- bch2_meta_read_fault("journal")) {
+ if (!ret && bch2_meta_read_fault("journal"))
+ ret = -BCH_ERR_EIO_fault_injected;
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ submit_time, !ret);
+
+ if (ret) {
+ bch_err_dev_ratelimited(ca,
+ "journal read error: sector %llu", offset);
/*
* We don't error out of the recovery process
* here, since the relevant journal entry may be
@@ -1110,13 +1116,16 @@ static int journal_read_bucket(struct bch_dev *ca,
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);

- if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf)))
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good) {
+ bch_err_dev_ratelimited(ca, "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf));
saw_bad = true;
+ }

ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@@ -1515,7 +1524,7 @@ static void __journal_write_alloc(struct journal *j,
* @j: journal object
* @w: journal buf (entry to be written)
*
- * Returns: 0 on success, or -EROFS on failure
+ * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure
*/
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
@@ -1600,18 +1609,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
kvfree(new_buf);
}

-static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
-{
- return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
-}
-
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
- union journal_res_state old, new;
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;

@@ -1621,12 +1624,11 @@ static CLOSURE_CALLBACK(journal_write_done)

if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- err = -EIO;
+ err = -BCH_ERR_journal_write_err;
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
- if (bch2_mark_replicas(c, &replicas.e))
- err = -EIO;
+ err = bch2_mark_replicas(c, &replicas.e);
}

if (err)
@@ -1641,7 +1643,23 @@ static CLOSURE_CALLBACK(journal_write_done)
j->err_seq = seq;
w->write_done = true;

+ if (!j->free_buf || j->free_buf_size < w->buf_size) {
+ swap(j->free_buf, w->data);
+ swap(j->free_buf_size, w->buf_size);
+ }
+
+ if (w->data) {
+ void *buf = w->data;
+ w->data = NULL;
+ w->buf_size = 0;
+
+ spin_unlock(&j->lock);
+ kvfree(buf);
+ spin_lock(&j->lock);
+ }
+
bool completed = false;
+ bool do_discards = false;

for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@@ -1650,11 +1668,10 @@ static CLOSURE_CALLBACK(journal_write_done)
if (!w->write_done)
break;

- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
+ if (!j->err_seq && !w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;

- bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
@@ -1671,16 +1688,6 @@ static CLOSURE_CALLBACK(journal_write_done)
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);

- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
-
- new.unwritten_idx++;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
closure_wake_up(&w->wait);
completed = true;
}
@@ -1695,7 +1702,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}

if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;

@@ -1715,6 +1722,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
+
+ if (do_discards)
+ bch2_do_discards(c);
}

static void journal_write_endio(struct bio *bio)
@@ -1724,13 +1734,16 @@ static void journal_write_endio(struct bio *bio)
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;

- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ jbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("journal")) {
- unsigned long flags;
+ bch2_blk_status_to_str(bio->bi_status));

+ unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
@@ -1759,7 +1772,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
sectors);

struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
+ struct journal_bio *jbio = ja->bio[w->idx];
+ struct bio *bio = &jbio->bio;
+
+ jbio->submit_time = local_clock();
+
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1791,6 +1808,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);

+ /*
+ * Wait for previous journal writes to complete; they won't necessarily
+ * be flushed if they're still in flight
+ */
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
@@ -1984,7 +2005,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* write anything at all.
*/
if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
- return -EIO;
+ return error;

if (error ||
w->noflush ||
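
The journal_io.c conversions above all follow one shape: capture local_clock() before submission, report success/failure and latency through bch2_account_io_completion(), and print any error message as a separate ratelimited step, instead of going through the old bch2_dev_io_err_on() macro. A condensed sketch of that shape, error handling elided, names as used in this patch:

u64 submit_time = local_clock();
ret = submit_bio_wait(bio);

/* accounting is unconditional; message printing is separate */
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !ret);
if (ret)
	bch_err_dev_ratelimited(ca, "journal read error: sector %llu", offset);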
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index d373cd181a7f..5d1547aa118a 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -226,7 +226,7 @@ void bch2_journal_space_available(struct journal *j)

bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = JOURNAL_ERR_insufficient_devices;
+ ret = -BCH_ERR_insufficient_journal_devices;
goto out;
}

@@ -240,7 +240,7 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;

if (!j->space[journal_space_discarded].next_entry)
- ret = JOURNAL_ERR_journal_full;
+ ret = -BCH_ERR_journal_full;

if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j)
* @j: journal object
* @direct: direct or background reclaim?
* @kicked: requested to run since we last ran?
- * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (kthread && kthread_should_stop())
break;

- if (bch2_journal_error(j)) {
- ret = -EIO;
+ ret = bch2_journal_error(j);
+ if (ret)
break;
- }

bch2_journal_do_discards(j);

diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 1f25c111c54c..e463d2d95359 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);

- unsigned i;
- for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
- src < bl->start + nr;
- src++, i = eytzinger0_next(i, nr)) {
+ src = bl->start;
+ eytzinger0_for_each(i, nr) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));

if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
+ src++;
}

unsigned new_nr = dst - bl->start;
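
eytzinger0_for_each() visits the Eytzinger-ordered table in ascending key order, so pairing it with a plain src++ over the already-sorted superblock list keeps the two arrays in lockstep. Assuming the helpers in eytzinger.h, the macro expands to roughly:

/* approximate expansion: in-order traversal of an eytzinger0 array */
for (unsigned i = eytzinger0_first(nr);
     i != (unsigned) -1;
     i = eytzinger0_next(i, nr)) {
	/* t->entries[i] is visited in ascending key order */
}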
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 1ef3a28ed6ab..8e0eba776b9d 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -12,7 +12,11 @@
/* btree write buffer steals 8 bits for its own purposes: */
#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)

-#define JOURNAL_BUF_BITS 2
+#define JOURNAL_STATE_BUF_BITS 2
+#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
+#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
+
+#define JOURNAL_BUF_BITS 4
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)

@@ -82,7 +86,6 @@ struct journal_entry_pin {

struct journal_res {
bool ref;
- u8 idx;
u16 u64s;
u32 offset;
u64 seq;
@@ -98,9 +101,8 @@ union journal_res_state {
};

struct {
- u64 cur_entry_offset:20,
+ u64 cur_entry_offset:22,
idx:2,
- unwritten_idx:2,
buf0_count:10,
buf1_count:10,
buf2_count:10,
@@ -110,13 +112,13 @@ union journal_res_state {

/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */

/*
* We stash some journal state as sentinal values in cur_entry_offset:
* note - cur_entry_offset is in units of u64s
*/
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)

#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
@@ -149,28 +151,12 @@ enum journal_flags {
#undef x
};

-/* Reasons we may fail to get a journal reservation: */
-#define JOURNAL_ERRORS() \
- x(ok) \
- x(retry) \
- x(blocked) \
- x(max_in_flight) \
- x(journal_full) \
- x(journal_pin_full) \
- x(journal_stuck) \
- x(insufficient_devices)
-
-enum journal_errors {
-#define x(n) JOURNAL_ERR_##n,
- JOURNAL_ERRORS()
-#undef x
-};
-
typedef DARRAY(u64) darray_u64;

struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
+ u64 submit_time;

struct bio bio;
};
@@ -199,7 +185,7 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- enum journal_errors cur_entry_error;
+ int cur_entry_error;
unsigned cur_entry_offset_if_blocked;

unsigned buf_size_want;
@@ -220,6 +206,8 @@ struct journal {
* other is possibly being written out.
*/
struct journal_buf buf[JOURNAL_BUF_NR];
+ void *free_buf;
+ unsigned free_buf_size;

spinlock_t lock;

@@ -237,6 +225,7 @@ struct journal {
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;

+ u64 seq_write_started;
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
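
The widened cur_entry_offset still has to fit the 64-bit reservation word: 22 offset bits + 2 idx bits + four 10-bit refcounts is exactly 64, and the sentinels sit just below the enlarged 22-bit ceiling. Illustrative compile-time checks (not in the patch) that make the arithmetic explicit:

/* 22 offset bits + 2 idx bits + four 10-bit refcounts == one u64 */
BUILD_BUG_ON(22 + 2 + 4 * 10 != 64);

/* sentinels live just below the 22-bit ceiling */
BUILD_BUG_ON(JOURNAL_ENTRY_OFFSET_MAX  != (1U << 22) - 1);
BUILD_BUG_ON(JOURNAL_ENTRY_CLOSED_VAL  != JOURNAL_ENTRY_OFFSET_MAX - 1);
BUILD_BUG_ON(JOURNAL_ENTRY_BLOCKED_VAL != JOURNAL_ENTRY_OFFSET_MAX - 2);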
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index ce794d55818f..a299d9ec8ee4 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
+#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
@@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
}

-int bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
+int __bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
{
if (old_time == new_time)
return 0;
@@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = {
};

int bch2_lru_check_set(struct btree_trans *trans,
- u16 lru_id, u64 time,
+ u16 lru_id,
+ u64 dev_bucket,
+ u64 time,
struct bkey_s_c referring_k,
struct bkey_buf *last_flushed)
{
@@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
struct btree_iter lru_iter;
struct bkey_s_c lru_k =
bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(lru_id,
- bucket_to_u64(referring_k.k->p),
- time), 0);
+ lru_pos(lru_id, dev_bucket, time), 0);
int ret = bkey_err(lru_k);
if (ret)
return ret;
@@ -104,7 +105,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
" %s",
bch2_lru_types[lru_type(lru_k)],
(bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
- ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
if (ret)
goto err;
}
@@ -116,49 +117,73 @@ int bch2_lru_check_set(struct btree_trans *trans,
return ret;
}

+static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
+{
+ enum bch_lru_type type = lru_type(lru_k);
+
+ switch (type) {
+ case BCH_LRU_read:
+ case BCH_LRU_fragmentation:
+ return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
+ case BCH_LRU_stripes:
+ return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
+ default:
+ BUG();
+ }
+}
+
+static u64 bkey_lru_type_idx(struct bch_fs *c,
+ enum bch_lru_type type,
+ struct bkey_s_c k)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+
+ switch (type) {
+ case BCH_LRU_read:
+ a = bch2_alloc_to_v4(k, &a_convert);
+ return alloc_lru_idx_read(*a);
+ case BCH_LRU_fragmentation: {
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
+ u64 idx = ca
+ ? alloc_lru_idx_fragmentation(*a, ca)
+ : 0;
+ rcu_read_unlock();
+ return idx;
+ }
+ case BCH_LRU_stripes:
+ return k.k->type == KEY_TYPE_stripe
+ ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
+ : 0;
+ default:
+ BUG();
+ }
+}
+
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
- enum bch_lru_type type = lru_type(lru_k);
- struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
- u64 idx;
- int ret;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);

- if (fsck_err_on(!ca,
- trans, lru_entry_to_invalid_bucket,
- "lru key points to nonexistent device:bucket %llu:%llu",
- alloc_pos.inode, alloc_pos.offset))
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ struct bbpos bp = lru_pos_to_bp(lru_k);

- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
+ int ret = bkey_err(k);
if (ret)
goto err;

- a = bch2_alloc_to_v4(k, &a_convert);
-
- switch (type) {
- case BCH_LRU_read:
- idx = alloc_lru_idx_read(*a);
- break;
- case BCH_LRU_fragmentation:
- idx = alloc_lru_idx_fragmentation(*a, ca);
- break;
- }
+ enum bch_lru_type type = lru_type(lru_k);
+ u64 idx = bkey_lru_type_idx(c, type, k);

- if (lru_k.k->type != KEY_TYPE_set ||
- lru_pos_time(lru_k.k->p) != idx) {
+ if (lru_pos_time(lru_k.k->p) != idx) {
ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
if (ret)
goto err;
@@ -176,7 +201,6 @@ static int bch2_check_lru_key(struct btree_trans *trans,
err:
fsck_err:
bch2_trans_iter_exit(trans, &iter);
- bch2_dev_put(ca);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index f31a6cf1514c..8abd0aa2083a 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;

- if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ switch (lru_id) {
+ case BCH_LRU_BUCKET_FRAGMENTATION:
return BCH_LRU_fragmentation;
- return BCH_LRU_read;
+ case BCH_LRU_STRIPE_FRAGMENTATION:
+ return BCH_LRU_stripes;
+ default:
+ return BCH_LRU_read;
+ }
}

int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
@@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);

int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+static inline int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ return old_time != new_time
+ ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
+ : 0;
+}

struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);

int bch2_check_lrus(struct bch_fs *);

diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
index f372cb3b8cda..b7392ad8e41f 100644
--- a/fs/bcachefs/lru_format.h
+++ b/fs/bcachefs/lru_format.h
@@ -9,7 +9,8 @@ struct bch_lru {

#define BCH_LRU_TYPES() \
x(read) \
- x(fragmentation)
+ x(fragmentation) \
+ x(stripes)

enum bch_lru_type {
#define x(n) BCH_LRU_##n,
@@ -17,7 +18,8 @@ enum bch_lru_type {
#undef x
};

-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
+#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)

#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
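
The two reserved fragmentation LRUs are distinguished purely by the lru_id stored in the top 16 bits of the key's inode field; the low LRU_TIME_BITS hold the LRU time and the key's offset holds the target (dev:bucket for the alloc-backed LRUs, a stripe index for the stripe LRU). A rough decoding sketch, assuming the lru_pos()/lru_type() helpers in lru.h:

u16 lru_id = k.k->p.inode >> LRU_TIME_BITS;	/* LRU_TIME_BITS == 48 */
u64 time   = k.k->p.inode & LRU_TIME_MAX;
u64 target = k.k->p.offset;			/* dev:bucket, or stripe index */

bool is_stripe_lru = lru_id == BCH_LRU_STRIPE_FRAGMENTATION;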
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index ddc187fb693d..57ad662871ba 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -15,6 +15,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
+#include "progress.h"
#include "replicas.h"
#include "super-io.h"

@@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}

-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_usrdata_drop(struct bch_fs *c,
+ struct progress_indicator_state *progress,
+ unsigned dev_idx, int flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)

ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
+ }));
if (ret)
break;
}
@@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
return ret;
}

-static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_metadata_drop(struct bch_fs *c,
+ struct progress_indicator_state *progress,
+ unsigned dev_idx, int flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@@ -125,6 +132,8 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
+ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
+
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;

@@ -169,6 +178,11 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)

int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, dev_idx, flags);
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c,
+ BIT_ULL(BTREE_ID_extents)|
+ BIT_ULL(BTREE_ID_reflink));
+
+ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
}
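
The progress plumbing added here is two calls: initialize the indicator with a bitmask of the btrees about to be walked, then tick it once per key from inside the loop. A usage sketch under the same assumptions (signatures as used above):

struct progress_indicator_state progress;

/* size the estimate from the btrees we will walk */
bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

/* ...then, once per key inside the iteration loop: */
bch2_progress_update_iter(trans, &progress, &iter, "dropping user data");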
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 160b4374160a..66d1c055a2e3 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = {
NULL
};

-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- if (trace_move_extent_enabled()) {
+ if (trace_io_move_enabled()) {
struct printbuf buf = PRINTBUF;

bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_move_extent(c, buf.buf);
+ trace_io_move(c, buf.buf);
printbuf_exit(&buf);
}
}

-static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
+static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
- if (trace_move_extent_read_enabled()) {
+ if (trace_io_move_read_enabled()) {
struct printbuf buf = PRINTBUF;

bch2_bkey_val_to_text(&buf, c, k);
- trace_move_extent_read(c, buf.buf);
+ trace_io_move_read(c, buf.buf);
printbuf_exit(&buf);
}
}
@@ -74,11 +74,7 @@ struct moving_io {
unsigned read_sectors;
unsigned write_sectors;

- struct bch_read_bio rbio;
-
struct data_update write;
- /* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[];
};

static void move_free(struct moving_io *io)
@@ -88,43 +84,86 @@ static void move_free(struct moving_io *io)
if (io->b)
atomic_dec(&io->b->count);

- bch2_data_update_exit(&io->write);
-
mutex_lock(&ctxt->lock);
list_del(&io->io_list);
wake_up(&ctxt->wait);
mutex_unlock(&ctxt->lock);

+ if (!io->write.data_opts.scrub) {
+ bch2_data_update_exit(&io->write);
+ } else {
+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
+ kfree(io->write.bvecs);
+ }
kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct bch_fs *c = op->c;
struct moving_context *ctxt = io->write.ctxt;

- if (io->write.op.error)
+ if (op->error) {
+ if (trace_io_move_write_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_write_op_to_text(&buf, op);
+ prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
+ trace_io_move_write_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
+
ctxt->write_error = true;
+ }

- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_dec(&io->write.ctxt->write_ios);
+ atomic_sub(io->write_sectors, &ctxt->write_sectors);
+ atomic_dec(&ctxt->write_ios);
move_free(io);
closure_put(&ctxt->cl);
}

static void move_write(struct moving_io *io)
{
- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ struct bch_fs *c = io->write.op.c;
+ struct moving_context *ctxt = io->write.ctxt;
+ struct bch_read_bio *rbio = &io->write.rbio;
+
+ if (ctxt->stats) {
+ if (rbio->bio.bi_status)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_uncorrected);
+ else if (rbio->saw_error)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_corrected);
+ }
+
+ /*
+ * If the extent has been bitrotted, we're going to have to give it a
+ * new checksum in order to move it - but the poison bit will ensure
+ * that userspace still gets the appropriate error.
+ */
+ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
+ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+
+ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
+ nonce, &rbio->bio);
+ rbio->ret = 0;
+ }
+
+ if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
move_free(io);
return;
}

- if (trace_move_extent_write_enabled()) {
- struct bch_fs *c = io->write.op.c;
+ if (trace_io_move_write_enabled()) {
struct printbuf buf = PRINTBUF;

bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
- trace_move_extent_write(c, buf.buf);
+ trace_io_move_write(c, buf.buf);
printbuf_exit(&buf);
}

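The rechecksum in move_write() above is worth spelling out: the data is known bad, so the read cannot be verified, but move can stamp the buffer with a fresh checksum computed under the extent's existing nonce so the write path will accept it, while the poison flag keeps userspace reads erroring. The core, with names as in the patch:

/* re-stamp known-bad data so it can be relocated; the poison bit preserves the error */
struct nonce nonce = extent_nonce(rbio->version, rbio->pick.crc);

rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
					nonce, &rbio->bio);
rbio->ret = 0;
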
@@ -132,7 +171,7 @@ static void move_write(struct moving_io *io)
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);

- bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+ bch2_data_update_read_done(&io->write);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
@@ -145,7 +184,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx

static void move_read_endio(struct bio *bio)
{
- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
struct moving_context *ctxt = io->write.ctxt;

atomic_sub(io->read_sectors, &ctxt->read_sectors);
@@ -258,14 +297,10 @@ int bch2_move_extent(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct moving_io *io;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;

- trace_move_extent2(c, k, &io_opts, &data_opts);
+ trace_io_move2(c, k, &io_opts, &data_opts);
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);

if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
@@ -273,7 +308,8 @@ int bch2_move_extent(struct moving_context *ctxt,
bch2_data_update_opts_normalize(k, &data_opts);

if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas) {
+ !data_opts.extra_replicas &&
+ !data_opts.scrub) {
if (data_opts.kill_ptrs)
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
@@ -285,13 +321,7 @@ int bch2_move_extent(struct moving_context *ctxt,
*/
bch2_trans_unlock(trans);

- /* write path might have to decompress data: */
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
-
- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
- io = kzalloc(sizeof(struct moving_io) +
- sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
if (!io)
goto err;

@@ -300,31 +330,27 @@ int bch2_move_extent(struct moving_context *ctxt,
io->read_sectors = k.k->size;
io->write_sectors = k.k->size;

- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- io->write.op.wbio.bio.bi_ioprio =
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-
- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
- GFP_KERNEL))
- goto err_free;
+ if (!data_opts.scrub) {
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ &io_opts, data_opts, iter->btree_id, k);
+ if (ret)
+ goto err_free;

- io->rbio.c = c;
- io->rbio.opts = io_opts;
- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- io->rbio.bio.bi_vcnt = pages;
- io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- io->rbio.bio.bi_iter.bi_size = sectors << 9;
+ io->write.op.end_io = move_write_done;
+ } else {
+ bch2_bkey_buf_init(&io->write.k);
+ bch2_bkey_buf_reassemble(&io->write.k, c, k);

- io->rbio.bio.bi_opf = REQ_OP_READ;
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
- io->rbio.bio.bi_end_io = move_read_endio;
+ io->write.op.c = c;
+ io->write.data_opts = data_opts;

- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free_pages;
+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
+ if (ret)
+ goto err_free;
+ }

- io->write.op.end_io = move_write_done;
+ io->write.rbio.bio.bi_end_io = move_read_endio;
+ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);

if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
@@ -339,9 +365,7 @@ int bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&io->b->count);
}

- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
- trace_move_extent_read2(c, k);
+ trace_io_move_read2(c, k);

mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
@@ -356,33 +380,33 @@ int bch2_move_extent(struct moving_context *ctxt,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(trans, &io->rbio,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- BCH_READ_NODECODE|
- BCH_READ_LAST_FRAGMENT);
+ __bch2_read_extent(trans, &io->write.rbio,
+ io->write.rbio.bio.bi_iter,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
+ NULL,
+ BCH_READ_last_fragment,
+ data_opts.scrub ? data_opts.read_dev : -1);
return 0;
-err_free_pages:
- bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
- if (ret == -BCH_ERR_data_update_done)
+ if (bch2_err_matches(ret, BCH_ERR_data_update_done))
return 0;

if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;

- count_event(c, move_extent_start_fail);
+ count_event(c, io_move_start_fail);

- if (trace_move_extent_start_fail_enabled()) {
+ if (trace_io_move_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;

bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, ": ");
prt_str(&buf, bch2_err_str(ret));
- trace_move_extent_start_fail(c, buf.buf);
+ trace_io_move_start_fail(c, buf.buf);
printbuf_exit(&buf);
}
return ret;
@@ -518,6 +542,37 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
return 0;
}

+/*
+ * Move requires non extents iterators, and there's also no need for it to
+ * signal indirect_extent_missing_error:
+ */
+static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_reflink_p p)
+{
+ if (unlikely(REFLINK_P_ERROR(p.v)))
+ return bkey_s_c_null;
+
+ struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
+
+ bch2_trans_iter_init(trans, iter,
+ BTREE_ID_reflink, reflink_pos,
+ BTREE_ITER_not_extents);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(iter);
+ if (!k.k || bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
+ bch2_trans_iter_exit(trans, iter);
+ return bkey_s_c_null;
+ }
+
+ return k;
+}
+
static int bch2_move_data_btree(struct moving_context *ctxt,
struct bpos start,
struct bpos end,
@@ -551,6 +606,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_prefetch|
+ BTREE_ITER_not_extents|
BTREE_ITER_all_snapshots);

if (ctxt->rate)
@@ -581,17 +637,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);

bch2_trans_iter_exit(trans, &reflink_iter);
- k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
+ k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;

- if (bkey_deleted(k.k))
+ if (!k.k)
goto next_nondata;

/*
@@ -627,7 +682,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;

- if (ret2 == -ENOMEM) {
+ if (bch2_err_matches(ret2, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
continue;
@@ -689,21 +744,22 @@ int bch2_move_data(struct bch_fs *c,
bool wait_on_copygc,
move_pred_fn pred, void *arg)
{
-
struct moving_context ctxt;
- int ret;

bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+ int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);

return ret;
}

-int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts _data_opts)
+static int __bch2_move_data_phys(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ unsigned dev,
+ u64 bucket_start,
+ u64 bucket_end,
+ unsigned data_types,
+ move_pred_fn pred, void *arg)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -712,16 +768,19 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
struct bkey_s_c k;
- struct data_update_opts data_opts;
- unsigned sectors_moved = 0;
struct bkey_buf last_flushed;
int ret = 0;

- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ struct bch_dev *ca = bch2_dev_tryget(c, dev);
if (!ca)
return 0;

- trace_bucket_evacuate(c, &bucket);
+ bucket_end = min(bucket_end, ca->mi.nbuckets);
+
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
+ bch2_dev_put(ca);
+ ca = NULL;

bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -732,8 +791,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
*/
bch2_trans_begin(trans);

- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket), 0);
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);

bch_err_msg(c, ret, "looking up alloc key");
if (ret)
@@ -757,7 +815,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (ret)
goto err;

- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
+ if (!k.k || bkey_gt(k.k->p, bp_end))
break;

if (k.k->type != KEY_TYPE_backpointer)
@@ -765,107 +823,148 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,

struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);

- if (!bp.v->level) {
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!k.k)
- goto next;
+ if (ctxt->stats)
+ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ if (!(data_types & BIT(bp.v->data_type)))
+ goto next;
+
+ if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
+ goto next;

- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;

+ if (!bp.v->level) {
ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
}
+ }

- data_opts = _data_opts;
- data_opts.target = io_opts.background_target;
- data_opts.rewrite_ptrs = 0;
-
- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
- unsigned i = 0;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
- if (p.ptr.dev == bucket.inode) {
- if (p.ptr.cached) {
- bch2_trans_iter_exit(trans, &iter);
- goto next;
- }
- data_opts.rewrite_ptrs |= 1U << i;
- break;
- }
- i++;
- }
-
- ret = bch2_move_extent(ctxt, bucket_in_flight,
- &iter, k, io_opts, data_opts);
+ struct data_update_opts data_opts = {};
+ if (!pred(c, arg, k, &io_opts, &data_opts)) {
bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }

- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret == -ENOMEM) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
- if (ret)
- goto err;
-
- if (ctxt->stats)
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
- sectors_moved += sectors;
- } else {
- struct btree *b;
+ if (data_opts.scrub &&
+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+ bch2_trans_iter_exit(trans, &iter);
+ ret = -BCH_ERR_device_offline;
+ break;
+ }

- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- goto next;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!b)
- goto next;
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);

- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ /* move_extent will drop locks */
+ unsigned sectors = bp.v->bucket_len;

- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
- bch2_trans_iter_exit(trans, &iter);
+ if (!bp.v->level)
+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+ else if (!data_opts.scrub)
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+ else
+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);

- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
+ bch2_trans_iter_exit(trans, &iter);

- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, sectors);
- if (ctxt->stats) {
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
|
|
- atomic64_add(sectors, &ctxt->stats->sectors_moved);
|
|
- }
|
|
- sectors_moved += btree_sectors(c);
|
|
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
+ continue;
|
|
+ if (ret == -ENOMEM) {
|
|
+ /* memory allocation failure, wait for some IO to finish */
|
|
+ bch2_move_ctxt_wait_for_io(ctxt);
|
|
+ continue;
|
|
}
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (ctxt->stats)
|
|
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
|
|
next:
|
|
bch2_btree_iter_advance(&bp_iter);
|
|
}
|
|
-
|
|
- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
|
|
err:
|
|
bch2_trans_iter_exit(trans, &bp_iter);
|
|
- bch2_dev_put(ca);
|
|
bch2_bkey_buf_exit(&sk, c);
|
|
bch2_bkey_buf_exit(&last_flushed, c);
|
|
return ret;
|
|
}
|
|
|
|
+static int bch2_move_data_phys(struct bch_fs *c,
|
|
+ unsigned dev,
|
|
+ u64 start,
|
|
+ u64 end,
|
|
+ unsigned data_types,
|
|
+ struct bch_ratelimit *rate,
|
|
+ struct bch_move_stats *stats,
|
|
+ struct write_point_specifier wp,
|
|
+ bool wait_on_copygc,
|
|
+ move_pred_fn pred, void *arg)
|
|
+{
|
|
+ struct moving_context ctxt;
|
|
+
|
|
+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
|
|
+
|
|
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
|
|
+ ctxt.stats->phys = true;
|
|
+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
|
|
+
|
|
+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
|
|
+ bch2_moving_ctxt_exit(&ctxt);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct evacuate_bucket_arg {
|
|
+ struct bpos bucket;
|
|
+ int gen;
|
|
+ struct data_update_opts data_opts;
|
|
+};
|
|
+
|
|
+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_update_opts *data_opts)
|
|
+{
|
|
+ struct evacuate_bucket_arg *arg = _arg;
|
|
+
|
|
+ *data_opts = arg->data_opts;
|
|
+
|
|
+ unsigned i = 0;
|
|
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
|
|
+ if (ptr->dev == arg->bucket.inode &&
|
|
+ (arg->gen < 0 || arg->gen == ptr->gen) &&
|
|
+ !ptr->cached)
|
|
+ data_opts->rewrite_ptrs |= BIT(i);
|
|
+ i++;
|
|
+ }
|
|
+
|
|
+ return data_opts->rewrite_ptrs != 0;
|
|
+}
|
|
+
|
|
+int bch2_evacuate_bucket(struct moving_context *ctxt,
|
|
+ struct move_bucket_in_flight *bucket_in_flight,
|
|
+ struct bpos bucket, int gen,
|
|
+ struct data_update_opts data_opts)
|
|
+{
|
|
+ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
|
|
+
|
|
+ return __bch2_move_data_phys(ctxt, bucket_in_flight,
|
|
+ bucket.inode,
|
|
+ bucket.offset,
|
|
+ bucket.offset + 1,
|
|
+ ~0,
|
|
+ evacuate_bucket_pred, &arg);
|
|
+}
|
|
+
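/*
 * Editorial sketch, not part of the patch: evacuate_bucket_pred above shows
 * the move_pred_fn contract - fill in *data_opts and return true iff the key
 * should be moved.  A hypothetical predicate that rewrites every non-cached
 * pointer on one device (the drop_dev_pred name and its argument struct are
 * invented for illustration) would follow the same shape:
 */
struct drop_dev_arg { unsigned dev; };		/* hypothetical */

static bool drop_dev_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts *data_opts)
{
	struct drop_dev_arg *arg = _arg;
	unsigned i = 0;

	/* mark each pointer on the target device for rewrite */
	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
		if (ptr->dev == arg->dev && !ptr->cached)
			data_opts->rewrite_ptrs |= BIT(i);
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}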
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct btree *, struct bch_io_opts *,
struct data_update_opts *);
@@ -1007,14 +1106,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

-static bool migrate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
/*
* Ancient versions of bcachefs produced packed formats which could represent
* keys that the in memory format cannot represent; this checks for those
@@ -1104,6 +1195,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

+static bool scrub_pred(struct bch_fs *c, void *_arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bch_ioctl_data *arg = _arg;
+
+ if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == arg->migrate.dev) {
+ if (!p.crc.csum_type)
+ return false;
+ break;
+ }
+ }
+
+ data_opts->scrub = true;
+ data_opts->read_dev = arg->migrate.dev;
+ return true;
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
@@ -1118,6 +1233,22 @@ int bch2_data_job(struct bch_fs *c,
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

switch (op.op) {
+ case BCH_DATA_OP_scrub:
+ /*
+ * prevent tests from spuriously failing, make sure we see all
+ * btree nodes that need to be repaired
+ */
+ bch2_btree_interior_updates_flush(c);
+
+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+ op.scrub.data_types,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ false,
+ scrub_pred, &op) ?: ret;
+ break;
+
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -1137,14 +1268,14 @@ int bch2_data_job(struct bch_fs *c,

stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_btree(c, start, end,
- migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- migrate_pred, &op) ?: ret;
+ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
+ ~0,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
+ bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_rewrite_old_nodes:
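/*
 * Editorial sketch, not part of the patch: how a scrub request would be fed
 * through bch2_data_job().  The bch_ioctl_data fields used (op, scrub.dev,
 * scrub.data_types) are exactly the ones the BCH_DATA_OP_scrub case above
 * reads; the example_scrub_device() wrapper itself is invented.
 */
static int example_scrub_device(struct bch_fs *c, unsigned dev)
{
	struct bch_move_stats stats;
	struct bch_ioctl_data op = {
		.op			= BCH_DATA_OP_scrub,
		.scrub.dev		= dev,
		.scrub.data_types	= ~0,	/* scrub every data type */
	};

	return bch2_data_job(c, &stats, op);
}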
@@ -1176,17 +1307,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);

- prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen: ");
+ prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);

- prt_printf(out, "bytes moved: ");
+ prt_printf(out, "bytes moved:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);

- prt_printf(out, "bytes raced: ");
+ prt_printf(out, "bytes raced:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);

@@ -1195,7 +1326,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
- struct moving_io *io;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);

bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
@@ -1215,8 +1347,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
printbuf_indent_add(out, 2);

mutex_lock(&ctxt->lock);
+ struct moving_io *io;
list_for_each_entry(io, &ctxt->ios, io_list)
- bch2_write_op_to_text(out, &io->write.op);
+ bch2_data_update_inflight_to_text(out, &io->write);
mutex_unlock(&ctxt->lock);

printbuf_indent_sub(out, 4);
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index e22841ef31e4..807f779f6f76 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -3,22 +3,36 @@
#define _BCACHEFS_MOVE_TYPES_H

#include "bbpos_types.h"
+#include "bcachefs_ioctl.h"

struct bch_move_stats {
- enum bch_data_type data_type;
- struct bbpos pos;
char name[32];
+ bool phys;
+ enum bch_ioctl_data_event_ret ret;
+
+ union {
+ struct {
+ enum bch_data_type data_type;
+ struct bbpos pos;
+ };
+ struct {
+ unsigned dev;
+ u64 offset;
+ };
+ };

atomic64_t keys_moved;
atomic64_t keys_raced;
atomic64_t sectors_seen;
atomic64_t sectors_moved;
atomic64_t sectors_raced;
+ atomic64_t sectors_error_corrected;
+ atomic64_t sectors_error_uncorrected;
};

struct move_bucket_key {
struct bpos bucket;
- u8 gen;
+ unsigned gen;
};

struct move_bucket {
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 6718dc37c5a3..5126c870ce5b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -167,8 +167,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
bch2_trans_begin(trans);

ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
- lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
0, k, ({
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
int ret2 = 0;
@@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out);
+
+ rcu_read_lock();
+ struct task_struct *t = rcu_dereference(c->copygc_thread);
+ if (t)
+ get_task_struct(t);
+ rcu_read_unlock();
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
}

static int bch2_copygc_thread(void *arg)
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c
similarity index 73%
rename from fs/bcachefs/fs-common.c
rename to fs/bcachefs/namei.c
index 2c3d46ac70c6..93246ad31541 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/namei.c
@@ -4,8 +4,8 @@
#include "acl.h"
#include "btree_update.h"
#include "dirent.h"
-#include "fs-common.h"
#include "inode.h"
+#include "namei.h"
#include "subvolume.h"
#include "xattr.h"

@@ -47,6 +47,10 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;

+ /* Inherit casefold state from parent. */
+ if (S_ISDIR(mode))
+ new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded;
+
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
@@ -153,16 +157,14 @@ int bch2_create_trans(struct btree_trans *trans,
dir_u->bi_nlink++;
dir_u->bi_mtime = dir_u->bi_ctime = now;

- ret = bch2_inode_write(trans, &dir_iter, dir_u);
- if (ret)
- goto err;
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- dir_type,
- name,
- dir_target,
- &dir_offset,
- STR_HASH_must_create|BTREE_ITER_with_updates);
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
+ &dir_offset,
+ &dir_u->bi_size,
+ STR_HASH_must_create|BTREE_ITER_with_updates) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;

@@ -225,7 +227,9 @@ int bch2_link_trans(struct btree_trans *trans,

ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
- name, inum.inum, &dir_offset,
+ name, inum.inum,
+ &dir_offset,
+ &dir_u->bi_size,
STR_HASH_must_create);
if (ret)
goto err;
@@ -417,8 +421,8 @@ int bch2_rename_trans(struct btree_trans *trans,
}

ret = bch2_dirent_rename(trans,
- src_dir, &src_hash,
- dst_dir, &dst_hash,
+ src_dir, &src_hash, &src_dir_u->bi_size,
+ dst_dir, &dst_hash, &dst_dir_u->bi_size,
src_name, &src_inum, &src_offset,
dst_name, &dst_inum, &dst_offset,
mode);
@@ -560,6 +564,8 @@ int bch2_rename_trans(struct btree_trans *trans,
return ret;
}

+/* inum_to_path */
+
static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
{
bch2_printbuf_make_room(out, n);
@@ -650,3 +656,179 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb
prt_str_reversed(path, "(disconnected)");
goto out;
}
+
+/* fsck */
+
+static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
+ int ret = 0;
+
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ fsck_err_on(S_ISDIR(target->bi_mode),
+ trans, inode_dir_missing_backpointer,
+ "directory with missing backpointer\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ return __bch2_fsck_write_inode(trans, target);
+ }
+
+ if (bch2_inode_should_have_single_bp(target) &&
+ !fsck_err(trans, inode_wrong_backpointer,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf)))
+ goto err;
+
+ struct bkey_s_c_dirent bp_dirent =
+ bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
+ 0, dirent);
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (!backpointer_exists) {
+ if (fsck_err(trans, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target->bi_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target);
+ }
+ } else {
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
+ /*
+ * XXX: verify connectivity of the other dirent
+ * up to the root before removing this one
+ *
+ * Additionally, bch2_lookup would need to cope with the
+ * dirent it found being removed - or should we remove
+ * the other one, even though the inode points to it?
+ */
+ if (in_fsck) {
+ if (fsck_err(trans, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf))
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
+ } else {
+ bch2_fs_inconsistent(c,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf);
+ }
+
+ goto out;
+ } else {
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(!target->bi_nlink,
+ trans, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int __bch2_check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *dirent_iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ trans, dirent_d_type_wrong,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
+
+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h
similarity index 61%
rename from fs/bcachefs/fs-common.h
rename to fs/bcachefs/namei.h
index 2b59210bb5e8..2e6f6364767f 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/namei.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_COMMON_H
-#define _BCACHEFS_FS_COMMON_H
+#ifndef _BCACHEFS_NAMEI_H
+#define _BCACHEFS_NAMEI_H

#include "dirent.h"

@@ -44,4 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *,

int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);

-#endif /* _BCACHEFS_FS_COMMON_H */
+int __bch2_check_dirent_target(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c_dirent,
+ struct bch_inode_unpacked *, bool);
+
+static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static inline int bch2_check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *dirent_iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ if (likely(inode_points_to_dirent(target, d) &&
+ d.v->d_type == inode_d_type(target)))
+ return 0;
+
+ return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
+}
+
+#endif /* _BCACHEFS_NAMEI_H */
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 6772faf385a5..4eea51edafca 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -44,7 +44,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};

-static const char * const __bch2_csum_types[] = {
+const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -163,16 +163,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_SUBVOL] = "subvol",
};

-u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
-{
- BUG();
-}
-
-void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
-{
- BUG();
-}
-
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
@@ -223,6 +213,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
}
}

+/* dummy option, for options that aren't stored in the superblock */
+typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
+typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
+typedef u64 (*member_opt_get_fn)(const struct bch_member *);
+typedef void (*member_opt_set_fn)(struct bch_member *, u64);
+
+__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
+__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
+
+#define type_compatible_or_null(_p, _type) \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
+
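/*
 * Editorial note, not part of the patch: type_compatible_or_null() is what
 * lets a single x() table entry populate both the superblock and the member
 * accessors.  An fs option's _sb_opt accessor takes struct bch_sb, so it is
 * type-compatible with *BCH2_NO_SB_OPT and lands in .get_sb while .get_member
 * collapses to NULL; a device option's accessor takes struct bch_member and
 * the reverse happens.  Worked by hand for the discard option, whose getter
 * comes from LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, ...):
 *
 *	.get_sb     = type_compatible_or_null(BCH_MEMBER_DISCARD, *BCH2_NO_SB_OPT)
 *		    => NULL			(argument type mismatch)
 *	.get_member = type_compatible_or_null(BCH_MEMBER_DISCARD, *BCH2_NO_MEMBER_OPT)
 *		    => BCH_MEMBER_DISCARD	(types match)
 *
 * __builtin_choose_expr() selects at compile time, so the untaken branch
 * still has to parse but generates no code.
 */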
const struct bch_option bch2_opt_table[] = {
#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
@@ -239,15 +244,15 @@ const struct bch_option bch2_opt_table[] = {

#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
[Opt_##_name] = { \
- .attr = { \
- .name = #_name, \
- .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
- }, \
- .flags = _flags, \
- .hint = _hint, \
- .help = _help, \
- .get_sb = _sb_opt, \
- .set_sb = SET_##_sb_opt, \
+ .attr.name = #_name, \
+ .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
+ .flags = _flags, \
+ .hint = _hint, \
+ .help = _help, \
+ .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
+ .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
+ .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
+ .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
_type \
},

@@ -475,11 +480,18 @@ void bch2_opts_to_text(struct printbuf *out,
}
}

-int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v)
{
+ lockdep_assert_held(&c->state_lock);
+
int ret = 0;

switch (id) {
+ case Opt_state:
+ if (ca)
+ return __bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
+ break;
+
case Opt_compression:
case Opt_background_compression:
ret = bch2_check_set_has_compressed_data(c, v);
@@ -495,12 +507,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)

int bch2_opts_check_may_set(struct bch_fs *c)
{
- unsigned i;
- int ret;
-
- for (i = 0; i < bch2_opts_nr; i++) {
- ret = bch2_opt_check_may_set(c, i,
- bch2_opt_get_by_id(&c->opts, i));
+ for (unsigned i = 0; i < bch2_opts_nr; i++) {
+ int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
@@ -619,12 +627,25 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
return ret;
}

-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
{
const struct bch_option *opt = bch2_opt_table + id;
u64 v;

- v = opt->get_sb(sb);
+ if (dev_idx < 0) {
+ v = opt->get_sb(sb);
+ } else {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return 0;
+
+ struct bch_member m = bch2_sb_member_get(sb, dev_idx);
+ v = opt->get_member(&m);
+ }
+
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ --v;

if (opt->flags & OPT_SB_FIELD_ILOG2)
v = 1ULL << v;
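/*
 * Editorial sketch, not part of the patch: the decode steps above are the
 * inverses of the encode steps in __bch2_opt_set_sb() further down.  For
 * durability, which is OPT_SB_FIELD_ONE_BIAS so that an all-zeroes member
 * reads as "durability unset" rather than durability 0:
 *
 *	set:  v = 1  ->  v++  ->  SET_BCH_MEMBER_DURABILITY(m, 2)
 *	get:  BCH_MEMBER_DURABILITY(m) == 2  ->  --v  ->  v = 1
 *
 * OPT_SB_FIELD_SECTORS and OPT_SB_FIELD_ILOG2 round-trip the same way:
 * bytes are shifted down to sectors (v >>= 9) on the way in and back up on
 * the way out, and ilog2-stored fields are re-expanded with v = 1ULL << v.
 */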
@@ -641,35 +662,19 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
*/
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
- unsigned id;
-
- for (id = 0; id < bch2_opts_nr; id++) {
+ for (unsigned id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;

- if (opt->get_sb == BCH2_NO_SB_OPT)
- continue;
-
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+ if (opt->get_sb)
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
}

return 0;
}

-struct bch_dev_sb_opt_set {
- void (*set_sb)(struct bch_member *, u64);
-};
-
-static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
-#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
- BCH_DEV_OPT_SETTERS()
-#undef x
-};
-
void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
const struct bch_option *opt, u64 v)
{
- enum bch_opt_id id = opt - bch2_opt_table;
-
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;

@@ -679,24 +684,16 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
v++;

- if (opt->flags & OPT_FS) {
- if (opt->set_sb != SET_BCH2_NO_SB_OPT)
- opt->set_sb(sb, v);
- }
+ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0)
+ opt->set_sb(sb, v);

- if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
if (WARN(!bch2_member_exists(sb, dev_idx),
"tried to set device option %s on nonexistent device %i",
opt->attr.name, dev_idx))
return;

- struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
-
- const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
- if (set->set_sb)
- set->set_sb(m, v);
- else
- pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v);
}
}

diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9d397fc2a1f0..9a3102f30e11 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,6 +16,7 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
+extern const char * const __bch2_csum_types[];
extern const char * const __bch2_csum_opts[];
extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
@@ -50,10 +51,6 @@ static inline const char *bch2_d_type_str(unsigned d_type)
* apply the options from that struct that are defined.
*/

-/* dummy option, for options that aren't stored in the superblock */
-u64 BCH2_NO_SB_OPT(const struct bch_sb *);
-void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
-
/* When can be set: */
enum opt_flags {
OPT_FS = BIT(0), /* Filesystem option */
@@ -132,19 +129,24 @@ enum fsck_err_opts {
OPT_FS|OPT_FORMAT| \
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
OPT_UINT(512, 1U << 16), \
- BCH_SB_BLOCK_SIZE, 8, \
+ BCH_SB_BLOCK_SIZE, 4 << 10, \
"size", NULL) \
x(btree_node_size, u32, \
OPT_FS|OPT_FORMAT| \
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
OPT_UINT(512, 1U << 20), \
- BCH_SB_BTREE_NODE_SIZE, 512, \
+ BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
"size", "Btree node size, default 256k") \
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
+ x(write_error_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 300), \
+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
+ NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
@@ -181,6 +183,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
+ x(checksum_err_retry_nr, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, 32), \
+ BCH_SB_CSUM_ERR_RETRY_NR, 3, \
+ NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
@@ -197,7 +204,7 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \
@@ -308,11 +315,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
- x(discard, u8, \
- OPT_FS|OPT_MOUNT|OPT_DEVICE, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -493,27 +495,32 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, false, \
NULL, "Skip submit_bio() for data reads and writes, " \
"for performance testing purposes") \
- x(fs_size, u64, \
- OPT_DEVICE, \
+ x(state, u64, \
+ OPT_DEVICE|OPT_RUNTIME, \
+ OPT_STR(bch2_member_states), \
+ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
+ "state", "rw,ro,failed,spare") \
+ x(bucket_size, u32, \
+ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, 0, \
- "size", "Size of filesystem on device") \
- x(bucket, u32, \
- OPT_DEVICE, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, 0, \
+ BCH_MEMBER_BUCKET_SIZE, 0, \
"size", "Specifies the bucket size; must be greater than the btree node size")\
x(durability, u8, \
- OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
+ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
- BCH2_NO_SB_OPT, 1, \
+ BCH_MEMBER_DURABILITY, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
x(data_allowed, u8, \
OPT_DEVICE, \
OPT_BITFIELD(__bch2_data_types), \
- BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
"types", "Allowed data types for this device: journal, btree, and/or user")\
+ x(discard, u8, \
+ OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_MEMBER_DISCARD, true, \
+ NULL, "Enable discard/TRIM support") \
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -521,11 +528,6 @@ enum fsck_err_opts {
NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
" prefetched sequentially")

-#define BCH_DEV_OPT_SETTERS() \
- x(discard, BCH_MEMBER_DISCARD) \
- x(durability, BCH_MEMBER_DURABILITY) \
- x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
-
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@@ -582,8 +584,6 @@ struct printbuf;

struct bch_option {
struct attribute attr;
- u64 (*get_sb)(const struct bch_sb *);
- void (*set_sb)(struct bch_sb *, u64);
enum opt_type type;
enum opt_flags flags;
u64 min, max;
@@ -595,6 +595,12 @@ struct bch_option {
const char *hint;
const char *help;

+ u64 (*get_sb)(const struct bch_sb *);
+ void (*set_sb)(struct bch_sb *, u64);
+
+ u64 (*get_member)(const struct bch_member *);
+ void (*set_member)(struct bch_member *, u64);
+
};

extern const struct bch_option bch2_opt_table[];
@@ -603,7 +609,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);

-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);

@@ -625,7 +631,7 @@ void bch2_opts_to_text(struct printbuf *,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);

-int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
new file mode 100644
index 000000000000..bafd1c91a802
--- /dev/null
+++ b/fs/bcachefs/progress.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "disk_accounting.h"
+#include "progress.h"
+
+void bch2_progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+void bch2_progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
new file mode 100644
index 000000000000..23fb1811f943
--- /dev/null
+++ b/fs/bcachefs/progress.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_PROGRESS_H
+#define _BCACHEFS_PROGRESS_H
+
+/*
+ * Lame progress indicators
+ *
+ * We don't like to use these because they print to the dmesg console, which is
+ * spammy - we much prefer to be wired up to a userspace program (e.g. via
+ * thread_with_file) and have it print the progress indicator.
+ *
+ * But some code is old and doesn't support that, or runs in a context where
+ * that's not yet practical (mount).
+ */
+
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
+void bch2_progress_update_iter(struct btree_trans *,
+ struct progress_indicator_state *,
+ struct btree_iter *,
+ const char *);
+
+#endif /* _BCACHEFS_PROGRESS_H */
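/*
 * Editorial sketch, not part of the patch: intended usage, modelled on how a
 * btree-walking pass would drive this API.  The example_walk_extents()
 * function and its per-key body are invented; the for_each_btree_key()
 * invocation follows the same shape as the for_each_btree_key_max() call in
 * the movinggc.c hunk above.
 */
static int example_walk_extents(struct btree_trans *trans)
{
	struct progress_indicator_state progress;

	/* count nodes only in the btrees this pass will visit */
	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents));

	return for_each_btree_key(trans, iter, BTREE_ID_extents, POS_MIN,
				  BTREE_ITER_prefetch, k, ({
		/* throttled to one log line every 10 seconds */
		bch2_progress_update_iter(trans, &progress, &iter, "example pass");
		0;	/* per-key work would go here */
	}));
}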
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index d0a1f5cd5c2b..10c6a7fda54c 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -26,9 +26,8 @@

/* bch_extent_rebalance: */

-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;

bkey_extent_entry_for_each(ptrs, entry)
@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
return NULL;
}

+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
+}
+
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
struct bch_io_opts *opts,
struct bkey_s_c k,
@@ -91,17 +95,24 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
bch2_bkey_ptrs_need_move(c, opts, ptrs);
}

u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
- const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
if (!opts)
return 0;

- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
@@ -341,7 +352,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ data_opts->write_flags |= BCH_WRITE_only_specified_devs;

if (!data_opts->rewrite_ptrs) {
/*
@@ -449,7 +460,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
{
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ data_opts->write_flags |= BCH_WRITE_only_specified_devs;
return data_opts->rewrite_ptrs != 0;
}

@@ -590,8 +601,19 @@ static int bch2_rebalance_thread(void *arg)

void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
+ printbuf_tabstop_push(out, 32);
+
struct bch_fs_rebalance *r = &c->rebalance;

+ /* print pending work */
+ struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+
+ prt_printf(out, "pending work:\t");
+ prt_human_readable_u64(out, v);
+ prt_printf(out, "\n\n");
+
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -600,15 +622,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);

- prt_str(out, "io wait duration: ");
+ prt_printf(out, "io wait duration:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);

- prt_str(out, "io wait remaining: ");
+ prt_printf(out, "io wait remaining:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);

- prt_str(out, "duration waited: ");
+ prt_printf(out, "duration waited:\t");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
@@ -621,6 +643,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
break;
}
prt_newline(out);
+
+ rcu_read_lock();
+ struct task_struct *t = rcu_dereference(c->rebalance.thread);
+ if (t)
+ get_task_struct(t);
+ rcu_read_unlock();
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
+
printbuf_indent_sub(out, 2);
}

diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 71c786cdb192..266c5770c824 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -13,12 +13,12 @@
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
-#include "fs-common.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
@@ -899,7 +899,7 @@ int bch2_fs_recovery(struct bch_fs *c)
* journal sequence numbers:
*/
if (!c->sb.clean)
- journal_seq += 8;
+ journal_seq += JOURNAL_BUF_NR * 4;

if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 418557960ed6..e89b9c783285 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -24,7 +24,7 @@
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, PASS_ALWAYS) \
+ x(stripes_read, 1, 0) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
x(check_allocations, 5, PASS_FSCK) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 441e648f28b5..68172c6eba21 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
BUG_ON(missing_start < refd_start);
BUG_ON(missing_end > refd_end);

- if (fsck_err(trans, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- missing_start, missing_end)) {
+ struct bpos missing_pos = bkey_start_pos(p.k);
+ missing_pos.offset += missing_start - live_start;
+
+ prt_printf(&buf, "pointer to missing indirect extent in ");
+ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
+ if (ret)
+ goto err;
+
+ prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9);
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+
+ prt_printf(&buf, "\n missing reflink btree range %llu-%llu",
+ missing_start, missing_end);
+
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
@@ -597,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c,
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
bool reflink_p_may_update_opts_field =
- bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;

if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
index 6992e7469112..2b4b8445d418 100644
--- a/fs/bcachefs/sb-counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -5,7 +5,13 @@

/* BCH_SB_FIELD_counters */

-static const char * const bch2_counter_names[] = {
+static const u8 counters_to_stable_map[] = {
+#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+};
+
+const char * const bch2_counter_names[] = {
#define x(t, n, ...) (#t),
BCH_PERSISTENT_COUNTERS()
#undef x
@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
return 0;

return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-};
+}

static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
-};
+}

static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);

- for (unsigned i = 0; i < nr; i++)
- prt_printf(out, "%s \t%llu\n",
- i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
- le64_to_cpu(ctrs->d[i]));
-};
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr)
+ prt_printf(out, "%s \t%llu\n",
+ bch2_counter_names[i],
+ le64_to_cpu(ctrs->d[stable]));
+ }
+}

int bch2_sb_counters_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- u64 val = 0;

- for (i = 0; i < BCH_COUNTER_NR; i++)
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
c->counters_on_mount[i] = 0;

- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
- val = le64_to_cpu(ctrs->d[i]);
- percpu_u64_set(&c->counters[i], val);
- c->counters_on_mount[i] = val;
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr) {
+ u64 v = le64_to_cpu(ctrs->d[stable]);
+ percpu_u64_set(&c->counters[i], v);
+ c->counters_on_mount[i] = v;
+ }
}
+
return 0;
-};
+}

int bch2_sb_counters_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
struct bch_sb_field_counters *ret;
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);

if (nr < BCH_COUNTER_NR) {
ret = bch2_sb_field_resize(&c->disk_sb, counters,
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
-
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
if (ret) {
ctrs = ret;
nr = bch2_sb_counter_nr_entries(ctrs);
}
}

+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr)
+ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ }

- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
return 0;
}

@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = {
.validate = bch2_sb_counters_validate,
.to_text = bch2_sb_counters_to_text,
};
+
+#ifndef NO_BCACHEFS_CHARDEV
+long bch2_ioctl_query_counters(struct bch_fs *c,
+ struct bch_ioctl_query_counters __user *user_arg)
+{
+ struct bch_ioctl_query_counters arg;
+ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
+ if (ret)
+ return ret;
+
+ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
+ arg.pad)
+ return -EINVAL;
+
+ arg.nr = min(arg.nr, BCH_COUNTER_NR);
+ ret = put_user(arg.nr, &user_arg->nr);
+ if (ret)
+ return ret;
+
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+
+ if (stable < arg.nr) {
+ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
+ ? percpu_u64_get(&c->counters[i])
+ : c->counters_on_mount[i];
+
+ ret = put_user(v, &user_arg->d[stable]);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+#endif
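/*
 * Editorial note, not part of the patch: each counter now carries an explicit
 * stable id, so the in-memory declaration order (which indexes c->counters[])
 * can be resorted freely while the superblock layout keys off the stable id.
 * For example, io_read_promote is declared fourth in the reorganized
 * BCH_PERSISTENT_COUNTERS() list but keeps stable id 30, so:
 *
 *	counters_to_stable_map[BCH_COUNTER_io_read_promote] == 30
 *
 * and its on-disk slot stays ctrs->d[30] - the same slot this counter
 * occupied before the rename from read_promote.
 */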
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
index 81f8aec9fcb1..a4329ad8dd1b 100644
--- a/fs/bcachefs/sb-counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *);
void bch2_fs_counters_exit(struct bch_fs *);
int bch2_fs_counters_init(struct bch_fs *);

+extern const char * const bch2_counter_names[];
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;

+long bch2_ioctl_query_counters(struct bch_fs *,
+ struct bch_ioctl_query_counters __user *);
+
#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index fdcf598f08b1..5c4e5de79d81 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -9,10 +9,25 @@ enum counters_flags {

#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0, TYPE_SECTORS) \
+ x(io_read_inline, 80, TYPE_SECTORS) \
+ x(io_read_hole, 81, TYPE_SECTORS) \
+ x(io_read_promote, 30, TYPE_COUNTER) \
+ x(io_read_bounce, 31, TYPE_COUNTER) \
+ x(io_read_split, 33, TYPE_COUNTER) \
+ x(io_read_reuse_race, 34, TYPE_COUNTER) \
+ x(io_read_retry, 32, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
+ x(io_move_read, 35, TYPE_SECTORS) \
+ x(io_move_write, 36, TYPE_SECTORS) \
+ x(io_move_finish, 37, TYPE_SECTORS) \
+ x(io_move_fail, 38, TYPE_COUNTER) \
+ x(io_move_write_fail, 82, TYPE_COUNTER) \
+ x(io_move_start_fail, 39, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
+ x(bucket_discard_fast, 79, TYPE_COUNTER) \
x(bucket_alloc, 5, TYPE_COUNTER) \
x(bucket_alloc_fail, 6, TYPE_COUNTER) \
x(btree_cache_scan, 7, TYPE_COUNTER) \
@@ -38,16 +53,6 @@ enum counters_flags {
x(journal_reclaim_finish, 27, TYPE_COUNTER) \
x(journal_reclaim_start, 28, TYPE_COUNTER) \
x(journal_write, 29, TYPE_COUNTER) \
- x(read_promote, 30, TYPE_COUNTER) \
- x(read_bounce, 31, TYPE_COUNTER) \
- x(read_split, 33, TYPE_COUNTER) \
- x(read_retry, 32, TYPE_COUNTER) \
- x(read_reuse_race, 34, TYPE_COUNTER) \
- x(move_extent_read, 35, TYPE_SECTORS) \
- x(move_extent_write, 36, TYPE_SECTORS) \
- x(move_extent_finish, 37, TYPE_SECTORS) \
- x(move_extent_fail, 38, TYPE_COUNTER) \
- x(move_extent_start_fail, 39, TYPE_COUNTER) \
x(copygc, 40, TYPE_COUNTER) \
x(copygc_wait, 41, TYPE_COUNTER) \
x(gc_gens_end, 42, TYPE_COUNTER) \
@@ -95,6 +100,13 @@ enum bch_persistent_counters {
BCH_COUNTER_NR
};

+enum bch_persistent_counters_stable {
+#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_STABLE_NR
+};
+
struct bch_sb_field_counters {
struct bch_sb_field field;
__le64 d[];
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 051214fdc735..acb5d845841e 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -90,7 +90,13 @@
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end)
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(cached_backpointers, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(stripe_backpointers, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_ptr_to_missing_backpointer)

#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index b86ec013d7d7..1736abea9ed1 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -179,6 +179,7 @@ enum bch_fsck_flags {
x(ptr_crc_redundant, 160, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
+ x(extent_flags_not_at_start, 306, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
@@ -310,11 +311,14 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
- x(MAX, 304, 0)
+ x(dirent_cf_name_too_big, 304, 0) \
+ x(dirent_stray_data_after_cf_name, 305, 0) \
+ x(MAX, 308, 0)

enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 762083b564ee..38261638a611 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -23,7 +23,19 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
return !percpu_ref_is_zero(&ca->io_ref);
}

-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ bool ret = ca && bch2_dev_is_online(ca);
+ rcu_read_unlock();
+
+ return ret;
+}
+
|
|
+static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
|
|
{
|
|
return bch2_dev_is_online(ca) &&
|
|
ca->mi.state != BCH_MEMBER_STATE_failed;
|
|
@@ -271,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
|
|
|
|
static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
|
|
{
|
|
+ might_sleep();
|
|
+
|
|
rcu_read_lock();
|
|
struct bch_dev *ca = bch2_dev_rcu(c, dev);
|
|
if (ca && !percpu_ref_tryget(&ca->io_ref))
|
|
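
The might_sleep() added to bch2_dev_get_ioref() documents (and, with CONFIG_DEBUG_ATOMIC_SLEEP, enforces) that the function may only be called from process context, since callers typically go on to do blocking I/O under the acquired io_ref. A sketch of the intended caller pattern; do_blocking_io() is a hypothetical helper:

	struct bch_dev *ca = bch2_dev_get_ioref(c, dev_idx, READ);
	if (ca) {
		ret = do_blocking_io(ca);	/* io_ref keeps the device open */
		percpu_ref_put(&ca->io_ref);
	}
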
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index 2adf1221a440..3affec823b3f 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -79,6 +79,7 @@ struct bch_member {

#define BCH_MEMBER_V1_BYTES 56

+LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index c54091a28909..e7f197896db1 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
goto out;
}

- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
+ if (likely(ancestor >= IS_ANCESTOR_BITMAP))
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+ id = get_ancestor_below(t, id, ancestor);

ret = id && id < ancestor
? test_ancestor_bitmap(t, id, ancestor)
@@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
return 0;
}

-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
u32 id = snapshot_root;
u32 subvol = 0, s;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 00373cf32e7b..81180181d7c9 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
return id;
}

+u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32);
u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);

static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
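
The new `ancestor >= IS_ANCESTOR_BITMAP` guard in __bch2_snapshot_is_ancestor() exists because `ancestor - IS_ANCESTOR_BITMAP` is unsigned arithmetic: for small ancestor ids it wraps to a huge value, so the skiplist walk would run when it should not. A standalone demonstration of the wrap (IS_ANCESTOR_BITMAP is assumed to be 128 here, matching its role as the bitmap width):

#include <stdint.h>
#include <stdio.h>

#define IS_ANCESTOR_BITMAP 128

int main(void)
{
	uint32_t ancestor = 5;
	uint32_t bound = ancestor - IS_ANCESTOR_BITMAP;	/* wraps */
	printf("%u\n", bound);	/* prints 4294967173: almost every id < bound */
	return 0;
}
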
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index d78451c2a0c6..93e71119e5a4 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
for (unsigned i = 0; i < 1000; i++) {
unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);

if (u64s > U8_MAX)
return -EINVAL;
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 55a4ac7bf220..575ad1e03904 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -12,7 +12,6 @@
#include "super.h"

#include <linux/crc32c.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>

static inline enum bch_str_hash_type
@@ -34,6 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)

struct bch_hash_info {
u8 type;
+ struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@@ -47,17 +47,17 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
/* XXX ick */
struct bch_hash_info info = {
.type = INODE_STR_HASH(bi),
+#ifdef CONFIG_UNICODE
+ .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL,
+#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};

if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];

- desc->tfm = c->sha256;
-
- crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
+ sha256((const u8 *)&bi->bi_hash_seed,
+ sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
}
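
The str_hash.h hunk replaces an allocated crypto_shash transform with the kernel's one-shot sha2 library call, which needs no tfm and cannot fail. A sketch of the same derivation in isolation; the helper name is invented:

#include <crypto/sha2.h>
#include <linux/siphash.h>
#include <linux/string.h>

static void derive_siphash_key(u64 seed, siphash_key_t *key)
{
	u8 digest[SHA256_DIGEST_SIZE];

	sha256((const u8 *) &seed, sizeof(seed), digest);	/* one-shot */
	memcpy(key, digest, sizeof(*key));	/* keep the first 128 bits */
}
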
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index a81a7b6c0989..572b06bfa0b8 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>

-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-};
-
struct bch2_metadata_version {
u16 version;
const char *name;
@@ -69,12 +66,14 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
return v;
}

-bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
{
- bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
- version <= c->sb.version_incompat_allowed;
+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+ version <= c->sb.version_incompat_allowed)
+ ? 0
+ : -BCH_ERR_may_not_use_incompat_feature;

- if (ret) {
+ if (!ret) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
@@ -366,39 +365,41 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}

-static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
- enum bch_validate_flags flags, struct printbuf *out)
+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+ enum bch_validate_flags flags, struct printbuf *out)
{
- struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
- u16 block_size;
int ret;

ret = bch2_sb_compatible(sb, out);
if (ret)
return ret;

- if (sb->features[1] ||
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
- prt_printf(out, "Filesystem has incompatible features");
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
return -BCH_ERR_invalid_sb_features;
}

if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_printf(out, "Filesystem has incompatible version");
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
return -BCH_ERR_invalid_sb_features;
}

- block_size = le16_to_cpu(sb->block_size);
-
- if (block_size > PAGE_SECTORS) {
- prt_printf(out, "Block size too big (got %u, max %u)",
- block_size, PAGE_SECTORS);
- return -BCH_ERR_invalid_sb_block_size;
- }
-
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
@@ -409,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_uuid;
}

+ if (!(flags & BCH_VALIDATE_write) &&
+ le64_to_cpu(sb->offset) != read_offset) {
+ prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
+ le64_to_cpu(sb->offset), read_offset);
+ return -BCH_ERR_invalid_sb_offset;
+ }
+
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
prt_printf(out, "Bad number of member devices %u (max %u)",
@@ -464,6 +472,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,

if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
+ !BCH_SB_CSUM_ERR_RETRY_NR(sb))
+ SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}

#ifdef __KERNEL__
@@ -474,8 +489,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;

- if (opt->get_sb != BCH2_NO_SB_OPT) {
- u64 v = bch2_opt_from_sb(sb, opt_id);
+ if (opt->get_sb) {
+ u64 v = bch2_opt_from_sb(sb, opt_id, -1);

prt_printf(out, "Invalid option ");
ret = bch2_opt_validate(opt, v, out);
@@ -755,7 +770,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
- sb->holder = kmalloc(1, GFP_KERNEL);
+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;

@@ -881,7 +896,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,

sb->have_layout = true;

- ret = bch2_sb_validate(sb, 0, &err);
+ ret = bch2_sb_validate(sb->sb, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -918,16 +933,16 @@ static void write_super_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;

+ bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
+
/* XXX: return errors directly */

- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "superblock %s error: %s",
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "superblock %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
ca->sb_write_error = 1;
+ }

closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
@@ -1038,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) {
printbuf_reset(&err);

- ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
@@ -1166,7 +1181,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
- ret = -1;
+ ret = -BCH_ERR_erofs_sb_err;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
@@ -1223,12 +1238,11 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);

c->disk_sb.sb->version = cpu_to_le16(new_version);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);

if (incompat) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
}
}

@@ -1459,8 +1473,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
for (id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;

- if (opt->get_sb != BCH2_NO_SB_OPT) {
- u64 v = bch2_opt_from_sb(sb, id);
+ if (opt->get_sb) {
+ u64 v = bch2_opt_from_sb(sb, id, -1);

prt_printf(out, "%s:\t", opt->attr.name);
bch2_opt_to_text(out, NULL, sb, opt, v,
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index b4cff9ebdebb..78f708a6fbcd 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);

-bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);

-static inline bool bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
+static inline int bch2_request_incompat_feature(struct bch_fs *c,
+ enum bcachefs_metadata_version version)
{
return likely(version <= c->sb.version_incompat)
- ? true
+ ? 0
: bch2_set_version_incompat(c, version);
}

@@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);

+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
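
With bch2_set_version_incompat() now returning an errcode instead of a bool, callers can propagate a precise error rather than inventing one. The expected caller pattern looks roughly like this; the version constant is just an example:

	int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
	if (ret)
		return ret;	/* -BCH_ERR_may_not_use_incompat_feature bubbles up */
	/* safe to emit the newer on-disk format from here on */
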
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0459c875e189..99f9a0aaa380 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -75,9 +75,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: crc32c");
-MODULE_SOFTDEP("pre: crc64");
-MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
@@ -718,7 +715,7 @@ static int bch2_fs_online(struct bch_fs *c)
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
- bch2_opts_create_sysfs_files(&c->opts_dir);
+ bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
if (ret) {
bch_err(c, "error creating sysfs objects");
return ret;
@@ -837,6 +834,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;

+#ifdef CONFIG_UNICODE
+ /* Default encoding until we can potentially have more as an option. */
+ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
+ if (IS_ERR(c->cf_encoding)) {
+ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+ ret = -EINVAL;
+ goto err;
+ }
+#else
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
+ printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
+ ret = -EINVAL;
+ goto err;
+ }
+#endif
+
pr_uuid(&name, c->sb.user_uuid.b);
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
@@ -1056,6 +1072,7 @@ int bch2_fs_start(struct bch_fs *c)
}

set_bit(BCH_FS_started, &c->flags);
+ wake_up(&c->ro_ref_wait);

if (c->opts.read_only) {
bch2_fs_read_only(c);
@@ -1280,8 +1297,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
return 0;

if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &c->kobj,
- "dev-%u", ca->dev_idx);
+ ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
+ bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
if (ret)
return ret;
}
@@ -1412,6 +1429,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));

+ /*
+ * Stash pointer to the filesystem for blk_holder_ops - note that once
+ * attached to a filesystem, we will always close the block device
+ * before tearing down the filesystem object.
+ */
+ ca->disk_sb.holder->c = ca->fs;
+
ca->dev = ca->disk_sb.bdev->bd_dev;

percpu_ref_reinit(&ca->io_ref);
@@ -1966,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);

if (ca->mi.freespace_initialized) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = BCH_DATA_free,
- };
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };

ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = BCH_DATA_free)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
goto err;
@@ -1998,6 +2019,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}

+/* blk_holder_ops: */
+
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
+{
+ struct bch_sb_handle_holder *holder = bdev->bd_holder;
+ struct bch_fs *c = holder->c;
+
+ if (c && !bch2_ro_ref_tryget(c))
+ c = NULL;
+
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ if (c)
+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
+ return c;
+}
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
+{
+ for_each_member_device(c, ca)
+ if (ca->disk_sb.bdev == bdev)
+ return ca;
+ return NULL;
+}
+
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ }
+
+ down_write(&c->state_lock);
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto unlock;
+
+ if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
+ __bch2_dev_offline(c, ca);
+ } else {
+ if (sb) {
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ evict_inodes(sb);
+ }
+
+ bch2_journal_flush(&c->journal);
+ bch2_fs_emergency_read_only(c);
+ }
+
+ bch2_dev_put(ca);
+unlock:
+ if (sb)
+ up_read(&sb->s_umount);
+ up_write(&c->state_lock);
+ bch2_ro_ref_put(c);
+}
+
+static void bch2_fs_bdev_sync(struct block_device *bdev)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ sync_filesystem(sb);
+ up_read(&sb->s_umount);
+ }
+
+ bch2_ro_ref_put(c);
+}
+
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+ .mark_dead = bch2_fs_bdev_mark_dead,
+ .sync = bch2_fs_bdev_sync,
+};
+
/* Filesystem open: */

static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
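
The holder object handed to the block layer at open time is what .mark_dead and .sync get back via bdev->bd_holder. A sketch of how the ops are wired up when the device is opened; bdev_file_open_by_path() is the block-layer API, the surrounding variables are assumed:

	sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode,
				sb->holder,			/* becomes bdev->bd_holder */
				&bch2_sb_handle_bdev_ops);	/* .mark_dead / .sync */
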
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 04f8287eff5c..23533bce5709 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);

+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
+
#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 368a63d938cf..3a899f799d1d 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H

+struct bch_fs;
+
+struct bch_sb_handle_holder {
+ struct bch_fs *c;
+};
+
struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
- void *holder;
+ struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index a7eb1f511484..8c200b558c69 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -146,15 +146,15 @@ write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
+write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
+__sysfs_attribute(read_fua_test, 0400);

read_attribute(uuid);
read_attribute(minor);
read_attribute(flags);
-read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
-rw_attribute(durability);
read_attribute(io_done);
read_attribute(io_errors);
write_attribute(io_errors_reset);
@@ -173,10 +173,8 @@ read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_reserve_cache);
-read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
-read_attribute(write_points);
read_attribute(nocow_lock_table);

#ifdef BCH_WRITE_REF_DEBUG
@@ -209,8 +207,6 @@ read_attribute(usage_base);
BCH_PERSISTENT_COUNTERS()
#undef x

-rw_attribute(discard);
-read_attribute(state);
rw_attribute(label);

read_attribute(copy_gc_wait);
@@ -315,6 +311,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
}

+static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bio *bio = NULL;
+ void *buf = NULL;
+ unsigned bs = c->opts.block_size, iters;
+ u64 end, test_duration = NSEC_PER_SEC * 2;
+ struct bch2_time_stats stats_nofua, stats_fua, stats_random;
+ int ret = 0;
+
+ bch2_time_stats_init_no_pcpu(&stats_nofua);
+ bch2_time_stats_init_no_pcpu(&stats_fua);
+ bch2_time_stats_init_no_pcpu(&stats_random);
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, READ)) {
+ prt_str(out, "offline\n");
+ return 0;
+ }
+
+ struct block_device *bdev = ca->disk_sb.bdev;
+
+ bio = bio_kmalloc(1, GFP_KERNEL);
+ if (!bio) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ buf = kmalloc(bs, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ end = ktime_get_ns() + test_duration;
+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
+ bch2_bio_map(bio, buf, bs);
+
+ u64 submit_time = ktime_get_ns();
+ ret = submit_bio_wait(bio);
+ bch2_time_stats_update(&stats_nofua, submit_time);
+
+ if (ret)
+ goto err;
+ }
+
+ end = ktime_get_ns() + test_duration;
+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
+ bch2_bio_map(bio, buf, bs);
+
+ u64 submit_time = ktime_get_ns();
+ ret = submit_bio_wait(bio);
+ bch2_time_stats_update(&stats_fua, submit_time);
+
+ if (ret)
+ goto err;
+ }
+
+ u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
+
+ end = ktime_get_ns() + test_duration;
+ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
+ bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
+ bch2_bio_map(bio, buf, bs);
+
+ u64 submit_time = ktime_get_ns();
+ ret = submit_bio_wait(bio);
+ bch2_time_stats_update(&stats_random, submit_time);
+
+ if (ret)
+ goto err;
+ }
+
+ u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats);
+ u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats);
+ u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats);
+
+ u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats);
+ u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats);
+ u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats);
+
+ printbuf_tabstop_push(out, 8);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 12);
+ prt_printf(out, "This test must be run on an idle drive for accurate results\n");
+ prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device));
+ prt_printf(out, "fua support advertized: %s\n", bdev_fua(bdev) ? "yes" : "no");
+ prt_newline(out);
+ prt_printf(out, "ns:\tlatency\rstddev\r\n");
+ prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua);
+ prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua);
+ prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand);
+
+ bool read_cache = ns_nofua * 2 < ns_rand;
+ bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;
+
+ if (!read_cache)
+ prt_str(out, "reads don't appear to be cached - safe\n");
+ else if (!fua_cached)
+ prt_str(out, "fua reads don't appear to be cached - safe\n");
+ else
+ prt_str(out, "fua reads appear to be cached - unsafe\n");
+err:
+ kfree(buf);
+ kfree(bio);
+ percpu_ref_put(&ca->io_ref);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -355,18 +461,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_reserve_cache)
bch2_btree_reserve_cache_to_text(out, c);

- if (attr == &sysfs_stripes_heap)
- bch2_stripes_heap_to_text(out, c);
-
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, NULL);

if (attr == &sysfs_open_buckets_partial)
bch2_open_buckets_partial_to_text(out, c);

- if (attr == &sysfs_write_points)
- bch2_write_points_to_text(out, c);
-
if (attr == &sysfs_compression_stats)
bch2_compression_stats_to_text(out, c);

@@ -415,6 +515,9 @@ STORE(bch2_fs)

/* Debugging: */

+ if (attr == &sysfs_trigger_btree_updates)
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
+
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;

@@ -566,10 +669,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_reserve_cache,
&sysfs_new_stripes,
- &sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
- &sysfs_write_points,
#ifdef BCH_WRITE_REF_DEBUG
&sysfs_write_refs,
#endif
@@ -585,6 +686,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup,
+ &sysfs_trigger_btree_updates,

&sysfs_gc_gens_pos,

@@ -604,26 +706,34 @@ struct attribute *bch2_fs_internal_files[] = {

/* options */

-SHOW(bch2_fs_opts_dir)
+static ssize_t sysfs_opt_show(struct bch_fs *c,
+ struct bch_dev *ca,
+ enum bch_opt_id id,
+ struct printbuf *out)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int id = opt - bch2_opt_table;
- u64 v = bch2_opt_get_by_id(&c->opts, id);
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ if (opt->flags & OPT_FS) {
+ v = bch2_opt_get_by_id(&c->opts, id);
+ } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
+ v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
+ } else {
+ return -EINVAL;
+ }

bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
prt_char(out, '\n');
-
return 0;
}

-STORE(bch2_fs_opts_dir)
+static ssize_t sysfs_opt_store(struct bch_fs *c,
+ struct bch_dev *ca,
+ enum bch_opt_id id,
+ const char *buf, size_t size)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int ret, id = opt - bch2_opt_table;
- char *tmp;
- u64 v;
+ const struct bch_option *opt = bch2_opt_table + id;
+ int ret = 0;

/*
* We don't need to take c->writes for correctness, but it eliminates an
@@ -632,27 +742,28 @@ STORE(bch2_fs_opts_dir)
if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
return -EROFS;

- tmp = kstrdup(buf, GFP_KERNEL);
+ down_write(&c->state_lock);
+
+ char *tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp) {
ret = -ENOMEM;
goto err;
}

- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
+ u64 v;
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
+ bch2_opt_check_may_set(c, ca, id, v);
kfree(tmp);

if (ret < 0)
goto err;

- ret = bch2_opt_check_may_set(c, id, v);
- if (ret < 0)
- goto err;
-
- bch2_opt_set_sb(c, NULL, opt, v);
+ bch2_opt_set_sb(c, ca, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);

if (v &&
(id == Opt_background_target ||
+ (id == Opt_foreground_target && !c->opts.background_target) ||
id == Opt_background_compression ||
(id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
@@ -664,27 +775,56 @@ STORE(bch2_fs_opts_dir)
c->copygc_thread)
wake_up_process(c->copygc_thread);

+ if (id == Opt_discard && !ca) {
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca)
+ opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
ret = size;
err:
+ up_write(&c->state_lock);
bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return ret;
}
+
+SHOW(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ int id = bch2_opt_lookup(attr->name);
+ if (id < 0)
+ return 0;
+
+ return sysfs_opt_show(c, NULL, id, out);
+}
+
+STORE(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ int id = bch2_opt_lookup(attr->name);
+ if (id < 0)
+ return 0;
+
+ return sysfs_opt_store(c, NULL, id, buf, size);
+}
SYSFS_OPS(bch2_fs_opts_dir);

struct attribute *bch2_fs_opts_dir_files[] = { NULL };

-int bch2_opts_create_sysfs_files(struct kobject *kobj)
+int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
{
- const struct bch_option *i;
- int ret;
-
- for (i = bch2_opt_table;
+ for (const struct bch_option *i = bch2_opt_table;
i < bch2_opt_table + bch2_opts_nr;
i++) {
- if (!(i->flags & OPT_FS))
+ if (i->flags & OPT_HIDDEN)
+ continue;
+ if (!(i->flags & type))
continue;

- ret = sysfs_create_file(kobj, &i->attr);
+ int ret = sysfs_create_file(kobj, &i->attr);
if (ret)
return ret;
}
@@ -755,11 +895,8 @@ SHOW(bch2_dev)

sysfs_printf(uuid, "%pU\n", ca->uuid.b);

- sysfs_print(bucket_size, bucket_bytes(ca));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
- sysfs_print(durability, ca->mi.durability);
- sysfs_print(discard, ca->mi.discard);

if (attr == &sysfs_label) {
if (ca->mi.group)
@@ -772,11 +909,6 @@ SHOW(bch2_dev)
prt_char(out, '\n');
}

- if (attr == &sysfs_state) {
- prt_string_option(out, bch2_member_states, ca->mi.state);
- prt_char(out, '\n');
- }
-
if (attr == &sysfs_io_done)
dev_io_done_to_text(out, ca);

@@ -802,6 +934,13 @@ SHOW(bch2_dev)
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, ca);

+ if (attr == &sysfs_read_fua_test)
+ return bch2_read_fua_test(out, ca);
+
+ int opt_id = bch2_opt_lookup(attr->name);
+ if (opt_id >= 0)
+ return sysfs_opt_show(c, ca, opt_id, out);
+
return 0;
}

@@ -810,18 +949,6 @@ STORE(bch2_dev)
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;

- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
- }
-
- if (attr == &sysfs_durability) {
- u64 v = strtoul_or_return(buf);
-
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
- }
-
if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -839,20 +966,20 @@ STORE(bch2_dev)
if (attr == &sysfs_io_errors_reset)
bch2_dev_errors_reset(ca);

+ int opt_id = bch2_opt_lookup(attr->name);
+ if (opt_id >= 0)
+ return sysfs_opt_store(c, ca, opt_id, buf, size);
+
return size;
}
SYSFS_OPS(bch2_dev);

struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
- &sysfs_bucket_size,
&sysfs_first_bucket,
&sysfs_nbuckets,
- &sysfs_durability,

/* settings: */
- &sysfs_discard,
- &sysfs_state,
&sysfs_label,

&sysfs_has_data,
@@ -866,6 +993,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,

+ &sysfs_read_fua_test,
+
/* debug: */
&sysfs_alloc_debug,
&sysfs_open_buckets,
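
The pass/fail verdict at the end of bch2_read_fua_test() is a latency heuristic: if repeated reads of one block are much faster than random reads, the drive is serving them from cache; if FUA reads then land closer to the cached latency than to media latency, FUA is (unsafely) being served from cache too. The classification in isolation, with illustrative numbers:

	bool read_cache = ns_nofua * 2 < ns_rand;
	bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;

	/* e.g. nofua = 20us, rand = 200us:
	 *   fua =  30us -> fua_cached  (30 < 110)   -> "unsafe"
	 *   fua = 210us -> !fua_cached (210 >= 110) -> "safe"
	 */
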
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
index 222cd5062702..303e0433c702 100644
--- a/fs/bcachefs/sysfs.h
+++ b/fs/bcachefs/sysfs.h
@@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
extern const struct sysfs_ops bch2_dev_sysfs_ops;

-int bch2_opts_create_sysfs_files(struct kobject *);
+int bch2_opts_create_sysfs_files(struct kobject *, unsigned);

#else

@@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
static const struct sysfs_ops bch2_dev_sysfs_ops;

-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
+{ return 0; }

#endif /* NO_BCACHEFS_SYSFS */

diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
index 3fe82757f93a..a8382d876835 100644
--- a/fs/bcachefs/time_stats.c
+++ b/fs/bcachefs/time_stats.c
@@ -10,6 +10,9 @@
#include "eytzinger.h"
#include "time_stats.h"

+/* disable automatic switching to percpu mode */
+#define TIME_STATS_NONPCPU ((struct time_stat_buffer *) 1)
+
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
@@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;

- if (!stats->buffer) {
+ if ((unsigned long) stats->buffer <= 1) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);

- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ if (!stats->buffer &&
+ mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
@@ -157,7 +161,8 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
unsigned offset = offsetof(struct bch2_time_stats, min_duration);
memset((void *) stats + offset, 0, sizeof(*stats) - offset);

- if (stats->buffer) {
+ if (stats->buffer &&
+ stats->buffer != TIME_STATS_NONPCPU) {
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(stats->buffer, cpu)->nr = 0;
@@ -167,7 +172,10 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)

void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
- free_percpu(stats->buffer);
+ if (stats->buffer != TIME_STATS_NONPCPU) {
+ free_percpu(stats->buffer);
+ stats->buffer = NULL;
+ }
}

void bch2_time_stats_init(struct bch2_time_stats *stats)
@@ -177,3 +185,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
+
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
+{
+ bch2_time_stats_init(stats);
+ stats->buffer = TIME_STATS_NONPCPU;
+}
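
TIME_STATS_NONPCPU makes stats->buffer a three-state field: NULL (percpu upgrade still allowed), a real percpu pointer, or the address 1 (upgrade permanently disabled). Address 1 can never be a valid allocation, and `(unsigned long) buffer <= 1` covers both non-percpu states in one comparison. The idiom in standalone form:

#include <stdbool.h>
#include <stdint.h>

#define NONPCPU_SENTINEL ((void *) 1)	/* never a real allocation */

struct stats { void *buffer; };		/* NULL | NONPCPU_SENTINEL | percpu ptr */

static bool has_percpu_buffer(const struct stats *s)
{
	return (uintptr_t) s->buffer > 1;
}

static bool may_switch_to_percpu(const struct stats *s)
{
	return s->buffer == NULL;	/* the sentinel blocks the upgrade */
}
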
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
index dc6493f7bbab..eddb0985bab4 100644
--- a/fs/bcachefs/time_stats.h
+++ b/fs/bcachefs/time_stats.h
@@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);

static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
{
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index c1b51009edf6..8c07189a080a 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -295,12 +295,12 @@ TRACE_EVENT(write_super,

/* io.c: */

-DEFINE_EVENT(bio, read_promote,
+DEFINE_EVENT(bio, io_read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);

-TRACE_EVENT(read_nopromote,
+TRACE_EVENT(io_read_nopromote,
TP_PROTO(struct bch_fs *c, int ret),
TP_ARGS(c, ret),

@@ -319,26 +319,55 @@ TRACE_EVENT(read_nopromote,
__entry->ret)
);

-DEFINE_EVENT(bio, read_bounce,
+DEFINE_EVENT(bio, io_read_bounce,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);

-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, io_read_split,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);

-DEFINE_EVENT(bio, read_retry,
+DEFINE_EVENT(bio, io_read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);

-DEFINE_EVENT(bio, read_reuse_race,
+DEFINE_EVENT(bio, io_read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);

+DEFINE_EVENT(bio, io_read_fail_and_poison,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+/* ec.c */
+
+TRACE_EVENT(stripe_create,
+ TP_PROTO(struct bch_fs *c, u64 idx, int ret),
+ TP_ARGS(c, idx, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, idx )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->idx = idx;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d idx %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->idx,
+ __entry->ret)
+);
+
/* Journal */

DEFINE_EVENT(bch_fs, journal_full,
@@ -797,53 +826,37 @@ TRACE_EVENT(bucket_invalidate,

/* Moving IO */

-TRACE_EVENT(bucket_evacuate,
- TP_PROTO(struct bch_fs *c, struct bpos *bucket),
- TP_ARGS(c, bucket),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, dev_idx )
- __field(u64, bucket )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->dev_idx = bucket->inode;
- __entry->bucket = bucket->offset;
- ),
-
- TP_printk("%d:%d %u:%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->dev_idx, __entry->bucket)
+DEFINE_EVENT(fs_str, io_move,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);

-DEFINE_EVENT(fs_str, move_extent,
+DEFINE_EVENT(fs_str, io_move_read,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);

-DEFINE_EVENT(fs_str, move_extent_read,
+DEFINE_EVENT(fs_str, io_move_write,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);

-DEFINE_EVENT(fs_str, move_extent_write,
+DEFINE_EVENT(fs_str, io_move_finish,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);

-DEFINE_EVENT(fs_str, move_extent_finish,
+DEFINE_EVENT(fs_str, io_move_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);

-DEFINE_EVENT(fs_str, move_extent_fail,
+DEFINE_EVENT(fs_str, io_move_write_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);

-DEFINE_EVENT(fs_str, move_extent_start_fail,
+DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@@ -881,37 +894,6 @@ TRACE_EVENT(move_data,
__entry->sectors_raced)
);

-TRACE_EVENT(evacuate_bucket,
- TP_PROTO(struct bch_fs *c, struct bpos *bucket,
- unsigned sectors, unsigned bucket_size,
- int ret),
- TP_ARGS(c, bucket, sectors, bucket_size, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, member )
- __field(u64, bucket )
- __field(u32, sectors )
- __field(u32, bucket_size )
- __field(int, ret )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->member = bucket->inode;
- __entry->bucket = bucket->offset;
- __entry->sectors = sectors;
- __entry->bucket_size = bucket_size;
- __entry->ret = ret;
- ),
-
- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->member, __entry->bucket,
- __entry->sectors, __entry->bucket_size,
- __entry->ret)
-);
-
TRACE_EVENT(copygc,
TP_PROTO(struct bch_fs *c,
u64 buckets,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index da2cd11b3025..553de8d8e3e5 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
u64 last_q = 0;

prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+ eytzinger0_for_each(j, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;

- u64 q = max(quantiles->entries[i].m, last_q);
+ u64 q = max(quantiles->entries[j].m, last_q);
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
@@ -704,12 +704,33 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}

+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
+
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned u64s = bv.bv_len / sizeof(u64);
+
+ if (offset < u64s) {
+ u64 *segment = bvec_kmap_local(&bv);
+ segment[offset] = get_random_u64();
+ kunmap_local(segment);
+ return;
+ }
+ offset -= u64s;
+ }
+}
+#endif
+
#if 0
void eytzinger1_test(void)
{
- unsigned inorder, eytz, size;
+ unsigned inorder, size;

- pr_info("1 based eytzinger test:");
+ pr_info("1 based eytzinger test:\n");

for (size = 2;
size < 65536;
@@ -717,13 +738,7 @@ void eytzinger1_test(void)
unsigned extra = eytzinger1_extra(size);

if (!(size % 4096))
- pr_info("tree size %u", size);
-
- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
-
- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
+ pr_info("tree size %u\n", size);

inorder = 1;
eytzinger1_for_each(eytz, size) {
@@ -734,15 +749,16 @@ void eytzinger1_test(void)

inorder++;
}
+ BUG_ON(inorder - 1 != size);
}
}

void eytzinger0_test(void)
{

- unsigned inorder, eytz, size;
+ unsigned inorder, size;

- pr_info("0 based eytzinger test:");
+ pr_info("0 based eytzinger test:\n");

for (size = 1;
size < 65536;
@@ -750,13 +766,7 @@ void eytzinger0_test(void)
unsigned extra = eytzinger0_extra(size);

if (!(size % 4096))
- pr_info("tree size %u", size);
-
- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
-
- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
+ pr_info("tree size %u\n", size);

inorder = 0;
eytzinger0_for_each(eytz, size) {
@@ -767,54 +777,191 @@ void eytzinger0_test(void)

inorder++;
}
+ BUG_ON(inorder != size);
+
+ inorder = size - 1;
+ eytzinger0_for_each_prev(eytz, size) {
+ BUG_ON(eytz != eytzinger0_first(size) &&
+ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
+
+ inorder--;
+ }
+ BUG_ON(inorder != -1);
}
}

-static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+static inline int cmp_u16(const void *_l, const void *_r)
{
const u16 *l = _l, *r = _r;

- return (*l > *r) - (*r - *l);
+ return (*l > *r) - (*r > *l);
}

-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
{
- int i, c1 = -1, c2 = -1;
- ssize_t r;
+ int r, s;
+ bool bad;

r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
- if (r >= 0)
- c1 = test_array[r];
-
- for (i = 0; i < nr; i++)
- if (test_array[i] <= search && test_array[i] > c2)
- c2 = test_array[i];
-
- if (c1 != c2) {
- eytzinger0_for_each(i, nr)
- pr_info("[%3u] = %12u", i, test_array[i]);
- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
- i, r, c1, c2);
+ if (r >= 0) {
+ if (test_array[r] > search) {
+ bad = true;
+ } else {
+ s = eytzinger0_next(r, nr);
+ bad = s >= 0 && test_array[s] <= search;
+ }
+ } else {
+ s = eytzinger0_last(nr);
+ bad = s >= 0 && test_array[s] <= search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each_prev(j, nr) {
+ if (test_array[j] <= search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_le(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
}
}

+static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
+{
+ int r, s;
+ bool bad;
+
+ r = eytzinger0_find_gt(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0) {
+ if (test_array[r] <= search) {
+ bad = true;
+ } else {
+ s = eytzinger0_prev(r, nr);
+ bad = s >= 0 && test_array[s] > search;
+ }
+ } else {
+ s = eytzinger0_first(nr);
+ bad = s >= 0 && test_array[s] > search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each(j, nr) {
+ if (test_array[j] > search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_gt(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
+{
+ int r, s;
+ bool bad;
+
+ r = eytzinger0_find_ge(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0) {
+ if (test_array[r] < search) {
+ bad = true;
+ } else {
+ s = eytzinger0_prev(r, nr);
+ bad = s >= 0 && test_array[s] >= search;
+ }
+ } else {
+ s = eytzinger0_first(nr);
+ bad = s >= 0 && test_array[s] >= search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each(j, nr) {
+ if (test_array[j] >= search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_ge(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
+{
+ unsigned r;
+ int s;
+ bool bad;
+
+ r = eytzinger0_find(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+
+ if (r < nr) {
+ bad = test_array[r] != search;
+ } else {
+ s = eytzinger0_find_le(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ bad = s >= 0 && test_array[s] == search;
+ }
+
+ if (bad) {
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find(%12u) = %3i is incorrect\n",
+ search, r);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+ eytzinger0_find_test_le(test_array, nr, search);
+ eytzinger0_find_test_gt(test_array, nr, search);
+ eytzinger0_find_test_ge(test_array, nr, search);
+ eytzinger0_find_test_eq(test_array, nr, search);
+}
+
void eytzinger0_find_test(void)
{
unsigned i, nr, allocated = 1 << 12;
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);

for (nr = 1; nr < allocated; nr++) {
- pr_info("testing %u elems", nr);
+ u16 prev = 0;
+
+ pr_info("testing %u elems\n", nr);

get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);

/* verify array is sorted correctly: */
- eytzinger0_for_each(i, nr)
- BUG_ON(i != eytzinger0_last(nr) &&
- test_array[i] > test_array[eytzinger0_next(i, nr)]);
+ eytzinger0_for_each(j, nr) {
+ BUG_ON(test_array[j] < prev);
+ prev = test_array[j];
+ }

for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);
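
The new find_test_{le,gt,ge,eq} helpers pin down the search contracts against a brute-force oracle. Phrased over a plain sorted array (eytzinger layout aside), the contract they enforce for find_le is:

static int sorted_find_le(const u16 *a, unsigned nr, u16 search)
{
	int ret = -1;

	for (unsigned i = 0; i < nr && a[i] <= search; i++)
		ret = i;	/* last element <= search, or -1 */
	return ret;
}
/* find_gt: first element >  search, or -1
 * find_ge: first element >= search, or -1
 * find:    an exact match, or an index >= nr when absent */
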
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index f4a4783219d9..7d921fc920a0 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -406,6 +406,18 @@ u64 bch2_get_random_u64_below(u64);
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);

+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *);
+
+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
+{
+ if (ratio && !get_random_u32_below(ratio))
+ bch2_corrupt_bio(bio);
+}
+#else
+#define bch2_maybe_corrupt_bio(...) do {} while (0)
+#endif
+
static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s)
{
@@ -419,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src,
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;

asm volatile("rep ; movsq"
@@ -496,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
u64 *dst = (u64 *) _dst + u64s - 1;
u64 *src = (u64 *) _src + u64s - 1;

-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;

asm volatile("std ;\n"
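
bch2_maybe_corrupt_bio() is a debug-build fault-injection hook: with probability 1/ratio it overwrites one randomly chosen u64 in the bio's data, so checksum-error handling paths get exercised. A sketch of the intended use; the read_corrupt_ratio knob is an assumption, not something this hunk adds:

	submit_bio_wait(bio);
	/* corrupt ~1 in `ratio` completed reads, CONFIG_BCACHEFS_DEBUG only: */
	bch2_maybe_corrupt_bio(bio, c->opts.read_corrupt_ratio);
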
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index aed7c6984173..f9667b944c0d 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -523,7 +523,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (ret < 0)
goto err_class_exit;

- ret = bch2_opt_check_may_set(c, opt_id, v);
+ ret = bch2_opt_check_may_set(c, NULL, opt_id, v);
if (ret < 0)
goto err_class_exit;

--
2.45.3
