gentoo-overlay/sys-kernel/hardened-kernel/files/linux-6.12/0185-bcachefs-Fix-reuse-of-bucket-before-journal-flush-on.patch
Alexander Miroshnichenko ad7c6fc00a
sys-kernel/hardened-kernel: bump v6.12.6
bcachefs patches synced to ca2e7a3de895c703d2cbbd9b63c10d8adfba8228 from master branch

Signed-off-by: Alexander Miroshnichenko <alex@millerson.name>
2024-12-20 17:26:34 +03:00

207 lines
8.8 KiB
Diff

From d7f6becfe039b95593c28ff8180b3b53a2585f69 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 6 Dec 2024 23:15:05 -0500
Subject: [PATCH 185/233] bcachefs: Fix reuse of bucket before journal flush on
multiple empty -> nonempty transition
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit
For each bucket we track when the bucket became nonempty and when it
became empty again: if we can ensure that there will be no journal
flushes in the range [nonempty, empty) (possibly because they occured at
the same journal sequence number), then it's safe to reuse the bucket
without waiting for a journal commit.
This is a major performance optimization for erasure coding, where
writes are initially replicated, but the extra replicas are quickly
dropped: if those buckets are reused and overwritten without issuing a
cache flush to the underlying device, then they only cost bus bandwidth.
But there's a tricky corner case when there's multiple empty -> nonempty
-> empty transitions in quick succession, i.e. when data is getting
overwritten immediately as it's being written.
If this happens and the previous empty transition hasn't been flushed,
we need to continue tracking the previous nonempty transition - not
start a new one.
Fixing this means we now need to track both the nonempty and empty
transitions in bch_alloc_v4.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Alexander Miroshnichenko <alex@millerson.name>
---
fs/bcachefs/alloc_background.c | 78 ++++++++++++++-------------
fs/bcachefs/alloc_background_format.h | 4 +-
2 files changed, 42 insertions(+), 40 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9ae567402b03..94e7bc889cb1 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -323,7 +323,8 @@ void bch2_alloc_v4_swab(struct bkey_s k)
{
struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
- a->journal_seq = swab64(a->journal_seq);
+ a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
+ a->journal_seq_empty = swab64(a->journal_seq_empty);
a->flags = swab32(a->flags);
a->dirty_sectors = swab32(a->dirty_sectors);
a->cached_sectors = swab32(a->cached_sectors);
@@ -346,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
- prt_printf(out, "journal_seq %llu\n", a->journal_seq);
- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
- prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
- prt_printf(out, "stripe %u\n", a->stripe);
- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
+ prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
+ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
+ prt_printf(out, "stripe %u\n", a->stripe);
+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
if (ca)
prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
@@ -384,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
*out = (struct bch_alloc_v4) {
- .journal_seq = u.journal_seq,
+ .journal_seq_nonempty = u.journal_seq,
.flags = u.need_discard,
.gen = u.gen,
.oldest_gen = u.oldest_gen,
@@ -930,20 +932,29 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
u64 transaction_seq = trans->journal_res.seq;
+ BUG_ON(!transaction_seq);
- if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq,
+ if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
trans, alloc_key_journal_seq_in_future,
"bucket journal seq in future (currently at %llu)\n%s",
journal_cur_seq(&c->journal),
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
- new_a->journal_seq = transaction_seq;
+ new_a->journal_seq_nonempty = transaction_seq;
int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
(int) data_type_is_empty(old_a->data_type);
- /* Record journal sequence number of empty -> nonempty transition: */
- if (is_empty_delta < 0)
- new_a->journal_seq = max(new_a->journal_seq, transaction_seq);
+ /*
+ * Record journal sequence number of empty -> nonempty transition:
+ * Note that there may be multiple empty -> nonempty
+ * transitions, data in a bucket may be overwritten while we're
+ * still writing to it - so be careful to only record the first:
+ * */
+ if (is_empty_delta < 0 &&
+ new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
+ new_a->journal_seq_nonempty = transaction_seq;
+ new_a->journal_seq_empty = 0;
+ }
/*
* Bucket becomes empty: mark it as waiting for a journal flush,
@@ -952,20 +963,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
* intermediate sequence numbers:
*/
if (is_empty_delta > 0) {
- if (new_a->journal_seq == transaction_seq ||
+ if (new_a->journal_seq_nonempty == transaction_seq ||
bch2_journal_noflush_seq(&c->journal,
- new_a->journal_seq,
- transaction_seq))
- new_a->journal_seq = 0;
- else {
- new_a->journal_seq = transaction_seq;
+ new_a->journal_seq_nonempty,
+ transaction_seq)) {
+ new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
+ } else {
+ new_a->journal_seq_empty = transaction_seq;
ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- transaction_seq);
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ transaction_seq);
if (bch2_fs_fatal_err_on(ret, c,
- "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
+ "setting bucket_needs_journal_commit: %s",
+ bch2_err_str(ret)))
goto err;
}
}
@@ -983,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
+#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
if (statechange(a->data_type == BCH_DATA_free) &&
bucket_flushed(new_a))
@@ -1845,16 +1857,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
- trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = -EIO;
- goto out;
- }
-
if (!fastpath) {
if (discard_in_flight_add(ca, iter.pos.offset, true))
goto out;
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
index befdaa95c515..740238369a5a 100644
--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
struct bch_alloc_v4 {
struct bch_val v;
- __u64 journal_seq;
+ __u64 journal_seq_nonempty;
__u32 flags;
__u8 gen;
__u8 oldest_gen;
@@ -70,7 +70,7 @@ struct bch_alloc_v4 {
__u32 stripe;
__u32 nr_external_backpointers;
/* end of fields in original version of alloc_v4 */
- __u64 _fragmentation_lru; /* obsolete */
+ __u64 journal_seq_empty;
__u32 stripe_sectors;
__u32 pad;
} __packed __aligned(8);
--
2.45.2