gentoo-overlay/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-432522786827.patch

From 3f94dc89581133d018110bc81f108aa3bf485b38 Mon Sep 17 00:00:00 2001
From: Alexander Miroshnichenko <alex@millerson.name>
Date: Sun, 5 Jan 2025 12:38:05 +0300
Subject: [PATCH] bcachefs: cherry-pick updates from master 432522786827
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit
From tag: v6.12 up to:
commit 4325227868277451df623c89e4472a1d9db5df94 (bcachefs/master, bcachefs/for-next, bcachefs/bcachefs-testing)
Author: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat Jan 4 12:10:25 2025 -0500
Signed-off-by: Alexander Miroshnichenko <alex@millerson.name>
---
.../filesystems/bcachefs/CodingStyle.rst | 2 +-
fs/bcachefs/Kconfig | 2 +-
fs/bcachefs/Makefile | 1 +
fs/bcachefs/acl.c | 11 +-
fs/bcachefs/alloc_background.c | 558 ++++++------
fs/bcachefs/alloc_background.h | 18 +-
fs/bcachefs/alloc_background_format.h | 4 +-
fs/bcachefs/alloc_foreground.c | 304 +++----
fs/bcachefs/alloc_foreground.h | 4 +-
fs/bcachefs/backpointers.c | 838 +++++++++++-------
fs/bcachefs/backpointers.h | 97 +-
fs/bcachefs/bbpos.h | 2 +-
fs/bcachefs/bcachefs.h | 70 +-
fs/bcachefs/bcachefs_format.h | 105 ++-
fs/bcachefs/bkey.h | 7 -
fs/bcachefs/bkey_methods.c | 29 +-
fs/bcachefs/bkey_methods.h | 15 +-
fs/bcachefs/bkey_types.h | 28 +
fs/bcachefs/btree_cache.c | 59 +-
fs/bcachefs/btree_cache.h | 14 +-
fs/bcachefs/btree_gc.c | 178 +---
fs/bcachefs/btree_gc.h | 4 +-
fs/bcachefs/btree_io.c | 225 +++--
fs/bcachefs/btree_io.h | 6 +-
fs/bcachefs/btree_iter.c | 590 +++++++-----
fs/bcachefs/btree_iter.h | 134 ++-
fs/bcachefs/btree_journal_iter.c | 237 ++++-
fs/bcachefs/btree_journal_iter.h | 22 +-
fs/bcachefs/btree_journal_iter_types.h | 36 +
fs/bcachefs/btree_key_cache.c | 75 +-
fs/bcachefs/btree_locking.c | 16 +-
fs/bcachefs/btree_locking.h | 50 +-
fs/bcachefs/btree_node_scan.c | 153 ++--
fs/bcachefs/btree_node_scan_types.h | 1 -
fs/bcachefs/btree_trans_commit.c | 205 ++---
fs/bcachefs/btree_types.h | 42 +-
fs/bcachefs/btree_update.c | 70 +-
fs/bcachefs/btree_update.h | 29 +-
fs/bcachefs/btree_update_interior.c | 293 +++---
fs/bcachefs/btree_update_interior.h | 3 +-
fs/bcachefs/btree_write_buffer.c | 83 +-
fs/bcachefs/buckets.c | 133 +--
fs/bcachefs/buckets.h | 30 +-
fs/bcachefs/buckets_types.h | 2 +-
fs/bcachefs/chardev.c | 219 +----
fs/bcachefs/checksum.c | 10 +-
fs/bcachefs/checksum.h | 2 +-
fs/bcachefs/compress.c | 96 +-
fs/bcachefs/darray.h | 2 +-
fs/bcachefs/data_update.c | 76 +-
fs/bcachefs/debug.c | 4 +-
fs/bcachefs/dirent.c | 10 +-
fs/bcachefs/dirent.h | 4 +-
fs/bcachefs/disk_accounting.c | 150 ++--
fs/bcachefs/disk_accounting.h | 73 +-
fs/bcachefs/ec.c | 267 +++---
fs/bcachefs/ec.h | 5 +-
fs/bcachefs/errcode.h | 21 +-
fs/bcachefs/error.c | 187 ++--
fs/bcachefs/error.h | 58 +-
fs/bcachefs/extent_update.c | 4 +-
fs/bcachefs/extents.c | 290 ++----
fs/bcachefs/extents.h | 18 +-
fs/bcachefs/extents_format.h | 15 +-
fs/bcachefs/fs-common.c | 108 ++-
fs/bcachefs/fs-common.h | 2 +
fs/bcachefs/fs-io-buffered.c | 45 +-
fs/bcachefs/fs-io-direct.c | 5 +
fs/bcachefs/fs-io-pagecache.c | 4 +-
fs/bcachefs/fs-io.c | 54 +-
fs/bcachefs/fs-ioctl.c | 7 +-
fs/bcachefs/fs.c | 88 +-
fs/bcachefs/fs.h | 1 +
fs/bcachefs/fsck.c | 731 +++++++++------
fs/bcachefs/fsck.h | 11 +
fs/bcachefs/inode.c | 169 ++--
fs/bcachefs/inode.h | 43 +-
fs/bcachefs/inode_format.h | 15 +-
fs/bcachefs/io_misc.c | 22 +-
fs/bcachefs/io_read.c | 246 +++--
fs/bcachefs/io_read.h | 28 +-
fs/bcachefs/io_write.c | 102 ++-
fs/bcachefs/journal.c | 162 +++-
fs/bcachefs/journal.h | 9 +-
fs/bcachefs/journal_io.c | 225 +++--
fs/bcachefs/journal_io.h | 2 +-
fs/bcachefs/journal_reclaim.c | 19 +-
fs/bcachefs/journal_types.h | 5 +
fs/bcachefs/logged_ops.c | 11 +-
fs/bcachefs/logged_ops_format.h | 5 +
fs/bcachefs/lru.c | 4 +-
fs/bcachefs/lru.h | 2 +-
fs/bcachefs/move.c | 184 ++--
fs/bcachefs/move.h | 5 +-
fs/bcachefs/movinggc.c | 6 +-
fs/bcachefs/opts.c | 26 +-
fs/bcachefs/opts.h | 61 +-
fs/bcachefs/printbuf.h | 15 +-
fs/bcachefs/quota.c | 2 +-
fs/bcachefs/quota.h | 4 +-
fs/bcachefs/rcu_pending.c | 38 +-
fs/bcachefs/rebalance.c | 266 +++++-
fs/bcachefs/rebalance.h | 10 +
fs/bcachefs/rebalance_format.h | 53 ++
fs/bcachefs/rebalance_types.h | 2 -
fs/bcachefs/recovery.c | 212 +++--
fs/bcachefs/recovery.h | 2 +-
fs/bcachefs/recovery_passes.c | 112 ++-
fs/bcachefs/recovery_passes.h | 1 +
fs/bcachefs/recovery_passes_types.h | 92 +-
fs/bcachefs/reflink.c | 496 ++++++++---
fs/bcachefs/reflink.h | 20 +-
fs/bcachefs/reflink_format.h | 7 +-
fs/bcachefs/sb-clean.c | 6 +-
fs/bcachefs/sb-counters_format.h | 165 ++--
fs/bcachefs/sb-downgrade.c | 25 +-
fs/bcachefs/sb-errors_format.h | 53 +-
fs/bcachefs/six.c | 17 +-
fs/bcachefs/six.h | 1 +
fs/bcachefs/snapshot.c | 515 +++++------
fs/bcachefs/snapshot.h | 17 +-
fs/bcachefs/str_hash.c | 286 ++++++
fs/bcachefs/str_hash.h | 28 +-
fs/bcachefs/subvolume.c | 68 +-
fs/bcachefs/subvolume.h | 19 +-
fs/bcachefs/subvolume_types.h | 2 +-
fs/bcachefs/super-io.c | 83 +-
fs/bcachefs/super-io.h | 21 +-
fs/bcachefs/super.c | 54 +-
fs/bcachefs/super.h | 10 -
fs/bcachefs/sysfs.c | 60 +-
fs/bcachefs/tests.c | 26 +-
fs/bcachefs/trace.h | 77 +-
fs/bcachefs/util.h | 32 +
fs/bcachefs/varint.c | 5 +-
fs/bcachefs/xattr.c | 13 +-
fs/bcachefs/xattr.h | 5 +-
fs/fs_parser.c | 3 +-
include/linux/fs_parser.h | 2 +
include/linux/min_heap.h | 4 +-
140 files changed, 7038 insertions(+), 4594 deletions(-)
create mode 100644 fs/bcachefs/btree_journal_iter_types.h
create mode 100644 fs/bcachefs/rebalance_format.h
create mode 100644 fs/bcachefs/str_hash.c
diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst
index 01de555e21d8..b29562a6bf55 100644
--- a/Documentation/filesystems/bcachefs/CodingStyle.rst
+++ b/Documentation/filesystems/bcachefs/CodingStyle.rst
@@ -183,4 +183,4 @@ even better as a code comment.
A good code comment is wonderful, but even better is the comment that didn't
need to exist because the code was so straightforward as to be obvious;
organized into small clean and tidy modules, with clear and descriptive names
-for functions and variable, where every line of code has a clear purpose.
+for functions and variables, where every line of code has a clear purpose.
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5bac803ea367..e8549d04dcb8 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -89,7 +89,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
config BCACHEFS_PATH_TRACEPOINTS
bool "Extra btree_path tracepoints"
- depends on BCACHEFS_FS
+ depends on BCACHEFS_FS && TRACING
help
Enable extra tracepoints for debugging btree_path operations; we don't
normally want these enabled because they happen at very high rates.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 56d20e219f59..d2689388d5e8 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -82,6 +82,7 @@ bcachefs-y := \
siphash.o \
six.o \
snapshot.o \
+ str_hash.o \
subvolume.o \
super.o \
super-io.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 87f1be9d4db4..99487727ae64 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -184,11 +184,6 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
return ERR_PTR(-EINVAL);
}
-#define acl_for_each_entry(acl, acl_e) \
- for (acl_e = acl->a_entries; \
- acl_e < acl->a_entries + acl->a_count; \
- acl_e++)
-
/*
* Convert from in-memory to filesystem representation.
*/
@@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans,
{
struct bkey_i_xattr *xattr;
bch_acl_header *acl_header;
- const struct posix_acl_entry *acl_e;
+ const struct posix_acl_entry *acl_e, *pe;
void *outptr;
unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
- acl_for_each_entry(acl, acl_e) {
+ FOREACH_ACL_ENTRY(acl_e, acl, pe) {
switch (acl_e->e_tag) {
case ACL_USER:
case ACL_GROUP:
@@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,
outptr = (void *) acl_header + sizeof(*acl_header);
- acl_for_each_entry(acl, acl_e) {
+ FOREACH_ACL_ENTRY(acl_e, acl, pe) {
bch_acl_entry *entry = outptr;
entry->e_tag = cpu_to_le16(acl_e->e_tag);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c84a91572a1d..fc2ef33b67b3 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
}
int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
int ret = 0;
@@ -213,7 +213,7 @@ int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
}
int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_alloc_unpacked u;
int ret = 0;
@@ -226,7 +226,7 @@ int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
}
int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_alloc_unpacked u;
int ret = 0;
@@ -239,7 +239,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
}
int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bch_alloc_v4 a;
int ret = 0;
@@ -322,9 +322,9 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
void bch2_alloc_v4_swab(struct bkey_s k)
{
struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
- struct bch_backpointer *bp, *bps;
- a->journal_seq = swab64(a->journal_seq);
+ a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
+ a->journal_seq_empty = swab64(a->journal_seq_empty);
a->flags = swab32(a->flags);
a->dirty_sectors = swab32(a->dirty_sectors);
a->cached_sectors = swab32(a->cached_sectors);
@@ -333,13 +333,6 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
a->stripe_sectors = swab32(a->stripe_sectors);
-
- bps = alloc_v4_backpointers(a);
- for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
- bp->bucket_offset = swab40(bp->bucket_offset);
- bp->bucket_len = swab32(bp->bucket_len);
- bch2_bpos_swab(&bp->pos);
- }
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
@@ -354,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
- prt_printf(out, "journal_seq %llu\n", a->journal_seq);
- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
- prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
- prt_printf(out, "stripe %u\n", a->stripe);
- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
+ prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
+ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
+ prt_printf(out, "stripe %u\n", a->stripe);
+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
if (ca)
prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
@@ -392,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
*out = (struct bch_alloc_v4) {
- .journal_seq = u.journal_seq,
+ .journal_seq_nonempty = u.journal_seq,
.flags = u.need_discard,
.gen = u.gen,
.oldest_gen = u.oldest_gen,
@@ -517,7 +511,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
}
int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -664,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c)
/* Free space/discard btree: */
+static int __need_discard_or_freespace_err(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ bool set, bool discard, bool repair)
+{
+ struct bch_fs *c = trans->c;
+ enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
+ enum bch_sb_error_id err_id = discard
+ ? BCH_FSCK_ERR_need_discard_key_wrong
+ : BCH_FSCK_ERR_freespace_key_wrong;
+ enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, alloc_k);
+
+ int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
+ "bucket incorrectly %sset in %s btree\n"
+ " %s",
+ set ? "" : "un",
+ bch2_btree_id_str(btree),
+ buf.buf);
+ if (ret == -BCH_ERR_fsck_ignore ||
+ ret == -BCH_ERR_fsck_errors_not_fixed)
+ ret = 0;
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
+#define need_discard_or_freespace_err(...) \
+ fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
+
+#define need_discard_or_freespace_err_on(cond, ...) \
+ (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
+
static int bch2_bucket_do_index(struct btree_trans *trans,
struct bch_dev *ca,
struct bkey_s_c alloc_k,
const struct bch_alloc_v4 *a,
bool set)
{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c old;
- struct bkey_i *k;
enum btree_id btree;
- enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
- enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- struct printbuf buf = PRINTBUF;
- int ret;
+ struct bpos pos;
if (a->data_type != BCH_DATA_free &&
a->data_type != BCH_DATA_need_discard)
return 0;
- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.type = new_type;
-
switch (a->data_type) {
case BCH_DATA_free:
btree = BTREE_ID_freespace;
- k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
- bch2_key_resize(&k->k, 1);
+ pos = alloc_freespace_pos(alloc_k.k->p, *a);
break;
case BCH_DATA_need_discard:
btree = BTREE_ID_need_discard;
- k->k.p = alloc_k.k->p;
+ pos = alloc_k.k->p;
break;
default:
return 0;
}
- old = bch2_bkey_get_iter(trans, &iter, btree,
- bkey_start_pos(&k->k),
- BTREE_ITER_intent);
- ret = bkey_err(old);
+ struct btree_iter iter;
+ struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
+ int ret = bkey_err(old);
if (ret)
return ret;
- if (ca->mi.freespace_initialized &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
- bch2_trans_inconsistent_on(old.k->type != old_type, trans,
- "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
- " for %s",
- set ? "setting" : "clearing",
- bch2_btree_id_str(btree),
- iter.pos.inode,
- iter.pos.offset,
- bch2_bkey_types[old.k->type],
- bch2_bkey_types[old_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- ret = -EIO;
- goto err;
- }
+ need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
+ !old.k->type != set,
+ trans, alloc_k, set,
+ btree == BTREE_ID_need_discard, false);
- ret = bch2_trans_update(trans, &iter, k, 0);
-err:
+ ret = bch2_btree_bit_mod_iter(trans, &iter, set);
+fsck_err:
bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
return ret;
}
@@ -858,7 +858,10 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_transactional) {
alloc_data_type_set(new_a, new_a->data_type);
- if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
+ (int) data_type_is_empty(old_a->data_type);
+
+ if (is_empty_delta < 0) {
new_a->io_time[READ] = bch2_current_io_time(c, READ);
new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
@@ -928,37 +931,55 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- u64 journal_seq = trans->journal_res.seq;
- u64 bucket_journal_seq = new_a->journal_seq;
+ u64 transaction_seq = trans->journal_res.seq;
+ BUG_ON(!transaction_seq);
- if ((flags & BTREE_TRIGGER_insert) &&
- data_type_is_empty(old_a->data_type) !=
- data_type_is_empty(new_a->data_type) &&
- new.k->type == KEY_TYPE_alloc_v4) {
- struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
+ if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
+ trans, alloc_key_journal_seq_in_future,
+ "bucket journal seq in future (currently at %llu)\n%s",
+ journal_cur_seq(&c->journal),
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
+ new_a->journal_seq_nonempty = transaction_seq;
- /*
- * If the btree updates referring to a bucket weren't flushed
- * before the bucket became empty again, then the we don't have
- * to wait on a journal flush before we can reuse the bucket:
- */
- v->journal_seq = bucket_journal_seq =
- data_type_is_empty(new_a->data_type) &&
- (journal_seq == v->journal_seq ||
- bch2_journal_noflush_seq(&c->journal, v->journal_seq))
- ? 0 : journal_seq;
+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
+ (int) data_type_is_empty(old_a->data_type);
+
+ /*
+ * Record journal sequence number of empty -> nonempty transition:
+ * Note that there may be multiple empty -> nonempty
+ * transitions, data in a bucket may be overwritten while we're
+ * still writing to it - so be careful to only record the first:
+ * */
+ if (is_empty_delta < 0 &&
+ new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
+ new_a->journal_seq_nonempty = transaction_seq;
+ new_a->journal_seq_empty = 0;
}
- if (!data_type_is_empty(old_a->data_type) &&
- data_type_is_empty(new_a->data_type) &&
- bucket_journal_seq) {
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- bucket_journal_seq);
- if (bch2_fs_fatal_err_on(ret, c,
- "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
- goto err;
+ /*
+ * Bucket becomes empty: mark it as waiting for a journal flush,
+ * unless updates since empty -> nonempty transition were never
+ * flushed - we may need to ask the journal not to flush
+ * intermediate sequence numbers:
+ */
+ if (is_empty_delta > 0) {
+ if (new_a->journal_seq_nonempty == transaction_seq ||
+ bch2_journal_noflush_seq(&c->journal,
+ new_a->journal_seq_nonempty,
+ transaction_seq)) {
+ new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
+ } else {
+ new_a->journal_seq_empty = transaction_seq;
+
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ transaction_seq);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "setting bucket_needs_journal_commit: %s",
+ bch2_err_str(ret)))
+ goto err;
+ }
}
if (new_a->gen != old_a->gen) {
@@ -974,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
+#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
if (statechange(a->data_type == BCH_DATA_free) &&
bucket_flushed(new_a))
@@ -1006,6 +1027,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
rcu_read_unlock();
}
err:
+fsck_err:
printbuf_exit(&buf);
bch2_dev_put(ca);
return ret;
@@ -1045,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
* btree node min/max is a closed interval, upto takes a half
* open interval:
*/
- k = bch2_btree_iter_peek_upto(&iter2, end);
+ k = bch2_btree_iter_peek_max(&iter2, end);
next = iter2.pos;
bch2_trans_iter_exit(iter->trans, &iter2);
@@ -1129,7 +1151,6 @@ int bch2_check_alloc_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
- unsigned discard_key_type, freespace_key_type;
unsigned gens_offset;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
@@ -1149,64 +1170,30 @@ int bch2_check_alloc_key(struct btree_trans *trans,
a = bch2_alloc_to_v4(alloc_k, &a_convert);
- discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
k = bch2_btree_iter_peek_slot(discard_iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (fsck_err_on(k.k->type != discard_key_type,
- trans, need_discard_key_wrong,
- "incorrect key in need_discard btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[discard_key_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = discard_key_type;
- update->k.p = discard_iter->pos;
-
- ret = bch2_trans_update(trans, discard_iter, update, 0);
+ bool is_discarded = a->data_type == BCH_DATA_need_discard;
+ if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
+ trans, alloc_k, !is_discarded, true, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
if (ret)
goto err;
}
- freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
k = bch2_btree_iter_peek_slot(freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (fsck_err_on(k.k->type != freespace_key_type,
- trans, freespace_key_wrong,
- "incorrect key in freespace btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[freespace_key_type],
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = freespace_key_type;
- update->k.p = freespace_iter->pos;
- bch2_key_resize(&update->k, 1);
-
- ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ bool is_free = a->data_type == BCH_DATA_free;
+ if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
+ trans, alloc_k, !is_free, false, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
if (ret)
goto err;
}
@@ -1368,51 +1355,88 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
return ret;
}
-static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
- struct btree_iter *iter)
+struct check_discard_freespace_key_async {
+ struct work_struct work;
+ struct bch_fs *c;
+ struct bbpos pos;
+};
+
+static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ u8 gen;
+ ret = k.k->type != KEY_TYPE_set
+ ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
+ : 0;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void check_discard_freespace_key_work(struct work_struct *work)
+{
+ struct check_discard_freespace_key_async *w =
+ container_of(work, struct check_discard_freespace_key_async, work);
+
+ bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
+ bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
+ kfree(w);
+}
+
+int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
+ bool async_repair)
{
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter;
- struct bkey_s_c alloc_k;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- u64 genbits;
- struct bpos pos;
enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
? BCH_DATA_need_discard
: BCH_DATA_free;
struct printbuf buf = PRINTBUF;
- int ret;
- pos = iter->pos;
- pos.offset &= ~(~0ULL << 56);
- genbits = iter->pos.offset & (~0ULL << 56);
+ struct bpos bucket = iter->pos;
+ bucket.offset &= ~(~0ULL << 56);
+ u64 genbits = iter->pos.offset & (~0ULL << 56);
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
- ret = bkey_err(alloc_k);
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket,
+ async_repair ? BTREE_ITER_cached : 0);
+ int ret = bkey_err(alloc_k);
if (ret)
return ret;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
- trans, need_discard_freespace_key_to_invalid_dev_bucket,
- "entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
- goto delete;
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
+ goto delete;
+ ret = 1;
+ goto out;
+ }
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (a->data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(*a))) {
+ if (fsck_err(trans, need_discard_freespace_key_bad,
+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_id_str(iter->btree_id),
+ iter->pos.inode,
+ iter->pos.offset,
+ a->data_type == state,
+ genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+ goto delete;
+ ret = 1;
+ goto out;
+ }
- if (fsck_err_on(a->data_type != state ||
- (state == BCH_DATA_free &&
- genbits != alloc_freespace_genbits(*a)),
- trans, need_discard_freespace_key_bad,
- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_id_str(iter->btree_id),
- iter->pos.inode,
- iter->pos.offset,
- a->data_type == state,
- genbits >> 56, alloc_freespace_genbits(*a) >> 56))
- goto delete;
+ *gen = a->gen;
out:
fsck_err:
bch2_set_btree_iter_dontneed(&alloc_iter);
@@ -1420,11 +1444,40 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
printbuf_exit(&buf);
return ret;
delete:
- ret = bch2_btree_delete_extent_at(trans, iter,
- iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- goto out;
+ if (!async_repair) {
+ ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_commit;
+ goto out;
+ } else {
+ /*
+ * We can't repair here when called from the allocator path: the
+ * commit will recurse back into the allocator
+ */
+ struct check_discard_freespace_key_async *w =
+ kzalloc(sizeof(*w), GFP_KERNEL);
+ if (!w)
+ goto out;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
+ kfree(w);
+ goto out;
+ }
+
+ INIT_WORK(&w->work, check_discard_freespace_key_work);
+ w->c = c;
+ w->pos = BBPOS(iter->btree_id, iter->pos);
+ queue_work(c->write_ref_wq, &w->work);
+ goto out;
+ }
+}
+
+static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
+{
+ u8 gen;
+ int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
+ return ret < 0 ? ret : 0;
}
/*
@@ -1581,7 +1634,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_prefetch, k,
- bch2_check_discard_freespace_key(trans, &iter));
+ bch2_check_discard_freespace_key_fsck(trans, &iter));
if (ret)
goto err;
@@ -1594,7 +1647,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
break;
ret = bkey_err(k) ?:
- bch2_check_discard_freespace_key(trans, &iter);
+ bch2_check_discard_freespace_key_fsck(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
@@ -1757,7 +1810,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- struct discard_buckets_state *s)
+ struct discard_buckets_state *s,
+ bool fastpath)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@@ -1793,44 +1847,23 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (bch2_bucket_sectors_total(a->v)) {
- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
- trans, "attempting to discard bucket with dirty data\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = -EIO;
- goto out;
- }
-
if (a->v.data_type != BCH_DATA_need_discard) {
- if (data_type_is_empty(a->v.data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
+ if (need_discard_or_freespace_err(trans, k, true, true, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
+ if (ret)
+ goto out;
+ goto commit;
}
- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
- trans, "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = -EIO;
goto out;
}
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
- trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = -EIO;
- goto out;
- }
-
- if (discard_in_flight_add(ca, iter.pos.offset, true))
- goto out;
+ if (!fastpath) {
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
+ goto out;
- discard_locked = true;
+ discard_locked = true;
+ }
if (!bkey_eq(*discard_pos_done, iter.pos) &&
ca->mi.discard && !c->opts.nochanges) {
@@ -1844,6 +1877,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
ca->mi.bucket_size,
GFP_KERNEL);
*discard_pos_done = iter.pos;
+ s->discarded++;
ret = bch2_trans_relock_notrace(trans);
if (ret)
@@ -1851,22 +1885,25 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
-write:
alloc_data_type_set(&a->v, a->v.data_type);
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto out;
+commit:
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
count_event(c, bucket_discard);
- s->discarded++;
out:
+fsck_err:
if (discard_locked)
discard_in_flight_remove(ca, iter.pos.offset);
- s->seen++;
+ if (!ret)
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ret;
@@ -1886,11 +1923,11 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter,
+ for_each_btree_key_max(trans, iter,
BTREE_ID_need_discard,
POS(ca->dev_idx, 0),
POS(ca->dev_idx, U64_MAX), 0, k,
- bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
@@ -1923,27 +1960,29 @@ void bch2_do_discards(struct bch_fs *c)
bch2_dev_do_discards(ca);
}
-static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
+static int bch2_do_discards_fast_one(struct btree_trans *trans,
+ struct bch_dev *ca,
+ u64 bucket,
+ struct bpos *discard_pos_done,
+ struct discard_buckets_state *s)
{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
- int ret = bkey_err(k);
+ struct btree_iter need_discard_iter;
+ struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
+ BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
+ int ret = bkey_err(discard_k);
if (ret)
- goto err;
-
- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto err;
+ return ret;
- BUG_ON(a->v.dirty_sectors);
- SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
+ if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
+ trans, discarding_bucket_not_in_need_discard_btree,
+ "attempting to discard bucket %u:%llu not in need_discard btree",
+ ca->dev_idx, bucket))
+ goto out;
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
+ ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &need_discard_iter);
return ret;
}
@@ -1951,6 +1990,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
struct bch_fs *c = ca->fs;
+ struct discard_buckets_state s = {};
+ struct bpos discard_pos_done = POS_MAX;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
while (1) {
bool got_bucket = false;
@@ -1971,16 +2014,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
if (!got_bucket)
break;
- if (ca->mi.discard && !c->opts.nochanges)
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size,
- GFP_KERNEL);
-
- int ret = bch2_trans_commit_do(c, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
+ ret = lockrestart_do(trans,
+ bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
bch_err_fn(c, ret);
discard_in_flight_remove(ca, bucket);
@@ -1989,6 +2024,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+
+ bch2_trans_put(trans);
percpu_ref_put(&ca->io_ref);
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2030,8 +2068,11 @@ static int invalidate_one_bucket(struct btree_trans *trans,
return 1;
if (!bch2_dev_bucket_exists(c, bucket)) {
- prt_str(&buf, "lru entry points to invalid bucket");
- goto err;
+ if (fsck_err(trans, lru_entry_to_invalid_bucket,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ bucket.inode, bucket.offset))
+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ goto out;
}
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
@@ -2072,28 +2113,9 @@ static int invalidate_one_bucket(struct btree_trans *trans,
trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
--*nr_to_invalidate;
out:
+fsck_err:
printbuf_exit(&buf);
return ret;
-err:
- prt_str(&buf, "\n lru key: ");
- bch2_bkey_val_to_text(&buf, c, lru_k);
-
- prt_str(&buf, "\n lru entry: ");
- bch2_lru_pos_to_text(&buf, lru_iter->pos);
-
- prt_str(&buf, "\n alloc key: ");
- if (!a)
- bch2_bpos_to_text(&buf, bucket);
- else
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
-
- bch_err(c, "%s", buf.buf);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
- bch2_inconsistent_error(c);
- ret = -EINVAL;
- }
-
- goto out;
}
static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
@@ -2101,7 +2123,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
{
struct bkey_s_c k;
again:
- k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
if (!k.k && !*wrapped) {
bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
*wrapped = true;
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 163a67b97a40..de25ba4ee94b 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,8 +8,6 @@
#include "debug.h"
#include "super.h"
-enum bch_validate_flags;
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
-int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
-int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
-int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
})
int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
@@ -307,6 +309,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
+
+int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_dev_do_discards(struct bch_dev *);
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
index befdaa95c515..740238369a5a 100644
--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
struct bch_alloc_v4 {
struct bch_val v;
- __u64 journal_seq;
+ __u64 journal_seq_nonempty;
__u32 flags;
__u8 gen;
__u8 oldest_gen;
@@ -70,7 +70,7 @@ struct bch_alloc_v4 {
__u32 stripe;
__u32 nr_external_backpointers;
/* end of fields in original version of alloc_v4 */
- __u64 _fragmentation_lru; /* obsolete */
+ __u64 journal_seq_empty;
__u32 stripe_sectors;
__u32 pad;
} __packed __aligned(8);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 372178c8d416..6df41c331a52 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
return;
}
- percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
-
ob->valid = false;
ob->data_type = 0;
-
spin_unlock(&ob->lock);
- percpu_up_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
bch2_open_bucket_hash_remove(c, ob);
@@ -156,6 +152,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
+static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+{
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
+ return false;
+
+ return bch2_is_superblock_bucket(ca, b);
+}
+
static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
BUG_ON(c->open_buckets_partial_nr >=
@@ -175,20 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
closure_wake_up(&c->freelist_wait);
}
-/* _only_ for allocating the journal on a new device: */
-long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
-{
- while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
- u64 b = ca->new_fs_bucket_idx++;
-
- if (!is_superblock_bucket(ca, b) &&
- (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
- return b;
- }
-
- return -1;
-}
-
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
@@ -206,33 +196,40 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
}
}
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 bucket,
- enum bch_watermark watermark,
- const struct bch_alloc_v4 *a,
- struct bucket_alloc_state *s,
- struct closure *cl)
+static inline bool may_alloc_bucket(struct bch_fs *c,
+ struct bpos bucket,
+ struct bucket_alloc_state *s)
{
- struct open_bucket *ob;
-
- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- s->skipped_nouse++;
- return NULL;
- }
-
- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
s->skipped_open++;
- return NULL;
+ return false;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) {
s->skipped_need_journal_commit++;
- return NULL;
+ return false;
}
- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
s->skipped_nocow++;
+ return false;
+ }
+
+ return true;
+}
+
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket, u8 gen,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ if (unlikely(is_superblock_bucket(c, ca, bucket)))
+ return NULL;
+
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ s->skipped_nouse++;
return NULL;
}
@@ -254,14 +251,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
return NULL;
}
- ob = bch2_open_bucket_alloc(c);
+ struct open_bucket *ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
-
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->dev = ca->dev_idx;
- ob->gen = a->gen;
+ ob->gen = gen;
ob->bucket = bucket;
spin_unlock(&ob->lock);
@@ -276,111 +272,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
}
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_watermark watermark, u64 free_entry,
+ enum bch_watermark watermark,
struct bucket_alloc_state *s,
- struct bkey_s_c freespace_k,
+ struct btree_iter *freespace_iter,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
- struct open_bucket *ob;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- u64 b = free_entry & ~(~0ULL << 56);
- unsigned genbits = free_entry >> 56;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
- prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
- " freespace key ",
- ca->mi.first_bucket, ca->mi.nbuckets);
- bch2_bkey_val_to_text(&buf, c, freespace_k);
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- ob = ERR_PTR(-EIO);
- goto err;
- }
+ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
- k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_alloc, POS(ca->dev_idx, b),
- BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret) {
- ob = ERR_PTR(ret);
- goto err;
- }
-
- a = bch2_alloc_to_v4(k, &a_convert);
-
- if (a->data_type != BCH_DATA_free) {
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
- ob = NULL;
- goto err;
- }
-
- prt_printf(&buf, "non free bucket in freespace btree\n"
- " freespace key ");
- bch2_bkey_val_to_text(&buf, c, freespace_k);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- ob = ERR_PTR(-EIO);
- goto err;
- }
-
- if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
- " freespace key ",
- genbits, alloc_freespace_genbits(*a) >> 56);
- bch2_bkey_val_to_text(&buf, c, freespace_k);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- ob = ERR_PTR(-EIO);
- goto err;
- }
-
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- struct bch_backpointer bp;
- struct bpos bp_pos = POS_MIN;
-
- ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1,
- &bp_pos, &bp,
- BTREE_ITER_nopreserve);
- if (ret) {
- ob = ERR_PTR(ret);
- goto err;
- }
+ if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
+ return NULL;
- if (!bkey_eq(bp_pos, POS_MAX)) {
- /*
- * Bucket may have data in it - we don't call
- * bc2h_trans_inconnsistent() because fsck hasn't
- * finished yet
- */
- ob = NULL;
- goto err;
- }
- }
+ u8 gen;
+ int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret)
+ return NULL;
- ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
- if (!ob)
- bch2_set_btree_iter_dontneed(&iter);
-err:
- if (iter.path)
- bch2_set_btree_iter_dontneed(&iter);
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ob;
+ return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
}
/*
* This path is for before the freespace btree is initialized:
- *
- * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
- * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
@@ -389,10 +303,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct bucket_alloc_state *s,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter, citer;
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
- u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 first_bucket = ca->mi.first_bucket;
u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
u64 alloc_cursor = alloc_start;
@@ -415,10 +330,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
break;
- if (ca->new_fs_bucket_idx &&
- is_superblock_bucket(ca, k.k->p.offset))
- continue;
-
if (s->btree_bitmap != BTREE_BITMAP_ANY &&
s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
@@ -452,7 +363,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
s->buckets_seen++;
- ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+ ob = may_alloc_bucket(c, k.k->p, s)
+ ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
+ watermark, s, cl)
+ : NULL;
next:
bch2_set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
@@ -489,20 +403,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
-
- BUG_ON(ca->new_fs_bucket_idx);
again:
- for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
- POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
- if (k.k->p.inode != ca->dev_idx)
- break;
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor),
+ POS(ca->dev_idx, U64_MAX),
+ 0, k, ret) {
+ /*
+ * peek normally dosen't trim extents - they can span iter.pos,
+ * which is not what we want here:
+ */
+ iter.k.size = iter.k.p.offset - iter.pos.offset;
- for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
- alloc_cursor < k.k->p.offset;
- alloc_cursor++) {
+ while (iter.k.size) {
s->buckets_seen++;
- u64 bucket = alloc_cursor & ~(~0ULL << 56);
+ u64 bucket = iter.pos.offset & ~(~0ULL << 56);
if (s->btree_bitmap != BTREE_BITMAP_ANY &&
s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
@@ -511,32 +426,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
goto fail;
bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket) + 1,
+ round_up(bucket_to_sector(ca, bucket + 1),
1ULL << ca->mi.btree_bitmap_shift));
- u64 genbits = alloc_cursor >> 56;
- alloc_cursor = bucket | (genbits << 56);
+ alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
- if (alloc_cursor > k.k->p.offset)
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
s->skipped_mi_btree_bitmap++;
- continue;
+ goto next;
}
- ob = try_alloc_bucket(trans, ca, watermark,
- alloc_cursor, s, k, cl);
+ ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
if (ob) {
+ if (!IS_ERR(ob))
+ *dev_alloc_cursor = iter.pos.offset;
bch2_set_btree_iter_dontneed(&iter);
break;
}
- }
+ iter.k.size--;
+ iter.pos.offset++;
+ }
+next:
if (ob || ret)
break;
}
fail:
bch2_trans_iter_exit(trans, &iter);
- if (!ob && ret)
+ BUG_ON(ob && ret);
+
+ if (ret)
ob = ERR_PTR(ret);
if (!ob && alloc_start > ca->mi.first_bucket) {
@@ -544,8 +463,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
goto again;
}
- *dev_alloc_cursor = alloc_cursor;
-
return ob;
}
@@ -595,6 +512,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
* @watermark: how important is this allocation?
* @data_type: BCH_DATA_journal, btree, user...
* @cl: if not NULL, closure to be used to wait if buckets not available
+ * @nowait: if true, do not wait for buckets to become available
* @usage: for secondarily also returning the current device usage
*
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
@@ -629,6 +547,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
bch2_dev_do_invalidates(ca);
if (!avail) {
+ if (watermark > BCH_WATERMARK_normal &&
+ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ goto alloc;
+
if (cl && !waiting) {
closure_wait(&c->freelist_wait, cl);
waiting = true;
@@ -711,9 +633,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
unsigned i;
for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret.devs[ret.nr++] = i;
+ ret.data[ret.nr++] = i;
- bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+ bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
return ret;
}
@@ -785,18 +707,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct dev_alloc_list devs_sorted =
- bch2_dev_alloc_list(c, stripe, devs_may_alloc);
int ret = -BCH_ERR_insufficient_devices;
BUG_ON(*nr_effective >= nr_replicas);
- for (unsigned i = 0; i < devs_sorted.nr; i++) {
- struct bch_dev_usage usage;
- struct open_bucket *ob;
-
- unsigned dev = devs_sorted.devs[i];
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ darray_for_each(devs_sorted, i) {
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
if (!ca)
continue;
@@ -805,8 +722,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+ struct bch_dev_usage usage;
+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@@ -850,10 +768,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct dev_alloc_list devs_sorted;
- struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i, ec_idx;
int ret = 0;
if (nr_replicas < 2)
@@ -862,34 +776,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
if (ec_open_bucket(c, ptrs))
return 0;
- h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ struct ec_stripe_head *h =
+ bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
if (IS_ERR(h))
return PTR_ERR(h);
if (!h)
return 0;
- devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
-
- for (i = 0; i < devs_sorted.nr; i++)
- for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+ darray_for_each(devs_sorted, i)
+ for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;
- ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->dev == devs_sorted.devs[i] &&
- !test_and_set_bit(ec_idx, h->s->blocks_allocated))
- goto got_bucket;
+ struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
+ if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
+ ob->ec_idx = ec_idx;
+ ob->ec = h->s;
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, ob);
+ goto out;
+ }
}
- goto out_put_head;
-got_bucket:
- ob->ec_idx = ec_idx;
- ob->ec = h->s;
- ec_stripe_new_get(h->s, STRIPE_REF_io);
-
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
-out_put_head:
+out:
bch2_ec_stripe_head_put(c, h);
return ret;
}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 1a16fd5bd4f8..f25481a0d1a0 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *);
struct dev_alloc_list {
unsigned nr;
- u8 devs[BCH_SB_MEMBERS_MAX];
+ u8 data[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
@@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
struct bch_devs_mask *);
void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-long bch2_bucket_alloc_new_fs(struct bch_dev *);
-
static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
{
return bch2_dev_have_ref(c, ob->dev);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 654a58132a4d..ebeb6a5ff9d2 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -14,42 +14,8 @@
#include <linux/mm.h>
-static bool extent_matches_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct bpos bucket,
- struct bch_backpointer bp)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- rcu_read_lock();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket2;
- struct bch_backpointer bp2;
-
- if (p.ptr.cached)
- continue;
-
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- if (!ca)
- continue;
-
- bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2);
- if (bpos_eq(bucket, bucket2) &&
- !memcmp(&bp, &bp2, sizeof(bp))) {
- rcu_read_unlock();
- return true;
- }
- }
- rcu_read_unlock();
-
- return false;
-}
-
int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
int ret = 0;
@@ -59,67 +25,70 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
"backpointer level bad: %u >= %u",
bp.v->level, BTREE_MAX_DEPTH);
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (!ca) {
- /* these will be caught by fsck */
- rcu_read_unlock();
- return 0;
- }
-
- struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p);
- struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset);
- rcu_read_unlock();
-
- bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
- !bpos_eq(bp.k->p, bp_pos),
- c, backpointer_bucket_offset_wrong,
- "backpointer bucket_offset wrong");
+ bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
+ c, backpointer_dev_bad,
+ "backpointer for BCH_SB_MEMBER_INVALID");
fsck_err:
return ret;
}
-void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
- bch2_btree_id_str(bp->btree_id),
- bp->level,
- (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- bp->bucket_len);
- bch2_bpos_to_text(out, bp->pos);
-}
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
if (ca) {
- struct bpos bucket = bp_pos_to_bucket(ca, k.k->p);
+ u32 bucket_offset;
+ struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
rcu_read_unlock();
- prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bucket);
- prt_str(out, " ");
+ prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
} else {
rcu_read_unlock();
+ prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
}
- bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+ bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+ prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
+ (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp.v->bucket_len,
+ bp.v->bucket_gen);
+ bch2_bpos_to_text(out, bp.v->pos);
}
void bch2_backpointer_swab(struct bkey_s k)
{
struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
- bp.v->bucket_offset = swab40(bp.v->bucket_offset);
bp.v->bucket_len = swab32(bp.v->bucket_len);
bch2_bpos_swab(&bp.v->pos);
}
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bkey_s_c_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bkey_i_backpointer bp2;
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);
+
+ if (bpos_eq(bp.k->p, bp2.k.p) &&
+ !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
+ return true;
+ }
+
+ return false;
+}
+
static noinline int backpointer_mod_err(struct btree_trans *trans,
- struct bch_backpointer bp,
- struct bkey_s_c bp_k,
struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *new_bp,
+ struct bkey_s_c found_bp,
bool insert)
{
struct bch_fs *c = trans->c;
@@ -127,12 +96,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
if (insert) {
prt_printf(&buf, "existing backpointer found when inserting ");
- bch2_backpointer_to_text(&buf, &bp);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
prt_printf(&buf, "found ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
+ bch2_bkey_val_to_text(&buf, c, found_bp);
prt_newline(&buf);
prt_printf(&buf, "for ");
@@ -144,11 +113,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
printbuf_indent_add(&buf, 2);
prt_printf(&buf, "searching for ");
- bch2_backpointer_to_text(&buf, &bp);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
prt_newline(&buf);
prt_printf(&buf, "got ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
+ bch2_bkey_val_to_text(&buf, c, found_bp);
prt_newline(&buf);
prt_printf(&buf, "for ");
@@ -167,161 +136,118 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket,
- struct bch_backpointer bp,
struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *bp,
bool insert)
{
struct btree_iter bp_iter;
- struct bkey_s_c k;
- struct bkey_i_backpointer *bp_k;
- int ret;
-
- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
- ret = PTR_ERR_OR_ZERO(bp_k);
- if (ret)
- return ret;
-
- bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
- bp_k->v = bp;
-
- if (!insert) {
- bp_k->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k->k, 0);
- }
-
- k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bp_k->k.p,
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bp->k.p,
BTREE_ITER_intent|
BTREE_ITER_slots|
BTREE_ITER_with_updates);
- ret = bkey_err(k);
+ int ret = bkey_err(k);
if (ret)
- goto err;
+ return ret;
if (insert
? k.k->type
: (k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
- ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
+ ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
if (ret)
goto err;
}
- ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+ if (!insert) {
+ bp->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp->k, 0);
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
err:
bch2_trans_iter_exit(trans, &bp_iter);
return ret;
}
-/*
- * Find the next backpointer >= *bp_offset:
- */
-int bch2_get_next_backpointer(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket, int gen,
- struct bpos *bp_pos,
- struct bch_backpointer *bp,
- unsigned iter_flags)
+static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
{
- struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0);
- struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
- struct bkey_s_c k;
- int ret = 0;
-
- if (bpos_ge(*bp_pos, bp_end_pos))
- goto done;
-
- if (gen >= 0) {
- k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_cached|iter_flags);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- if (k.k->type != KEY_TYPE_alloc_v4 ||
- bkey_s_c_to_alloc_v4(k).v->gen != gen)
- goto done;
- }
-
- *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0));
-
- for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
- *bp_pos, iter_flags, k, ret) {
- if (bpos_ge(k.k->p, bp_end_pos))
- break;
+ return (likely(!bch2_backpointers_no_use_write_buffer)
+ ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
+ : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
- *bp_pos = k.k->p;
- *bp = *bkey_s_c_to_backpointer(k).v;
- goto out;
- }
-done:
- *bp_pos = SPOS_MAX;
-out:
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
+static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c visiting_k,
+ struct bkey_buf *last_flushed)
+{
+ return likely(!bch2_backpointers_no_use_write_buffer)
+ ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
+ : 0;
}
-static void backpointer_not_found(struct btree_trans *trans,
- struct bpos bp_pos,
- struct bch_backpointer bp,
- struct bkey_s_c k)
+static int backpointer_target_not_found(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_s_c target_k,
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ int ret = 0;
/*
* If we're using the btree write buffer, the backpointer we were
* looking at may have already been deleted - failure to find what it
* pointed to is not an error:
*/
- if (likely(!bch2_backpointers_no_use_write_buffer))
- return;
-
- struct bpos bucket;
- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
- return;
+ ret = last_flushed
+ ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
+ : 0;
+ if (ret)
+ return ret;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
- bp.level ? "btree node" : "extent");
- prt_printf(&buf, "bucket: ");
- bch2_bpos_to_text(&buf, bucket);
- prt_printf(&buf, "\n ");
+ bp.v->level ? "btree node" : "extent");
+ bch2_bkey_val_to_text(&buf, c, bp.s_c);
- prt_printf(&buf, "backpointer pos: ");
- bch2_bpos_to_text(&buf, bp_pos);
prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, target_k);
- bch2_backpointer_to_text(&buf, &bp);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
- bch_err_ratelimited(c, "%s", buf.buf);
- else
- bch2_trans_inconsistent(trans, "%s", buf.buf);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
+ if (p.ptr.dev == bp.k->p.inode) {
+ prt_printf(&buf, "\n ");
+ struct bkey_i_backpointer bp2;
+ bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
+ }
+ if (fsck_err(trans, backpointer_to_missing_ptr,
+ "%s", buf.buf))
+ ret = bch2_backpointer_del(trans, bp.k->p);
+fsck_err:
printbuf_exit(&buf);
+ return ret;
}
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
struct btree_iter *iter,
- struct bpos bp_pos,
- struct bch_backpointer bp,
- unsigned iter_flags)
+ unsigned iter_flags,
+ struct bkey_buf *last_flushed)
{
- if (likely(!bp.level)) {
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = trans->c;
- struct bpos bucket;
- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
- return bkey_s_c_err(-EIO);
+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
+ return bkey_s_c_null;
+ if (likely(!bp.v->level)) {
bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
+ bp.v->btree_id,
+ bp.v->pos,
0, 0,
iter_flags);
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
@@ -330,67 +256,64 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
return k;
}
- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ if (k.k &&
+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
return k;
bch2_trans_iter_exit(trans, iter);
- backpointer_not_found(trans, bp_pos, bp, k);
- return bkey_s_c_null;
+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
} else {
- struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+ struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+ if (IS_ERR_OR_NULL(b))
+ return ((struct bkey_s_c) { .k = ERR_CAST(b) });
- if (IS_ERR_OR_NULL(b)) {
- bch2_trans_iter_exit(trans, iter);
- return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
- }
return bkey_i_to_s_c(&b->key);
}
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
struct btree_iter *iter,
- struct bpos bp_pos,
- struct bch_backpointer bp)
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- BUG_ON(!bp.level);
-
- struct bpos bucket;
- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
- return ERR_PTR(-EIO);
+ BUG_ON(!bp.v->level);
bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
+ bp.v->btree_id,
+ bp.v->pos,
0,
- bp.level - 1,
+ bp.v->level - 1,
0);
struct btree *b = bch2_btree_iter_peek_node(iter);
if (IS_ERR_OR_NULL(b))
goto err;
- BUG_ON(b->c.level != bp.level - 1);
+ BUG_ON(b->c.level != bp.v->level - 1);
- if (extent_matches_bp(c, bp.btree_id, bp.level,
- bkey_i_to_s_c(&b->key),
- bucket, bp))
+ if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
+ bkey_i_to_s_c(&b->key), bp))
return b;
if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
- backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
- b = NULL;
+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
+ b = ret ? ERR_PTR(ret) : NULL;
}
err:
bch2_trans_iter_exit(trans, iter);
return b;
}
-static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
- struct bkey_s_c k)
+static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
+ struct bkey_buf *last_flushed)
{
+ if (k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
struct bch_fs *c = trans->c;
struct btree_iter alloc_iter = { NULL };
struct bkey_s_c alloc_k;
@@ -399,10 +322,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
struct bpos bucket;
if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
+ if (ret)
+ goto out;
+
if (fsck_err(trans, backpointer_to_missing_device,
"backpointer for missing device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ ret = bch2_backpointer_del(trans, k.k->p);
goto out;
}
@@ -411,13 +338,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
if (ret)
goto out;
- if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4,
- trans, backpointer_to_missing_alloc,
- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
- alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, bp_iter, 0);
- goto out;
+ if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, backpointer_to_missing_alloc,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_backpointer_del(trans, k.k->p);
}
out:
fsck_err:
@@ -429,18 +359,24 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_btree_backpointer(trans, &iter, k)));
+ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
}
struct extents_to_bp_state {
- struct bpos bucket_start;
- struct bpos bucket_end;
+ struct bpos bp_start;
+ struct bpos bp_end;
struct bkey_buf last_flushed;
};
@@ -501,9 +437,13 @@ static int check_extent_checksum(struct btree_trans *trans,
goto err;
prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
- prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
+ prt_printf(&buf, "\n ");
+ bch2_btree_id_to_text(&buf, btree);
+ prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, extent);
- prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
+ prt_printf(&buf, "\n ");
+ bch2_btree_id_to_text(&buf, o_btree);
+ prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, extent2);
struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
@@ -524,41 +464,25 @@ static int check_extent_checksum(struct btree_trans *trans,
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
- struct bpos bucket,
- struct bch_backpointer bp,
+ struct bkey_i_backpointer *bp,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = {};
struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
- struct bkey_s_c bp_k;
- int ret = 0;
- struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket);
- if (!ca) {
- prt_str(&buf, "extent for nonexistent device:bucket ");
- bch2_bpos_to_text(&buf, bucket);
- prt_str(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- bch_err(c, "%s", buf.buf);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
- }
-
- if (bpos_lt(bucket, s->bucket_start) ||
- bpos_gt(bucket, s->bucket_end))
- goto out;
+ if (bpos_lt(bp->k.p, s->bp_start) ||
+ bpos_gt(bp->k.p, s->bp_end))
+ return 0;
- bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp(ca, bucket, bp.bucket_offset),
- 0);
- ret = bkey_err(bp_k);
+ struct btree_iter bp_iter;
+ struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
+ int ret = bkey_err(bp_k);
if (ret)
goto err;
if (bp_k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
if (ret)
goto err;
@@ -570,7 +494,6 @@ static int check_bp_exists(struct btree_trans *trans,
fsck_err:
bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
- bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
check_existing_bp:
@@ -578,10 +501,10 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer)
goto missing;
- struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+ struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
struct bkey_s_c other_extent =
- bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+ bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
ret = bkey_err(other_extent);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
ret = 0;
@@ -600,19 +523,23 @@ static int check_bp_exists(struct btree_trans *trans,
bch_err(c, "%s", buf.buf);
if (other_extent.k->size <= orig_k.k->size) {
- ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+ ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+ other_extent, bp->k.p.inode);
if (ret)
goto err;
goto out;
} else {
- ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+ ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
if (ret)
goto err;
goto missing;
}
}
- ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+ ret = check_extent_checksum(trans,
+ other_bp.v->btree_id, other_extent,
+ bp->v.btree_id, orig_k,
+ bp->k.p.inode);
if (ret < 0)
goto err;
if (ret) {
@@ -620,7 +547,8 @@ static int check_bp_exists(struct btree_trans *trans,
goto missing;
}
- ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+ ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
+ other_bp.v->btree_id, other_extent, bp->k.p.inode);
if (ret < 0)
goto err;
if (ret) {
@@ -629,7 +557,7 @@ static int check_bp_exists(struct btree_trans *trans,
}
printbuf_reset(&buf);
- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode);
bch2_bkey_val_to_text(&buf, c, orig_k);
prt_str(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, other_extent);
@@ -638,21 +566,15 @@ static int check_bp_exists(struct btree_trans *trans,
goto err;
missing:
printbuf_reset(&buf);
- prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
- bch2_btree_id_str(bp.btree_id), bp.level);
+ prt_str(&buf, "missing backpointer\n for: ");
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\n got: ");
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
+ prt_printf(&buf, "\n got: ");
bch2_bkey_val_to_text(&buf, c, bp_k);
- struct bkey_i_backpointer n_bp_k;
- bkey_backpointer_init(&n_bp_k.k_i);
- n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
- n_bp_k.v = bp;
- prt_printf(&buf, "\n want: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
-
if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
+ ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
goto out;
}
@@ -663,31 +585,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- int ret;
- ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos = POS_MIN;
- struct bch_backpointer bp;
-
if (p.ptr.cached)
continue;
+ if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
+ continue;
+
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- if (ca)
- bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
+ bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
+ bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
rcu_read_unlock();
- if (!ca)
- continue;
+ if (check || empty) {
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
- ret = check_bp_exists(trans, s, bucket_pos, bp, k);
- if (ret)
- return ret;
+ int ret = check
+ ? check_bp_exists(trans, s, &bp, k)
+ : bch2_bucket_backpointer_mod(trans, k, &bp, true);
+ if (ret)
+ return ret;
+ }
}
return 0;
@@ -896,54 +820,330 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
return 0;
}
+enum alloc_sector_counter {
+ ALLOC_dirty,
+ ALLOC_cached,
+ ALLOC_stripe,
+ ALLOC_SECTORS_NR
+};
+
+static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
+{
+ switch (t) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ return ALLOC_dirty;
+ case BCH_DATA_cached:
+ return ALLOC_cached;
+ case BCH_DATA_stripe:
+ return ALLOC_stripe;
+ default:
+ BUG();
+ }
+}
+
+static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
+
+static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ bool need_commit = false;
+
+ if (a->data_type == BCH_DATA_sb ||
+ a->data_type == BCH_DATA_journal ||
+ a->data_type == BCH_DATA_parity)
+ return 0;
+
+ u32 sectors[ALLOC_SECTORS_NR];
+ memset(sectors, 0, sizeof(sectors));
+
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
+ if (!ca)
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c bp_k;
+ int ret = 0;
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, alloc_k.k->p),
+ bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
+ (bp.v->bucket_gen != a->gen ||
+ bp.v->pad)) {
+ ret = bch2_backpointer_del(trans, bp_k.k->p);
+ if (ret)
+ break;
+
+ need_commit = true;
+ continue;
+ }
+
+ if (bp.v->bucket_gen != a->gen)
+ continue;
+
+ sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+ };
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ if (need_commit) {
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+ }
+
+ /* Cached pointers don't have backpointers: */
+
+ if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+ sectors[ALLOC_stripe] != a->stripe_sectors) {
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
+ ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
+ if (ret)
+ goto err;
+ }
+
+ if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+ sectors[ALLOC_stripe] > a->stripe_sectors) {
+ ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
+ if (!sectors[ALLOC_dirty] &&
+ !sectors[ALLOC_stripe])
+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
+ else
+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
+ }
+err:
+ bch2_dev_put(ca);
+ return ret;
+}
+
+static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr_v2: {
+ bool ret = false;
+
+ rcu_read_lock();
+ struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
+ while (pos.inode <= k.k->p.inode) {
+ if (pos.inode >= c->sb.nr_devices)
+ break;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
+ if (!ca)
+ goto next;
+
+ struct bpos bucket = bp_pos_to_bucket(ca, pos);
+ bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
+ ca->mi.nbuckets, bucket.offset);
+ if (bucket.offset == ca->mi.nbuckets)
+ goto next;
+
+ ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
+ if (ret)
+ break;
+next:
+ pos = SPOS(pos.inode + 1, 0, 0);
+ }
+ rcu_read_unlock();
+
+ return ret;
+ }
+ case KEY_TYPE_btree_ptr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
+ enum btree_id btree, unsigned level)
+{
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ if (b)
+ bch2_node_pin(trans->c, b);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
+ struct bpos start, struct bpos *end)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ struct bkey_buf tmp;
+ bch2_bkey_buf_init(&tmp);
+
+ bch2_btree_cache_unpin(c);
+
+ *end = SPOS_MAX;
+
+ s64 mem_may_pin = mem_may_pin_bytes(c);
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
+ 0, 1, BTREE_ITER_prefetch);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ if (!backpointer_node_has_missing(c, k))
+ continue;
+
+ mem_may_pin -= c->opts.btree_node_size;
+ if (mem_may_pin <= 0)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ BUG_ON(path->level != 1);
+
+ bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
+ }));
+ if (ret)
+ return ret;
+
+ struct bpos pinned = SPOS_MAX;
+ mem_may_pin = mem_may_pin_bytes(c);
+ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
+ 0, 1, BTREE_ITER_prefetch);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ if (!backpointer_node_has_missing(c, k))
+ continue;
+
+ mem_may_pin -= c->opts.btree_node_size;
+ if (mem_may_pin <= 0) {
+ *end = pinned;
+ break;
+ }
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ BUG_ON(path->level != 1);
+
+ int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);
+
+ if (!ret2)
+ pinned = tmp.k->k.p;
+
+ ret;
+ }));
+ if (ret)
+ return ret;
+
+ return ret;
+}
+
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
+ int ret = 0;
+
+ /*
+ * Can't allow devices to come/go/resize while we have bucket bitmaps
+ * allocated
+ */
+ lockdep_assert_held(&c->state_lock);
+
+ for_each_member_device(c, ca) {
+ BUG_ON(ca->bucket_backpointer_mismatches);
+ ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!ca->bucket_backpointer_mismatches ||
+ !ca->bucket_backpointer_empty) {
+ bch2_dev_put(ca);
+ ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
+ goto err_free_bitmaps;
+ }
+ }
+
struct btree_trans *trans = bch2_trans_get(c);
- struct extents_to_bp_state s = { .bucket_start = POS_MIN };
- int ret;
+ struct extents_to_bp_state s = { .bp_start = POS_MIN };
bch2_bkey_buf_init(&s.last_flushed);
bkey_init(&s.last_flushed.k->k);
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_prefetch, k, ({
+ check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
+ }));
+ if (ret)
+ goto err;
+
+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
+ for_each_member_device(c, ca) {
+ nr_buckets += ca->mi.nbuckets;
+ nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
+ nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
+ }
+
+ if (!nr_mismatches && !nr_empty)
+ goto err;
+
+ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
+ nr_mismatches + nr_empty, nr_buckets);
+
while (1) {
- struct bbpos end;
- ret = bch2_get_btree_in_memory_pos(trans,
- BIT_ULL(BTREE_ID_backpointers),
- BIT_ULL(BTREE_ID_backpointers),
- BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
+ ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
if (ret)
break;
- s.bucket_end = end.pos;
-
- if ( bpos_eq(s.bucket_start, POS_MIN) &&
- !bpos_eq(s.bucket_end, SPOS_MAX))
+ if ( bpos_eq(s.bp_start, POS_MIN) &&
+ !bpos_eq(s.bp_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(s.bucket_start, POS_MIN) ||
- !bpos_eq(s.bucket_end, SPOS_MAX)) {
+ if (!bpos_eq(s.bp_start, POS_MIN) ||
+ !bpos_eq(s.bp_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, s.bucket_start);
+ bch2_bpos_to_text(&buf, s.bp_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, s.bucket_end);
+ bch2_bpos_to_text(&buf, s.bp_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
ret = bch2_check_extents_to_backpointers_pass(trans, &s);
- if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
+ if (ret || bpos_eq(s.bp_end, SPOS_MAX))
break;
- s.bucket_start = bpos_successor(s.bucket_end);
+ s.bp_start = bpos_successor(s.bp_end);
}
+err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
-
bch2_btree_cache_unpin(c);
+err_free_bitmaps:
+ for_each_member_device(c, ca) {
+ kvfree(ca->bucket_backpointer_empty);
+ ca->bucket_backpointer_empty = NULL;
+ kvfree(ca->bucket_backpointer_mismatches);
+ ca->bucket_backpointer_mismatches = NULL;
+ }
bch_err_fn(c, ret);
return ret;
@@ -959,44 +1159,43 @@ static int check_one_backpointer(struct btree_trans *trans,
return 0;
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
struct bbpos pos = bp_to_bbpos(*bp.v);
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret;
if (bbpos_cmp(pos, start) < 0 ||
bbpos_cmp(pos, end) > 0)
return 0;
- k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
+ int ret = bkey_err(k);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
return 0;
if (ret)
return ret;
- if (!k.k) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_ptr,
- "backpointer for missing %s\n %s",
- bp.v->level ? "btree node" : "extent",
- (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
- ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
- goto out;
- }
- }
-out:
-fsck_err:
bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
return ret;
}
+static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
+ struct bch_dev *ca, struct bpos bucket)
+{
+ u32 restart_count = trans->restart_count;
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket),
+ bucket_pos_to_bp_end(ca, bucket),
+ 0, k,
+ check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
+ );
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
@@ -1009,9 +1208,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
bkey_init(&last_flushed.k->k);
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
- int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
+ POS_MIN, BTREE_ITER_prefetch, k, ({
progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
check_one_backpointer(trans, start, end, k, &last_flushed);
}));
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 3b29fdf519dd..060dad1521ee 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -18,14 +18,14 @@ static inline u64 swab40(u64 x)
((x & 0xff00000000ULL) >> 32));
}
-int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags);
-void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
-void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
+ struct bkey_validate_context);
+void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_backpointer_swab(struct bkey_s);
#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
.key_validate = bch2_backpointer_validate, \
- .val_to_text = bch2_backpointer_k_to_text, \
+ .val_to_text = bch2_backpointer_to_text, \
.swab = bch2_backpointer_swab, \
.min_val_size = 32, \
})
@@ -43,22 +43,24 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}
+static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
+ u32 *bucket_offset)
+{
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
+}
+
static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
if (ca)
*bucket = bp_pos_to_bucket(ca, bp_pos);
rcu_read_unlock();
return ca != NULL;
}
-static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
- return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket),
- c, "backpointer for missing device %llu", bp_pos.inode);
-}
-
static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
struct bpos bucket,
u64 bucket_offset)
@@ -80,31 +82,35 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
return ret;
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *,
- struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool);
+static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
+{
+ return bucket_pos_to_bp(ca, bucket, 0);
+}
+
+static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
+{
+ return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
+ struct bkey_s_c,
+ struct bkey_i_backpointer *,
+ bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket,
- struct bch_backpointer bp,
struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *bp,
bool insert)
{
if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert);
-
- struct bkey_i_backpointer bp_k;
-
- bkey_backpointer_init(&bp_k.k_i);
- bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
- bp_k.v = bp;
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
if (!insert) {
- bp_k.k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k.k, 0);
+ bp->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp->k, 0);
}
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
}
static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
@@ -134,44 +140,29 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
}
}
-static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
- struct bpos *bucket_pos, struct bch_backpointer *bp,
- u64 sectors)
+ struct bkey_i_backpointer *bp)
{
- u32 bucket_offset;
- *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
- *bp = (struct bch_backpointer) {
+ bkey_backpointer_init(&bp->k_i);
+ bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
+ bp->v = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
.data_type = bch2_bkey_ptr_data_type(k, p, entry),
- .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
- p.crc.offset,
- .bucket_len = sectors,
+ .bucket_gen = p.ptr.gen,
+ .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
.pos = k.k->p,
};
}
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- struct bpos *bucket_pos, struct bch_backpointer *bp)
-{
- u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
-
- __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors);
-}
-
-int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
- struct bpos *, struct bch_backpointer *, unsigned);
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
- struct bpos, struct bch_backpointer,
- unsigned);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
- struct bpos, struct bch_backpointer);
+struct bkey_buf;
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
+ struct btree_iter *, unsigned, struct bkey_buf *);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
+ struct btree_iter *, struct bkey_buf *);
int bch2_check_btree_backpointers(struct bch_fs *);
int bch2_check_extents_to_backpointers(struct bch_fs *);
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
index be2edced5213..63abe17f35ea 100644
--- a/fs/bcachefs/bbpos.h
+++ b/fs/bcachefs/bbpos.h
@@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos)
static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
- prt_str(out, bch2_btree_id_str(pos.btree));
+ bch2_btree_id_to_text(out, pos.btree);
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index e94a83b8113e..161cf2f05d2a 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -205,6 +205,7 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
+#include "btree_journal_iter_types.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
@@ -293,6 +294,8 @@ do { \
#define bch_info(c, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_info_ratelimited(c, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
@@ -352,6 +355,12 @@ do { \
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)
+#define bch_verbose_ratelimited(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose) \
+ bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
#define pr_verbose_init(opts, fmt, ...) \
do { \
if (opt_get(opts, verbose)) \
@@ -538,20 +547,20 @@ struct bch_dev {
/*
* Buckets:
- * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
- * gc_gens_lock, for device resize - holding any is sufficient for
- * access: Or rcu_read_lock(), but only for dev_ptr_stale():
+ * Per-bucket arrays are protected by either rcu_read_lock or
+ * state_lock, for device resize.
*/
GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
- struct rw_semaphore bucket_lock;
+
+ unsigned long *bucket_backpointer_mismatches;
+ unsigned long *bucket_backpointer_empty;
struct bch_dev_usage __percpu *usage;
/* Allocator: */
- u64 new_fs_bucket_idx;
u64 alloc_cursor[3];
unsigned nr_open_buckets;
@@ -606,6 +615,7 @@ struct bch_dev {
x(going_ro) \
x(write_disable_complete) \
x(clean_shutdown) \
+ x(recovery_running) \
x(fsck_running) \
x(initial_gc_unfixed) \
x(need_delete_dead_snapshots) \
@@ -650,28 +660,6 @@ struct journal_seq_blacklist_table {
} entries[];
};
-struct journal_keys {
- /* must match layout in darray_types.h */
- size_t nr, size;
- struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
- enum btree_id btree_id:8;
- unsigned level:8;
- bool allocated;
- bool overwritten;
- struct bkey_i *k;
- } *data;
- /*
- * Gap buffer: instead of all the empty space in the array being at the
- * end of the buffer - from @nr to @size - the empty space is at @gap.
- * This means that sequential insertions are O(n) instead of O(n^2).
- */
- size_t gap;
- atomic_t ref;
- bool initial_ref_held;
-};
-
struct btree_trans_buf {
struct btree_trans *trans;
};
@@ -680,6 +668,7 @@ struct btree_trans_buf {
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
#define BCH_WRITE_REFS() \
+ x(journal) \
x(trans) \
x(write) \
x(promote) \
@@ -692,6 +681,7 @@ struct btree_trans_buf {
x(dio_write) \
x(discard) \
x(discard_fast) \
+ x(check_discard_freespace_key) \
x(invalidate) \
x(delete_dead_snapshots) \
x(gc_gens) \
@@ -734,6 +724,12 @@ struct bch_fs {
#else
struct percpu_ref writes;
#endif
+ /*
+ * Certain operations are only allowed in single threaded mode, during
+ * recovery, and we want to assert that this is the case:
+ */
+ struct task_struct *recovery_task;
+
/*
* Analagous to c->writes, for asynchronous ops that don't necessarily
* need fs to be read-write
@@ -764,6 +760,8 @@ struct bch_fs {
__uuid_t user_uuid;
u16 version;
+ u16 version_incompat;
+ u16 version_incompat_allowed;
u16 version_min;
u16 version_upgrade_complete;
@@ -834,9 +832,10 @@ struct bch_fs {
struct work_struct btree_interior_update_work;
struct workqueue_struct *btree_node_rewrite_worker;
-
- struct list_head pending_node_rewrites;
- struct mutex pending_node_rewrites_lock;
+ struct list_head btree_node_rewrites;
+ struct list_head btree_node_rewrites_pending;
+ spinlock_t btree_node_rewrites_lock;
+ struct closure_waitlist btree_node_rewrites_wait;
/* btree_io.c: */
spinlock_t btree_write_error_lock;
@@ -967,8 +966,7 @@ struct bch_fs {
struct rhashtable promote_table;
mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
- mempool_t decompress_workspace;
+ mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
struct crypto_shash *sha256;
@@ -1027,6 +1025,7 @@ struct bch_fs {
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
struct rhashtable vfs_inodes_table;
+ struct rhltable vfs_inodes_by_inum_table;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@@ -1048,10 +1047,12 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
+ enum bch_recovery_pass next_recovery_pass;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
enum bch_recovery_pass recovery_pass_done;
+ spinlock_t recovery_pass_lock;
struct semaphore online_fsck_mutex;
/* DEBUG JUNK */
@@ -1062,9 +1063,6 @@ struct bch_fs {
struct btree_node *verify_ondisk;
struct mutex verify_lock;
- u64 *unused_inode_hints;
- unsigned inode_shard_bits;
-
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
@@ -1086,8 +1084,6 @@ struct bch_fs {
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;
- unsigned copy_gc_enabled:1;
-
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 5004f6ba997c..0680930508a3 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k)
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33) \
- x(accounting, 34)
+ x(accounting, 34) \
+ x(inode_alloc_cursor, 35)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -463,7 +464,8 @@ struct bch_backpointer {
__u8 btree_id;
__u8 level;
__u8 data_type;
- __u64 bucket_offset:40;
+ __u8 bucket_gen;
+ __u32 pad;
__u32 bucket_len;
struct bpos pos;
} __packed __aligned(8);
@@ -499,8 +501,6 @@ struct bch_sb_field {
#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
-#include "dirent_format.h"
-#include "disk_groups_format.h"
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
@@ -679,7 +679,13 @@ struct bch_sb_field_ext {
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
- x(inode_has_child_snapshots, BCH_VERSION(1, 13))
+ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
+ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
+ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
+ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
+ x(inode_depth, BCH_VERSION(1, 17)) \
+ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
+ x(autofix_errors, BCH_VERSION(1, 19))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -844,6 +850,10 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
struct bch_sb, flags[5], 0, 16);
LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
struct bch_sb, flags[5], 16, 32);
+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
+LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
+ struct bch_sb, flags[5], 48, 64);
+LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -896,21 +906,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(new_varint, 15) \
x(journal_no_flush, 16) \
x(alloc_v2, 17) \
- x(extents_across_btree_nodes, 18)
+ x(extents_across_btree_nodes, 18) \
+ x(incompat_version_field, 19)
#define BCH_SB_FEATURES_ALWAYS \
- ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
- (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
- (1ULL << BCH_FEATURE_btree_updates_journalled)|\
- (1ULL << BCH_FEATURE_alloc_v2)|\
- (1ULL << BCH_FEATURE_extents_across_btree_nodes))
+ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
+ BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
+ BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
+ BIT_ULL(BCH_FEATURE_alloc_v2)|\
+ BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
#define BCH_SB_FEATURES_ALL \
(BCH_SB_FEATURES_ALWAYS| \
- (1ULL << BCH_FEATURE_new_siphash)| \
- (1ULL << BCH_FEATURE_btree_ptr_v2)| \
- (1ULL << BCH_FEATURE_new_varint)| \
- (1ULL << BCH_FEATURE_journal_no_flush))
+ BIT_ULL(BCH_FEATURE_new_siphash)| \
+ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
+ BIT_ULL(BCH_FEATURE_new_varint)| \
+ BIT_ULL(BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1032,7 +1043,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
x(crc64, 2) \
x(xxhash, 3)
-enum bch_csum_opts {
+enum bch_csum_opt {
#define x(t, n) BCH_CSUM_OPT_##t = n,
BCH_CSUM_OPTS()
#undef x
@@ -1221,6 +1232,15 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
+static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
+{
+ unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
+
+ while (b && !l->d[b - 1])
+ --b;
+ return b;
+}
+
struct jset_entry_datetime {
struct jset_entry entry;
__le64 seconds;
@@ -1268,14 +1288,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
/* Btree: */
enum btree_id_flags {
- BTREE_ID_EXTENTS = BIT(0),
- BTREE_ID_SNAPSHOTS = BIT(1),
- BTREE_ID_SNAPSHOT_FIELD = BIT(2),
- BTREE_ID_DATA = BIT(3),
+ BTREE_IS_extents = BIT(0),
+ BTREE_IS_snapshots = BIT(1),
+ BTREE_IS_snapshot_field = BIT(2),
+ BTREE_IS_data = BIT(3),
+ BTREE_IS_write_buffer = BIT(4),
};
#define BCH_BTREE_IDS() \
- x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+ x(extents, 0, \
+ BTREE_IS_extents| \
+ BTREE_IS_snapshots| \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_error)| \
BIT_ULL(KEY_TYPE_cookie)| \
@@ -1283,17 +1307,20 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_reservation)| \
BIT_ULL(KEY_TYPE_reflink_p)| \
BIT_ULL(KEY_TYPE_inline_data)) \
- x(inodes, 1, BTREE_ID_SNAPSHOTS, \
+ x(inodes, 1, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_inode)| \
BIT_ULL(KEY_TYPE_inode_v2)| \
BIT_ULL(KEY_TYPE_inode_v3)| \
BIT_ULL(KEY_TYPE_inode_generation)) \
- x(dirents, 2, BTREE_ID_SNAPSHOTS, \
+ x(dirents, 2, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
BIT_ULL(KEY_TYPE_dirent)) \
- x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
+ x(xattrs, 3, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
@@ -1307,7 +1334,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_quota)) \
x(stripes, 6, 0, \
BIT_ULL(KEY_TYPE_stripe)) \
- x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
+ x(reflink, 7, \
+ BTREE_IS_extents| \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_reflink_v)| \
BIT_ULL(KEY_TYPE_indirect_inline_data)| \
BIT_ULL(KEY_TYPE_error)) \
@@ -1315,28 +1344,38 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
BIT_ULL(KEY_TYPE_snapshot)) \
- x(lru, 10, 0, \
+ x(lru, 10, \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
- x(freespace, 11, BTREE_ID_EXTENTS, \
+ x(freespace, 11, \
+ BTREE_IS_extents, \
BIT_ULL(KEY_TYPE_set)) \
x(need_discard, 12, 0, \
BIT_ULL(KEY_TYPE_set)) \
- x(backpointers, 13, 0, \
+ x(backpointers, 13, \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_backpointer)) \
x(bucket_gens, 14, 0, \
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ x(deleted_inodes, 16, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert)) \
- x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_logged_op_finsert)| \
+ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
+ x(rebalance_work, 18, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
BIT_ULL(KEY_TYPE_set)) \
- x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
+ x(accounting, 20, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
@@ -1361,6 +1400,8 @@ static inline bool btree_id_is_alloc(enum btree_id id)
case BTREE_ID_need_discard:
case BTREE_ID_freespace:
case BTREE_ID_bucket_gens:
+ case BTREE_ID_lru:
+ case BTREE_ID_accounting:
return true;
default:
return false;
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 41df24a53d97..054e2d5e8448 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -9,13 +9,6 @@
#include "util.h"
#include "vstructs.h"
-enum bch_validate_flags {
- BCH_VALIDATE_write = BIT(0),
- BCH_VALIDATE_commit = BIT(1),
- BCH_VALIDATE_journal = BIT(2),
- BCH_VALIDATE_silent = BIT(3),
-};
-
#if 0
/*
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index e7ac227ba7e8..15c93576b5c2 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = {
};
static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
return 0;
}
@@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
})
static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -59,7 +59,7 @@ static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
})
static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
return 0;
}
@@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
})
static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
return 0;
}
@@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = {
};
int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
return 0;
@@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
if (!ops->key_validate)
return 0;
- ret = ops->key_validate(c, k, flags);
+ ret = ops->key_validate(c, k, from);
fsck_err:
return ret;
}
@@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type)
}
int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
+ enum btree_node_type type = __btree_node_type(from.level, from.btree);
+
if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
return 0;
@@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
return 0;
bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
- (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) &&
+ (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
@@ -228,15 +229,15 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
}
int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
- return __bch2_bkey_validate(c, k, type, flags) ?:
- bch2_bkey_val_validate(c, k, flags);
+ return __bch2_bkey_validate(c, k, from) ?:
+ bch2_bkey_val_validate(c, k, from);
}
int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k, enum bch_validate_flags flags)
+ struct bkey_s_c k,
+ struct bkey_validate_context from)
{
int ret = 0;
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 018fb72e32d3..bf34111cdf00 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops;
*/
struct bkey_ops {
int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags);
+ struct bkey_validate_context from);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
@@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
: &bch2_bkey_null_ops;
}
-int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
-int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bch_validate_flags);
-int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bch_validate_flags);
+int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context from);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
index c9ae9e42b385..b4f328f9853c 100644
--- a/fs/bcachefs/bkey_types.h
+++ b/fs/bcachefs/bkey_types.h
@@ -210,4 +210,32 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
BCH_BKEY_TYPES();
#undef x
+enum bch_validate_flags {
+ BCH_VALIDATE_write = BIT(0),
+ BCH_VALIDATE_commit = BIT(1),
+ BCH_VALIDATE_silent = BIT(2),
+};
+
+#define BKEY_VALIDATE_CONTEXTS() \
+ x(unknown) \
+ x(superblock) \
+ x(journal) \
+ x(btree_root) \
+ x(btree_node) \
+ x(commit)
+
+struct bkey_validate_context {
+ enum {
+#define x(n) BKEY_VALIDATE_##n,
+ BKEY_VALIDATE_CONTEXTS()
+#undef x
+ } from:8;
+ enum bch_validate_flags flags:8;
+ u8 level;
+ enum btree_id btree;
+ bool root:1;
+ unsigned journal_offset;
+ u64 journal_seq;
+};
+
#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 7123019ab3bc..672ca2c1d37d 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -222,7 +222,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b)
struct btree_cache *bc = &c->btree_cache;
mutex_lock(&bc->lock);
- BUG_ON(!__btree_node_pinned(bc, b));
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
set_btree_node_pinned(b);
list_move(&b->list, &bc->live[1].list);
@@ -326,7 +325,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans,
if (!IS_ERR_OR_NULL(b)) {
mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
@@ -1004,16 +1003,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
return;
prt_printf(&buf,
- "btree node header doesn't match ptr\n"
- "btree %s level %u\n"
- "ptr: ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ "btree node header doesn't match ptr: ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, "\nptr: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_printf(&buf, "\nheader: btree %s level %llu\n"
- "min ",
- bch2_btree_id_str(BTREE_NODE_ID(b->data)),
- BTREE_NODE_LEVEL(b->data));
+ prt_str(&buf, "\nheader: ");
+ bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
+ prt_str(&buf, "\nmin ");
bch2_bpos_to_text(&buf, b->data->min_key);
prt_printf(&buf, "\nmax ");
@@ -1133,7 +1130,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_error);
+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1223,7 +1220,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_error);
+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1305,7 +1302,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- b = ERR_PTR(-BCH_ERR_btree_node_read_error);
+ b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
goto out;
}
@@ -1398,13 +1395,31 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
prt_printf(out, "(unknown btree %u)", btree);
}
+void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
+{
+ prt_str(out, "btree=");
+ bch2_btree_id_to_text(out, btree);
+ prt_printf(out, " level=%u", level);
+}
+
+void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
+ enum btree_id btree, unsigned level, struct bkey_s_c k)
+{
+ bch2_btree_id_to_text(out, btree);
+ prt_printf(out, " level %u/", level);
+ struct btree_root *r = bch2_btree_id_root(c, btree);
+ if (r)
+ prt_printf(out, "%u", r->level);
+ else
+ prt_printf(out, "(unknown)");
+ prt_printf(out, "\n ");
+
+ bch2_bkey_val_to_text(out, c, k);
+}
+
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
- prt_printf(out, "%s level %u/%u\n ",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level,
- bch2_btree_id_root(c, b->c.btree_id)->level);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
@@ -1478,8 +1493,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
- prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
+ bch2_btree_id_to_text(out, i);
+ prt_printf(out, "\t");
+ prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
+ prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
+ }
prt_newline(out);
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 66e86d1a178d..ca3c1b145330 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -128,19 +128,27 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i
} else {
unsigned idx = id - BTREE_ID_NR;
- EBUG_ON(idx >= c->btree_roots_extra.nr);
+ /* This can happen when we're called from btree_node_scan */
+ if (idx >= c->btree_roots_extra.nr)
+ return NULL;
+
return &c->btree_roots_extra.data[idx];
}
}
static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
{
- return bch2_btree_id_root(c, b->c.btree_id)->b;
+ struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
+
+ return r ? r->b : NULL;
}
-const char *bch2_btree_id_str(enum btree_id);
+const char *bch2_btree_id_str(enum btree_id); /* avoid */
void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
+void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
+void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
+ enum btree_id, unsigned, struct bkey_s_c);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 81dcf9e512c0..dd1d9b74076e 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -29,6 +29,7 @@
#include "move.h"
#include "recovery_passes.h"
#include "reflink.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
@@ -56,8 +57,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
{
prt_str(out, bch2_gc_phase_strs[p->phase]);
prt_char(out, ' ');
- bch2_btree_id_to_text(out, p->btree);
- prt_printf(out, " l=%u ", p->level);
+ bch2_btree_id_level_to_text(out, p->btree, p->level);
+ prt_char(out, ' ');
bch2_bpos_to_text(out, p->pos);
}
@@ -209,8 +210,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
if (bpos_eq(expected_start, cur->data->min_key))
return 0;
- prt_printf(&buf, " at btree %s level %u:\n parent: ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ prt_printf(&buf, " at ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_printf(&buf, ":\n parent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (prev) {
@@ -277,8 +279,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
if (bpos_eq(child->key.k.p, b->key.k.p))
return 0;
- prt_printf(&buf, "at btree %s level %u:\n parent: ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ prt_printf(&buf, " at ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_printf(&buf, ":\n parent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, "\n child: ");
@@ -341,14 +344,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
ret = PTR_ERR_OR_ZERO(cur);
printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
+ prt_char(&buf, ' ');
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- trans, btree_node_unreadable,
- "Topology repair: unreadable btree node at btree %s level %u:\n"
+ trans, btree_node_read_error,
+ "Topology repair: unreadable btree node at\n"
" %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
@@ -357,11 +360,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (ret)
break;
- if (!btree_id_is_alloc(b->c.btree_id)) {
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
- if (ret)
- break;
- }
+ ret = bch2_btree_lost_data(c, b->c.btree_id);
+ if (ret)
+ break;
continue;
}
@@ -370,7 +371,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
break;
if (bch2_btree_node_is_stale(c, cur)) {
- bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
+ bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
six_unlock_read(&cur->c.lock);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
@@ -478,14 +479,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
}
printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (mustfix_fsck_err_on(!have_child,
trans, btree_node_topology_interior_node_empty,
- "empty interior btree node at btree %s level %u\n"
- " %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level, buf.buf))
+ "empty interior btree node at %s", buf.buf))
ret = DROP_THIS_NODE;
err:
fsck_err:
@@ -511,6 +511,7 @@ int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bpos pulled_from_scan = POS_MIN;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_trans_srcu_unlock(trans);
@@ -519,19 +520,22 @@ int bch2_check_topology(struct bch_fs *c)
struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false;
+ printbuf_reset(&buf);
+ bch2_btree_id_to_text(&buf, i);
+
if (r->error) {
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ ret = bch2_btree_lost_data(c, i);
if (ret)
break;
reconstruct_root:
- bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
r->alive = false;
r->error = 0;
if (!bch2_btree_has_scanned_nodes(c, i)) {
mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
+ "no nodes found for btree %s, continue?", buf.buf);
bch2_btree_root_alloc_fake_trans(trans, i, 0);
} else {
bch2_btree_root_alloc_fake_trans(trans, i, 1);
@@ -560,13 +564,14 @@ int bch2_check_topology(struct bch_fs *c)
if (!reconstructed_root)
goto reconstruct_root;
- bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
+ bch_err(c, "empty btree root %s", buf.buf);
bch2_btree_root_alloc_fake_trans(trans, i, 0);
r->alive = false;
ret = 0;
}
}
fsck_err:
+ printbuf_exit(&buf);
bch2_trans_put(trans);
return ret;
}
@@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id ids[BTREE_ID_NR];
+ struct printbuf buf = PRINTBUF;
unsigned i;
int ret = 0;
@@ -727,14 +733,9 @@ static int bch2_gc_btrees(struct bch_fs *c)
continue;
ret = bch2_gc_btree(trans, btree, true);
-
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- trans, btree_node_read_error,
- "btree node read error for %s",
- bch2_btree_id_str(btree)))
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
}
-fsck_err:
+
+ printbuf_exit(&buf);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
@@ -802,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
old = bch2_alloc_to_v4(k, &old_convert);
gc = new = *old;
- percpu_down_read(&c->mark_lock);
__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
old_gc = gc;
@@ -813,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
gc.data_type = old->data_type;
gc.dirty_sectors = old->dirty_sectors;
}
- percpu_up_read(&c->mark_lock);
/*
* gc.data_type doesn't yet include need_discard & need_gc_gen states -
@@ -831,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
* safe w.r.t. transaction restarts, so fixup the gc_bucket so
* we don't run it twice:
*/
- percpu_down_read(&c->mark_lock);
struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
gc_m->data_type = gc.data_type;
gc_m->dirty_sectors = gc.dirty_sectors;
- percpu_up_read(&c->mark_lock);
}
if (fsck_err_on(new.data_type != gc.data_type,
@@ -895,11 +892,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c)
for_each_member_device(c, ca) {
ret = bch2_trans_run(c,
- for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
POS(ca->dev_idx, ca->mi.nbuckets - 1),
BTREE_ITER_slots|BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_alloc_write_key(trans, &iter, ca, k)));
if (ret) {
bch2_dev_put(ca);
@@ -928,98 +925,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
return ret;
}
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- size_t *idx)
-{
- struct bch_fs *c = trans->c;
- const __le64 *refcount = bkey_refcount_c(k);
- struct printbuf buf = PRINTBUF;
- struct reflink_gc *r;
- int ret = 0;
-
- if (!refcount)
- return 0;
-
- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
- r->offset < k.k->p.offset)
- ++*idx;
-
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
-
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
- trans, reflink_v_refcount_wrong,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
- r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- if (!r->refcount)
- new->k.type = KEY_TYPE_deleted;
- else
- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
- ret = bch2_trans_update(trans, iter, new, 0);
- }
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_gc_reflink_done(struct bch_fs *c)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
- c->reflink_gc_nr = 0;
- return ret;
-}
-
-static int bch2_gc_reflink_start(struct bch_fs *c)
-{
- c->reflink_gc_nr = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
-
- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
- c->reflink_gc_nr++, GFP_KERNEL);
- if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
- break;
- }
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
@@ -1171,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
return -EROFS;
- percpu_down_read(&c->mark_lock);
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
@@ -1180,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
if (dev_ptr_stale(ca, ptr) > 16) {
rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
goto update;
}
}
@@ -1195,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
*gen = ptr->gen;
}
rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
return 0;
update:
u = bch2_bkey_make_mut(trans, iter, &k, 0);
@@ -1224,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev
return ret;
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
- alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
@@ -1337,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-void bch2_fs_gc_init(struct bch_fs *c)
+void bch2_fs_btree_gc_exit(struct bch_fs *c)
{
- seqcount_init(&c->gc_pos_lock);
+}
+int bch2_fs_btree_gc_init(struct bch_fs *c)
+{
+ seqcount_init(&c->gc_pos_lock);
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
+
+ init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
+ return 0;
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 8a47e8bd0791..9693a90a48a2 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_gens_async(struct bch_fs *);
-void bch2_fs_gc_init(struct bch_fs *);
+
+void bch2_fs_btree_gc_exit(struct bch_fs *);
+int bch2_fs_btree_gc_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 839d68802e42..e371e60e3133 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -25,9 +25,8 @@
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
- prt_printf(out, "btree=%s l=%u seq %llux\n",
- bch2_btree_id_str(BTREE_NODE_ID(bn)),
- (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
+ bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
+ prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
prt_str(out, "min: ");
bch2_bpos_to_text(out, bn->min_key);
prt_newline(out);
@@ -490,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
if (b->nsets == MAX_BSETS &&
!btree_node_write_in_flight(b) &&
should_compact_all(c, b)) {
- bch2_btree_node_write(c, b, SIX_LOCK_write,
- BTREE_WRITE_init_next_bset);
+ bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
reinit_iter = true;
}
@@ -832,13 +831,32 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
return ret;
}
+static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ enum bch_validate_flags flags)
+{
+ return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = flags
+ });
+}
+
static int bset_key_validate(struct bch_fs *c, struct btree *b,
struct bkey_s_c k,
- bool updated_range, int rw)
+ bool updated_range,
+ enum bch_validate_flags flags)
{
- return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?:
- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?:
- (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0);
+ struct bkey_validate_context from = (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = flags,
+ };
+ return __bch2_bkey_validate(c, k, from) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
+ (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
}
static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
@@ -855,7 +873,21 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent);
+ return !__bch2_bkey_validate(c, u.s_c,
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_silent
+ });
+}
+
+static inline int btree_node_read_bkey_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r)
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
}
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
@@ -918,7 +950,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
BSET_BIG_ENDIAN(i), write,
&b->format, k);
- if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+ if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
struct bkey up = bkey_unpack_key(b, prev);
printbuf_reset(&buf);
@@ -965,6 +997,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_node_need_rewrite(b);
}
fsck_err:
printbuf_exit(&buf);
@@ -1038,39 +1071,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
- struct nonce nonce;
bool first = !b->written;
- bool csum_bad;
- if (!b->written) {
+ if (first) {
+ bne = NULL;
i = &b->data->keys;
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- nonce = btree_nonce(i, b->written << 9);
+ if (i->seq != b->data->keys.seq)
+ break;
+ }
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- csum_bad = bch2_crc_cmp(b->data->csum, csum);
- if (csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ struct nonce nonce = btree_nonce(i, b->written << 9);
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
+ btree_err_on(!good_csum_type,
+ bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
+ ? -BCH_ERR_btree_node_read_err_must_retry
+ : -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ if (first) {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_bad_csum,
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
+ buf.buf));
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
@@ -1081,37 +1126,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sectors = vstruct_sectors(b->data, c->block_bits);
} else {
- bne = write_block(b);
- i = &bne->keys;
-
- if (i->seq != b->data->keys.seq)
- break;
-
- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- nonce = btree_nonce(i, b->written << 9);
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (ca && csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ bool csum_bad = bch2_crc_cmp(bne->csum, csum);
+ if (ca && csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_bad_csum,
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
+ buf.buf));
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
sectors = vstruct_sectors(bne, c->block_bits);
}
@@ -1216,7 +1250,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- ret = bch2_bkey_val_validate(c, u.s_c, READ);
+ ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys &&
!bversion_cmp(u.k->bversion, MAX_VERSION))) {
@@ -1226,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
+ set_btree_node_need_rewrite(b);
continue;
}
if (ret)
@@ -1339,13 +1374,18 @@ static void btree_node_read_work(struct work_struct *work)
rb->start_time);
bio_put(&rb->bio);
- if (saw_error &&
+ if ((saw_error ||
+ btree_node_need_rewrite(b)) &&
!btree_node_read_error(b) &&
c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->key.k.p);
- bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
- __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
+ if (saw_error) {
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
+ __func__, buf.buf);
+ }
bch2_btree_node_rewrite_async(c, b);
}
@@ -1933,7 +1973,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
bool saw_error;
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
- BKEY_TYPE_btree, WRITE);
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level + 1,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_write,
+ });
if (ret) {
bch2_fs_inconsistent(c, "invalid btree node key before write");
return ret;
@@ -2300,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
}
}
+void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
+ enum six_lock_type lock_type_held,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+
+ if (lock_type_held == SIX_LOCK_intent ||
+ (lock_type_held == SIX_LOCK_read &&
+ six_lock_tryupgrade(&b->c.lock))) {
+ __bch2_btree_node_write(c, b, flags);
+
+ /* don't cycle lock unnecessarily: */
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->c.lock)) {
+ bch2_btree_post_write_cleanup(c, b);
+ __bch2_btree_node_unlock_write(trans, b);
+ }
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
+ } else {
+ __bch2_btree_node_write(c, b, flags);
+ if (lock_type_held == SIX_LOCK_write &&
+ btree_node_just_written(b))
+ bch2_btree_post_write_cleanup(c, b);
+ }
+}
+
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
struct bucket_table *tbl;
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 9b01ca3de907..6f9e4a6dacf7 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -144,11 +144,13 @@ enum btree_write_flags {
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type, unsigned);
+void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
+ enum six_lock_type, unsigned);
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
enum six_lock_type lock_held)
{
- bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
+ bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
bool bch2_btree_flush_all_reads(struct bch_fs *);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index eef9b89c561d..367231ab1980 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
iter->pos.snapshot != iter->snapshot);
- BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
- bkey_gt(iter->pos, iter->k.p));
+ BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
+ !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
+ (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p)));
}
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
@@ -327,7 +329,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos)
{
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
struct btree_path *path;
struct trans_for_each_path_inorder_iter iter;
@@ -697,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans,
bch2_trans_revalidate_updates_in_node(trans, b);
}
+void bch2_trans_node_drop(struct btree_trans *trans,
+ struct btree *b)
+{
+ struct btree_path *path;
+ unsigned i, level = b->c.level;
+
+ trans_for_each_path(trans, path, i)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
/*
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
@@ -720,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+ struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
enum six_lock_type lock_type;
unsigned i;
int ret;
@@ -728,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
EBUG_ON(path->nodes_locked);
while (1) {
- b = READ_ONCE(*rootp);
+ struct btree *b = READ_ONCE(r->b);
+ if (unlikely(!b)) {
+ BUG_ON(!r->error);
+ return r->error;
+ }
+
path->level = READ_ONCE(b->c.level);
if (unlikely(path->level < depth_want)) {
@@ -748,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
ret = btree_node_lock(trans, path, &b->c,
path->level, lock_type, trace_ip);
if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
- continue;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
BUG();
}
- if (likely(b == READ_ONCE(*rootp) &&
+ if (likely(b == READ_ONCE(r->b) &&
b->c.level == path->level &&
!race_fault())) {
for (i = 0; i < path->level; i++)
@@ -825,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
bch2_bkey_buf_init(&tmp);
+ jiter->fail_if_too_many_whiteouts = true;
+
while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
@@ -1000,7 +1020,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
bch2_trans_unlock(trans);
cond_resched();
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1267,7 +1287,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
{
int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
EBUG_ON(!trans->paths[path_idx].ref);
trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
@@ -1427,17 +1447,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_
(void *) trans->last_begin_ip);
}
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct printbuf buf = PRINTBUF;
+ bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
+ panic("in transaction restart: %s, last restarted by\n%s",
+ bch2_err_str(trans->restarted),
+ buf.buf);
+#else
panic("in transaction restart: %s, last restarted by %pS\n",
bch2_err_str(trans->restarted),
(void *) trans->last_restarted_ip);
+#endif
}
-void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
{
- panic("trans should be locked, unlocked by %pS\n",
- (void *) trans->last_unlock_ip);
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
+ if (!trans->locked)
+ panic("trans should be locked, unlocked by %pS\n",
+ (void *) trans->last_unlock_ip);
+
+ BUG();
}
noinline __cold
@@ -1450,10 +1484,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- prt_printf(buf, "update: btree=%s cached=%u %pS\n",
- bch2_btree_id_str(i->btree_id),
- i->cached,
- (void *) i->ip_allocated);
+ prt_str(buf, "update: btree=");
+ bch2_btree_id_to_text(buf, i->btree_id);
+ prt_printf(buf, " cached=%u %pS\n",
+ i->cached,
+ (void *) i->ip_allocated);
prt_printf(buf, " old ");
bch2_bkey_val_to_text(buf, trans->c, old);
@@ -1486,13 +1521,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
- path->cached ? 'C' : 'B',
- bch2_btree_id_str(path->btree_id),
- path->level);
+ path->cached ? 'C' : 'B');
+ bch2_btree_id_level_to_text(out, path->btree_id, path->level);
+ prt_str(out, " pos ");
bch2_bpos_to_text(out, path->pos);
if (!path->cached && btree_node_locked(path, path->level)) {
@@ -1717,8 +1752,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
struct trans_for_each_path_inorder_iter iter;
btree_path_idx_t path_pos = 0, path_idx;
- bch2_trans_verify_not_unlocked(trans);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_trans_verify_locks(trans);
btree_trans_sort_paths(trans);
@@ -1833,7 +1867,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
!bkey_eq(path->pos, ck->key.pos));
*u = ck->k->k;
- k = bkey_i_to_s_c(ck->k);
+ k = (struct bkey_s_c) { u, &ck->k->v };
}
return k;
@@ -1843,7 +1877,6 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
return (struct bkey_s_c) { u, NULL };
}
-
void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
@@ -1870,7 +1903,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
int ret;
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
@@ -1945,7 +1978,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify(iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2101,7 +2134,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
{
struct btree_path *path = btree_iter_path(trans, iter);
- return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
path->level,
path->pos,
end_pos,
@@ -2124,21 +2157,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
}
static noinline
-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+void btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(path)->b->key.k.p);
-
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
- k = bkey_i_to_s_c(next_journal);
+ *k = bkey_i_to_s_c(next_journal);
}
+}
- return k;
+static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
+}
+
+static noinline
+void btree_trans_peek_prev_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek_prev(trans, iter,
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ *k = bkey_i_to_s_c(next_journal);
+ }
}
/*
@@ -2154,8 +2213,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
struct bkey_s_c k;
int ret;
- bch2_trans_verify_not_in_restart(trans);
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if ((iter->flags & BTREE_ITER_key_cache_fill) &&
bpos_eq(iter->pos, pos))
@@ -2184,10 +2242,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (k.k && !bkey_err(k)) {
- iter->k = u;
- k.k = &iter->k;
- }
+ if (!k.k)
+ return k;
+
+ if ((iter->flags & BTREE_ITER_all_snapshots) &&
+ !bpos_eq(pos, k.k->p))
+ return bkey_s_c_null;
+
+ iter->k = u;
+ k.k = &iter->k;
return k;
}
@@ -2201,8 +2264,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
bch2_btree_iter_verify(iter);
while (1) {
- struct btree_path_level *l;
-
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
@@ -2212,17 +2273,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out;
+ break;
}
struct btree_path *path = btree_iter_path(trans, iter);
- l = path_l(path);
+ struct btree_path_level *l = path_l(path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
btree_path_set_should_be_locked(trans, path);
@@ -2233,15 +2294,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
k = k2;
- ret = bkey_err(k);
- if (ret) {
+ if (bkey_err(k)) {
bch2_btree_iter_set_pos(iter, iter->pos);
- goto out;
+ break;
}
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- k = btree_trans_peek_journal(trans, iter, k);
+ btree_trans_peek_journal(trans, iter, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
@@ -2270,32 +2330,32 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* End of btree: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
}
-out:
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
/**
- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * bch2_btree_iter_peek_max() - returns first key greater than or equal to
* iterator's current position
* @iter: iterator to peek from
* @end: search limit: returns keys less than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
- struct bpos iter_pos;
+ struct bpos iter_pos = iter->pos;
int ret;
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
@@ -2304,8 +2364,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->update_path = 0;
}
- bch2_btree_iter_verify_entry_exit(iter);
-
while (1) {
k = __bch2_btree_iter_peek(iter, search_key);
if (unlikely(!k.k))
@@ -2313,75 +2371,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
if (unlikely(bkey_err(k)))
goto out_no_locked;
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode that requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- *
- * But we can't do the full check here, because bkey_start_pos()
- * isn't monotonically increasing before FILTER_SNAPSHOTS, and
- * that's what we check against in extents mode:
- */
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(k.k->p, end)
- : k.k->p.inode > end.inode))
- goto end;
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode that requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ *
+ * But we can't do the full check here, because bkey_start_pos()
+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+ * that's what we check against in extents mode:
+ */
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
+ ? bkey_gt(k.k->p, end)
+ : k.k->p.inode > end.inode))
+ goto end;
+
+ if (iter->update_path &&
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->update_path = 0;
+ }
- if (iter->update_path &&
- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
+ if ((iter->flags & BTREE_ITER_intent) &&
+ !(iter->flags & BTREE_ITER_is_extents) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
- if ((iter->flags & BTREE_ITER_filter_snapshots) &&
- (iter->flags & BTREE_ITER_intent) &&
- !(iter->flags & BTREE_ITER_is_extents) &&
- !iter->update_path) {
- struct bpos pos = k.k->p;
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
- if (pos.snapshot < iter->snapshot) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
+ pos.snapshot = iter->snapshot;
- pos.snapshot = iter->snapshot;
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+ }
/*
- * advance, same as on exit for iter->path, but only up
- * to snapshot
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
*/
- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
- iter->update_path = iter->path;
-
- iter->update_path = bch2_btree_path_set_pos(trans,
- iter->update_path, pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
+ if (!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
}
- }
-
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if ((iter->flags & BTREE_ITER_filter_snapshots) &&
- !bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_all_snapshots)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
}
/*
@@ -2451,127 +2509,204 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
return bch2_btree_iter_peek(iter);
}
-/**
- * bch2_btree_iter_peek_prev() - returns first key less than or equal to
- * iterator's current position
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bpos search_key = iter->pos;
- struct bkey_s_c k;
- struct bkey saved_k;
- const struct bch_val *saved_v;
- btree_path_idx_t saved_path = 0;
- int ret;
-
- bch2_trans_verify_not_unlocked(trans);
- EBUG_ON(btree_iter_path(trans, iter)->cached ||
- btree_iter_path(trans, iter)->level);
-
- if (iter->flags & BTREE_ITER_with_journal)
- return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
+ struct bkey_s_c k, k2;
bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_entry_exit(iter);
-
- if (iter->flags & BTREE_ITER_filter_snapshots)
- search_key.snapshot = U32_MAX;
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out_no_locked;
+ break;
}
struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree_path_level *l = path_l(path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ break;
+ }
+
+ btree_path_set_should_be_locked(trans, path);
- k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
- if (!k.k ||
- ((iter->flags & BTREE_ITER_is_extents)
- ? bpos_ge(bkey_start_pos(k.k), search_key)
- : bpos_gt(k.k->p, search_key)))
- k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+ if (!k.k || bpos_gt(k.k->p, search_key)) {
+ k = btree_path_level_prev(trans, path, l, &iter->k);
+
+ BUG_ON(k.k && bpos_gt(k.k->p, search_key));
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ if (bkey_err(k2)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ break;
+ }
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
+ btree_trans_peek_prev_journal(trans, iter, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_prev_updates(trans, iter, &k);
- if (likely(k.k)) {
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- if (k.k->p.snapshot == iter->snapshot)
- goto got_key;
+ if (likely(k.k && !bkey_deleted(k.k))) {
+ break;
+ } else if (k.k) {
+ search_key = bpos_predecessor(k.k->p);
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
+ k = bkey_s_c_null;
+ break;
+ }
+ }
+
+ bch2_btree_iter_verify(iter);
+ return k;
+}
+
+/**
+ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys greater than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
+{
+ if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
+ !bkey_eq(iter->pos, POS_MAX)) {
+ /*
+ * bkey_start_pos(), for extents, is not monotonically
+ * increasing until after filtering for snapshots:
+ *
+ * Thus, for extents we need to search forward until we find a
+	 * real visible extent - easiest to just use peek_slot() (which
+ * internally uses peek() for extents)
+ */
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k))
+ return k;
+
+ if (!bkey_deleted(k.k) &&
+ (!(iter->flags & BTREE_ITER_is_extents) ||
+ bkey_lt(bkey_start_pos(k.k), iter->pos)))
+ return k;
+ }
+
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct bkey_s_c k;
+ btree_path_idx_t saved_path = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
+
+ while (1) {
+ k = __bch2_btree_iter_peek_prev(iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
+ if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
/*
- * If we have a saved candidate, and we're no
- * longer at the same _key_ (not pos), return
- * that candidate
+ * If we have a saved candidate, and we're past
+ * the last possible snapshot overwrite, return
+ * it:
*/
- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
- bch2_path_put_nokeep(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- iter->path = saved_path;
+ bch2_path_put_nokeep(trans, iter->path,
+ iter->flags & BTREE_ITER_intent);
+ iter->path = saved_path;
+ saved_path = 0;
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
+ break;
+ }
+
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode that requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ */
+ if (unlikely(bkey_lt(k.k->p, end)))
+ goto end;
+
+ if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+
+ if (k.k->p.snapshot != iter->snapshot) {
+ /*
+ * Have a key visible in iter->snapshot, but
+ * might have overwrites: - save it and keep
+ * searching. Unless it's a whiteout - then drop
+ * our previous saved candidate:
+ */
+ if (saved_path) {
+ bch2_path_put_nokeep(trans, saved_path,
+ iter->flags & BTREE_ITER_intent);
saved_path = 0;
- iter->k = saved_k;
- k.v = saved_v;
- goto got_key;
}
- if (bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- if (saved_path)
- bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
+ if (!bkey_whiteout(k.k)) {
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
- path = btree_iter_path(trans, iter);
- trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
- saved_k = *k.k;
- saved_v = k.v;
+ trace_btree_path_save_pos(trans,
+ trans->paths + iter->path,
+ trans->paths + saved_path);
}
search_key = bpos_predecessor(k.k->p);
continue;
}
-got_key:
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_all_snapshots)) {
+
+ if (bkey_whiteout(k.k)) {
search_key = bkey_predecessor(iter, k.k->p);
- if (iter->flags & BTREE_ITER_filter_snapshots)
- search_key.snapshot = U32_MAX;
+ search_key.snapshot = U32_MAX;
continue;
}
-
- btree_path_set_should_be_locked(trans, path);
- break;
- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
- /* Advance to previous leaf node: */
- search_key = bpos_predecessor(path->l[0].b->data->min_key);
- } else {
- /* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
- k = bkey_s_c_null;
- goto out_no_locked;
}
- }
- EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+ EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
+ bkey_gt(k.k->p, iter->pos));
+
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
+ bkey_lt(k.k->p, end)))
+ goto end;
+
+ break;
+ }
/* Extents can straddle iter->pos: */
- if (bkey_lt(k.k->p, iter->pos))
- iter->pos = k.k->p;
+	iter->pos = bpos_min(iter->pos, k.k->p);
if (iter->flags & BTREE_ITER_filter_snapshots)
iter->pos.snapshot = iter->snapshot;
@@ -2581,8 +2716,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
-
return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
}
/**
@@ -2607,7 +2745,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c k;
int ret;
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
@@ -2632,6 +2770,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (unlikely(!btree_path_node(path, path->level)))
+ return bkey_s_c_null;
+
if ((iter->flags & BTREE_ITER_cached) ||
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
@@ -2658,6 +2800,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
+
+ if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+ (iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)))
+ iter->k.type = KEY_TYPE_deleted;
} else {
struct bpos next;
struct bpos end = iter->pos;
@@ -2671,7 +2818,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct btree_iter iter2;
bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_upto(&iter2, end);
+ k = bch2_btree_iter_peek_max(&iter2, end);
if (k.k && !bkey_err(k)) {
swap(iter->key_cache_path, iter2.key_cache_path);
@@ -2682,7 +2829,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_upto(iter, end);
+ k = bch2_btree_iter_peek_max(iter, end);
if (unlikely(bkey_err(k)))
bch2_btree_iter_set_pos(iter, pos);
else
@@ -2902,7 +3049,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans,
unsigned flags)
{
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_RET_IP_);
}
@@ -2918,8 +3065,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
flags |= BTREE_ITER_snapshot_field;
flags |= BTREE_ITER_all_snapshots;
+ if (!depth && btree_id_cached(trans->c, btree_id))
+ flags |= BTREE_ITER_with_key_cache;
+
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- __bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, depth, flags),
_RET_IP_);
iter->min_depth = depth;
@@ -3122,14 +3272,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->last_begin_ip = _RET_IP_;
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
trans->notrace_relock_fail = false;
}
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
return trans->restart_count;
}
@@ -3228,7 +3378,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
closure_init_stack_release(&trans->ref);
return trans;
@@ -3262,6 +3412,9 @@ void bch2_trans_put(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
bch2_trans_unlock(trans);
trans_for_each_update(trans, i)
@@ -3285,6 +3438,10 @@ void bch2_trans_put(struct btree_trans *trans)
closure_return_sync(&trans->ref);
trans->locking_wait.task = NULL;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+#endif
+
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
trans->paths = NULL;
@@ -3338,8 +3495,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
pid = owner ? owner->pid : 0;
rcu_read_unlock();
- prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
- b->level, bch2_btree_id_str(b->btree_id));
+ prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, b->btree_id);
+ prt_printf(out, " l=%u:", b->level);
bch2_bpos_to_text(out, btree_node_pos(b));
prt_printf(out, "\t locks %u:%u:%u held by pid %u",
@@ -3378,11 +3536,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
if (!path->nodes_locked)
continue;
- prt_printf(out, " path %u %c l=%u %s:",
- idx,
- path->cached ? 'c' : 'b',
- path->level,
- bch2_btree_id_str(path->btree_id));
+ prt_printf(out, " path %u %c ",
+ idx,
+ path->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, path->btree_id);
+ prt_printf(out, " l=%u:", path->level);
bch2_bpos_to_text(out, path->pos);
prt_newline(out);
@@ -3488,7 +3646,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
#ifdef CONFIG_LOCKDEP
fs_reclaim_acquire(GFP_KERNEL);
struct btree_trans *trans = bch2_trans_get(c);
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
bch2_trans_put(trans);
fs_reclaim_release(GFP_KERNEL);
#endif
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 0bda054f80d7..b9538e6e6d65 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -23,6 +23,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path
{
unsigned idx = path - trans->paths;
+ EBUG_ON(idx >= trans->nr_paths);
EBUG_ON(!test_bit(idx, trans->paths_allocated));
if (unlikely(path->ref == U8_MAX)) {
bch2_dump_trans_paths_updates(trans);
@@ -36,6 +37,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path
static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ EBUG_ON(path - trans->paths >= trans->nr_paths);
EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
EBUG_ON(!path->ref);
EBUG_ON(!path->intent_ref && intent);
@@ -234,12 +236,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
btree_path_idx_t,
unsigned, unsigned long);
-static inline void bch2_trans_verify_not_unlocked(struct btree_trans *);
+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
btree_path_idx_t path, unsigned flags)
{
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
@@ -324,38 +326,33 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
bch2_trans_restart_error(trans, restart_count);
}
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
{
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-}
-
-void __noreturn bch2_trans_unlocked_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans)
-{
- if (!trans->locked)
- bch2_trans_unlocked_error(trans);
+ if (trans->restarted || !trans->locked)
+ bch2_trans_unlocked_or_in_restart_error(trans);
}
__always_inline
-static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
{
BUG_ON(err <= 0);
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
trans->restarted = err;
- trans->last_restarted_ip = _THIS_IP_;
+ trans->last_restarted_ip = ip;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+ bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
+#endif
return -err;
}
__always_inline
static int btree_trans_restart(struct btree_trans *trans, int err)
{
- btree_trans_restart_nounlock(trans, err);
- return -err;
+ return btree_trans_restart_ip(trans, err, _THIS_IP_);
}
bool bch2_btree_node_upgrade(struct btree_trans *,
@@ -375,6 +372,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);
void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
+void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
@@ -384,15 +382,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
- return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+ return bch2_btree_iter_peek_max(iter, SPOS_MAX);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
}
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
@@ -443,10 +447,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags)
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned level,
+ unsigned flags)
{
+ if (level || !btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_cached;
+ flags &= ~BTREE_ITER_with_key_cache;
+ } else if (!(flags & BTREE_ITER_cached))
+ flags |= BTREE_ITER_with_key_cache;
+
if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
btree_id_is_extents(btree_id))
flags |= BTREE_ITER_is_extents;
@@ -465,19 +476,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
return flags;
}
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags)
-{
- if (!btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_cached;
- flags &= ~BTREE_ITER_with_key_cache;
- } else if (!(flags & BTREE_ITER_cached))
- flags |= BTREE_ITER_with_key_cache;
-
- return __bch2_btree_iter_flags(trans, btree_id, flags);
-}
-
static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
@@ -514,7 +512,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
if (__builtin_constant_p(btree_id) &&
__builtin_constant_p(flags))
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_THIS_IP_);
else
bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
@@ -593,13 +591,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
_btree_id, _pos, _flags, KEY_TYPE_##_type))
+static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
+{
+ unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
+ memcpy(dst_v, src_k.v, b);
+ if (unlikely(b < dst_size))
+ memset(dst_v + b, 0, dst_size - b);
+}
+
#define bkey_val_copy(_dst_v, _src_k) \
do { \
- unsigned b = min_t(unsigned, sizeof(*_dst_v), \
- bkey_val_bytes(_src_k.k)); \
- memcpy(_dst_v, _src_k.v, b); \
- if (b < sizeof(*_dst_v)) \
- memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \
+ BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \
+ __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \
} while (0)
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
@@ -608,17 +611,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
unsigned val_size, void *val)
{
struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
- ret = bkey_err(k);
+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
+ int ret = bkey_err(k);
if (!ret) {
- unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);
-
- memcpy(val, k.v, b);
- if (unlikely(b < sizeof(*val)))
- memset((void *) val + b, 0, sizeof(*val) - b);
+ __bkey_val_copy(val, val_size, k);
bch2_trans_iter_exit(trans, &iter);
}
@@ -677,12 +673,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
bch2_btree_iter_peek(iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
struct bpos end,
unsigned flags)
{
if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_upto(iter, end);
+ return bch2_btree_iter_peek_max(iter, end);
if (bkey_gt(iter->pos, end))
return bkey_s_c_null;
@@ -746,7 +742,7 @@ transaction_restart: \
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
-#define for_each_btree_key_upto_continue(_trans, _iter, \
+#define for_each_btree_key_max_continue(_trans, _iter, \
_end, _flags, _k, _do) \
({ \
struct bkey_s_c _k; \
@@ -754,7 +750,7 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
_end, (_flags)); \
if (!(_k).k) \
break; \
@@ -768,9 +764,9 @@ transaction_restart: \
})
#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
- for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
+ for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+#define for_each_btree_key_max(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _do) \
({ \
bch2_trans_begin(trans); \
@@ -779,12 +775,12 @@ transaction_restart: \
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\
+ for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
})
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
SPOS_MAX, _flags, _k, _do)
#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
@@ -828,33 +824,33 @@ transaction_restart: \
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
+#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
_start, _end, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
-#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
+#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
for (; \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
- for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
+ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
@@ -866,7 +862,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
bch2_btree_iter_rewind(&(_iter)))
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+ for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
/*
* This should not be used in a fastpath, without first trying _do in
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index c1657182c275..6d25e3f85ce8 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -16,6 +16,17 @@
* operations for the regular btree iter code to use:
*/
+static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
+{
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
+
+ if (pos >= keys->gap)
+ pos -= gap_size;
+ return pos;
+}
+
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
size_t gap_size = keys->size - keys->nr;
@@ -61,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
}
/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
{
@@ -84,27 +95,92 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
}
}
+ struct bkey_i *ret = NULL;
+ rcu_read_lock(); /* for overwritten_ranges */
+
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
- return NULL;
+ break;
if (k->overwritten) {
- (*idx)++;
+ if (k->overwritten_range)
+ *idx = rcu_dereference(k->overwritten_range)->end;
+ else
+ *idx += 1;
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
- return k->k;
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
+ ret = k->k;
+ break;
+ }
(*idx)++;
iters++;
if (iters == 10) {
*idx = 0;
+ rcu_read_unlock();
goto search;
}
}
- return NULL;
+ rcu_read_unlock();
+ return ret;
+}
+
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ struct bkey_i *ret = NULL;
+ rcu_read_lock(); /* for overwritten_ranges */
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+ break;
+
+ if (k->overwritten) {
+ if (k->overwritten_range)
+ *idx = rcu_dereference(k->overwritten_range)->start - 1;
+ else
+ *idx -= 1;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
+ ret = k->k;
+ break;
+ }
+
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
}
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
@@ -112,11 +188,12 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
{
size_t idx = 0;
- return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+ return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
}
static void journal_iter_verify(struct journal_iter *iter)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
struct journal_keys *keys = iter->keys;
size_t gap_size = keys->size - keys->nr;
@@ -126,10 +203,10 @@ static void journal_iter_verify(struct journal_iter *iter)
if (iter->idx < keys->size) {
struct journal_key *k = keys->data + iter->idx;
- int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
- cmp_int(k->level, iter->level);
- BUG_ON(cmp < 0);
+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+ BUG_ON(cmp > 0);
}
+#endif
}
static void journal_iters_fix(struct bch_fs *c)
@@ -182,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
* Ensure these keys are done last by journal replay, to unblock
* journal reclaim:
*/
- .journal_seq = U32_MAX,
+ .journal_seq = U64_MAX,
};
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -290,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
bkey_deleted(&keys->data[idx].k->k));
}
+static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
+{
+ struct journal_key *k = keys->data + pos;
+ size_t idx = pos_to_idx(keys, pos);
+
+ k->overwritten = true;
+
+ struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
+ struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
+
+ bool prev_overwritten = prev && prev->overwritten;
+ bool next_overwritten = next && next->overwritten;
+
+ struct journal_key_range_overwritten *prev_range =
+ prev_overwritten ? prev->overwritten_range : NULL;
+ struct journal_key_range_overwritten *next_range =
+ next_overwritten ? next->overwritten_range : NULL;
+
+ BUG_ON(prev_range && prev_range->end != idx);
+ BUG_ON(next_range && next_range->start != idx + 1);
+
+ if (prev_range && next_range) {
+ prev_range->end = next_range->end;
+
+ keys->data[pos].overwritten_range = prev_range;
+ for (size_t i = next_range->start; i < next_range->end; i++) {
+ struct journal_key *ip = keys->data + idx_to_pos(keys, i);
+ BUG_ON(ip->overwritten_range != next_range);
+ ip->overwritten_range = prev_range;
+ }
+
+ kfree_rcu_mightsleep(next_range);
+ } else if (prev_range) {
+ prev_range->end++;
+ k->overwritten_range = prev_range;
+ if (next_overwritten) {
+ prev_range->end++;
+ next->overwritten_range = prev_range;
+ }
+ } else if (next_range) {
+ next_range->start--;
+ k->overwritten_range = next_range;
+ if (prev_overwritten) {
+ next_range->start--;
+ prev->overwritten_range = next_range;
+ }
+ } else if (prev_overwritten || next_overwritten) {
+ struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return;
+
+ r->start = idx - (size_t) prev_overwritten;
+ r->end = idx + 1 + (size_t) next_overwritten;
+
+ rcu_assign_pointer(k->overwritten_range, r);
+ if (prev_overwritten)
+ prev->overwritten_range = r;
+ if (next_overwritten)
+ next->overwritten_range = r;
+ }
+}
+
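The coalescing above lets journal-key iteration skip a whole run of overwritten keys in one step instead of advancing one key at a time. What follows is an illustrative sketch only, not part of the patch: a simplified, single-threaded model of the same idea, without the gap buffer, RCU protection or the overwrite_lock, and with invented names (struct range, mark_overwritten, next_live); unlike the kernel code it also gives a lone overwritten entry its own range, purely to keep the example short.

#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>

struct range { size_t start, end; };	/* half-open run [start, end) of overwritten entries */

struct entry {
	bool		overwritten;
	struct range	*r;		/* shared by every entry in the run */
};

static void mark_overwritten(struct entry *e, size_t nr, size_t i)
{
	struct range *prev = i > 0      && e[i - 1].overwritten ? e[i - 1].r : NULL;
	struct range *next = i + 1 < nr && e[i + 1].overwritten ? e[i + 1].r : NULL;

	e[i].overwritten = true;

	if (prev && next) {
		/* Joining two runs: extend prev over next, retag next's entries, drop next */
		prev->end = next->end;
		for (size_t j = next->start; j < next->end; j++)
			e[j].r = prev;
		e[i].r = prev;
		free(next);
	} else if (prev) {
		prev->end = i + 1;		/* grow the run ending just before i */
		e[i].r = prev;
	} else if (next) {
		next->start = i;		/* grow the run starting just after i */
		e[i].r = next;
	} else {
		struct range *r = malloc(sizeof(*r));
		if (!r)
			return;
		r->start = i;
		r->end = i + 1;
		e[i].r = r;
	}
}

/* Iteration then steps over an entire overwritten run in O(1): */
static size_t next_live(struct entry *e, size_t i)
{
	return e[i].overwritten ? e[i].r->end : i + 1;
}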
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@@ -299,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
if (idx < keys->size &&
keys->data[idx].btree_id == btree &&
keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos))
- keys->data[idx].overwritten = true;
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ !keys->data[idx].overwritten) {
+ mutex_lock(&keys->overwrite_lock);
+ __bch2_journal_key_overwritten(keys, idx);
+ mutex_unlock(&keys->overwrite_lock);
+ }
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -314,24 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
+ struct bkey_s_c ret = bkey_s_c_null;
+
journal_iter_verify(iter);
+ rcu_read_lock();
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
- int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
- cmp_int(k->level, iter->level);
- if (cmp > 0)
+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+ if (cmp < 0)
break;
BUG_ON(cmp);
- if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
+ if (!k->overwritten) {
+ ret = bkey_i_to_s_c(k->k);
+ break;
+ }
- bch2_journal_iter_advance(iter);
+ if (k->overwritten_range)
+ iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
+ else
+ bch2_journal_iter_advance(iter);
}
+ rcu_read_unlock();
- return bkey_s_c_null;
+ return ret;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -382,6 +533,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
: (level > 1 ? 1 : 16);
iter.prefetch = false;
+ iter.fail_if_too_many_whiteouts = true;
bch2_bkey_buf_init(&tmp);
while (nr--) {
@@ -400,6 +552,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+ size_t iters = 0;
if (iter->prefetch && iter->journal.level)
btree_and_journal_iter_prefetch(iter);
@@ -407,6 +560,11 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
if (iter->at_end)
return bkey_s_c_null;
+ iters++;
+
+ if (iters > 20 && iter->fail_if_too_many_whiteouts)
+ return bkey_s_c_null;
+
while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
@@ -481,16 +639,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
/* sort and dedup all keys in the journal: */
-void bch2_journal_entries_free(struct bch_fs *c)
-{
- struct journal_replay **i;
- struct genradix_iter iter;
-
- genradix_for_each(&c->journal_entries, iter, i)
- kvfree(*i);
- genradix_free(&c->journal_entries);
-}
-
/*
* When keys compare equal, oldest compares first:
*/
@@ -515,15 +663,26 @@ void bch2_journal_keys_put(struct bch_fs *c)
move_gap(keys, keys->nr);
- darray_for_each(*keys, i)
+ darray_for_each(*keys, i) {
+ if (i->overwritten_range &&
+ (i == &darray_last(*keys) ||
+ i->overwritten_range != i[1].overwritten_range))
+ kfree(i->overwritten_range);
+
if (i->allocated)
kfree(i->k);
+ }
kvfree(keys->data);
keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
- bch2_journal_entries_free(c);
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ kvfree(*i);
+ genradix_free(&c->journal_entries);
}
static void __journal_keys_sort(struct journal_keys *keys)
@@ -628,8 +787,20 @@ void bch2_journal_keys_dump(struct bch_fs *c)
darray_for_each(*keys, i) {
printbuf_reset(&buf);
+ prt_printf(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, i->btree_id);
+ prt_printf(&buf, " l=%u ", i->level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
- pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
+ pr_err("%s", buf.buf);
}
printbuf_exit(&buf);
}
+
+void bch2_fs_journal_keys_init(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ atomic_set(&keys->ref, 1);
+ keys->initial_ref_held = true;
+ mutex_init(&keys->overwrite_lock);
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 1653de9d609b..2a3082919b8d 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -26,16 +26,24 @@ struct btree_and_journal_iter {
struct bpos pos;
bool at_end;
bool prefetch;
+ bool fail_if_too_many_whiteouts;
};
+static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ const struct journal_key *r)
+{
+ return -cmp_int(l_level, r->level) ?:
+ cmp_int(l_btree_id, r->btree_id);
+}
+
static inline int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
const struct journal_key *r)
{
- return (cmp_int(l_btree_id, r->btree_id) ?:
- cmp_int(l_level, r->level) ?:
- bpos_cmp(l_pos, r->k->k.p));
+ return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
+ bpos_cmp(l_pos, r->k->k.p);
}
static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
@@ -43,7 +51,9 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
@@ -79,8 +89,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
c->journal_keys.initial_ref_held = false;
}
-void bch2_journal_entries_free(struct bch_fs *);
-
int bch2_journal_keys_sort(struct bch_fs *);
void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
@@ -89,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
void bch2_journal_keys_dump(struct bch_fs *);
+void bch2_fs_journal_keys_init(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
new file mode 100644
index 000000000000..8b773823704f
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+
+struct journal_key_range_overwritten {
+ size_t start, end;
+};
+
+struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ bool allocated;
+ bool overwritten;
+ struct journal_key_range_overwritten __rcu *
+ overwritten_range;
+ struct bkey_i *k;
+};
+
+struct journal_keys {
+ /* must match layout in darray_types.h */
+ size_t nr, size;
+ struct journal_key *data;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
+ atomic_t ref;
+ bool initial_ref_held;
+ struct mutex overwrite_lock;
+};
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
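As a worked illustration of the gap-buffer comment above (sketch only, not part of the patch; the concrete numbers and the helper name are invented): with nr = 5, size = 8 and gap = 2, array positions 2..4 form the empty gap, logical indices 0..1 sit before it and indices 2..4 after it. Translating an index therefore only has to add the gap size once the index passes the gap; inserting at the gap itself is O(1), and moving the gap to a new insertion point only shifts the keys between the old and new gap position rather than the whole tail, which is what makes sequential insertions O(n) overall.

#include <stddef.h>

/*
 * Layout for nr = 5, size = 8, gap = 2 (gap_size = 3):
 *
 *   pos:  0  1  [2  3  4]  5  6  7
 *   idx:  0  1   -empty-   2  3  4
 */
static inline size_t example_idx_to_pos(size_t idx, size_t gap, size_t gap_size)
{
	/* indices before the gap map directly; later ones skip over it */
	return idx < gap ? idx : idx + gap_size;
}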
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 244610b1d0b5..3b62296c3100 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -197,7 +197,9 @@ bkey_cached_reuse(struct btree_key_cache *c)
return ck;
}
-static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path,
+static int btree_key_cache_create(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path *ck_path,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
key_u64s = min(256U, (key_u64s * 3) / 2);
key_u64s = roundup_pow_of_two(key_u64s);
- struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s);
+ struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
int ret = PTR_ERR_OR_ZERO(ck);
if (ret)
return ret;
@@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_id_str(path->btree_id));
+ bch2_btree_id_str(ck_path->btree_id));
return -BCH_ERR_ENOMEM_btree_key_cache_create;
}
}
ck->c.level = 0;
- ck->c.btree_id = path->btree_id;
- ck->key.btree_id = path->btree_id;
- ck->key.pos = path->pos;
+ ck->c.btree_id = ck_path->btree_id;
+ ck->key.btree_id = ck_path->btree_id;
+ ck->key.pos = ck_path->pos;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
if (unlikely(key_u64s > ck->u64s)) {
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
kmalloc(key_u64s * sizeof(u64), _gfp));
@@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
bkey_reassemble(ck->k, k);
+ ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
+ if (unlikely(ret))
+ goto err;
+
ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
+
+ bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
+
if (unlikely(ret)) /* raced with another fill? */
goto err;
atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
+ enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
if (lock_want == SIX_LOCK_read)
six_lock_downgrade(&ck->c.lock);
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
- path->uptodate = BTREE_ITER_UPTODATE;
+ btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
+ ck_path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
bkey_cached_free(bc, ck);
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
return ret;
}
@@ -282,10 +291,8 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
struct btree_path *ck_path,
unsigned flags)
{
- if (flags & BTREE_ITER_cached_nofill) {
- ck_path->uptodate = BTREE_ITER_UPTODATE;
+ if (flags & BTREE_ITER_cached_nofill)
return 0;
- }
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -293,6 +300,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
+ BTREE_ITER_intent|
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
@@ -306,9 +314,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
if (unlikely(ret))
goto out;
- ret = btree_key_cache_create(trans, ck_path, k);
+ ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
if (ret)
goto err;
+
+ if (trace_key_cache_fill_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, ck_path->pos);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ trace_key_cache_fill(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
out:
/* We're not likely to need this iterator again: */
bch2_set_btree_iter_dontneed(&iter);
@@ -424,8 +442,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
- ret = bch2_btree_iter_traverse(&b_iter) ?:
- bch2_trans_update(trans, &b_iter, ck->k,
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
+ ret = bkey_err(btree_k);
+ if (ret)
+ goto err;
+
+	/* Check that we're not violating cache coherency rules: */
+ BUG_ON(bkey_deleted(btree_k.k));
+
+ ret = bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_key_cache_reclaim|
BTREE_UPDATE_internal_snapshot_node|
BTREE_TRIGGER_norun) ?:
@@ -433,7 +458,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
commit_flags);
-
+err:
bch2_fs_fatal_err_on(ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
@@ -586,8 +611,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- path->should_be_locked = false;
+
+ struct btree_path *path2;
+ unsigned i;
+ trans_for_each_path(trans, path2, i)
+ if (path2->l[0].b == (void *) ck) {
+ __bch2_btree_path_unlock(trans, path2);
+ path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
+ path2->should_be_locked = false;
+ btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
+ }
+
+ bch2_trans_verify_locks(trans);
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index efe2a007b482..8503931463d1 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -782,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
return bch2_trans_relock_fail(trans, path, &f, trace);
}
- trans_set_locked(trans);
+ trans_set_locked(trans, true);
out:
bch2_trans_verify_locks(trans);
return 0;
@@ -818,6 +818,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
}
+void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_write_locked(path, l))
+ bch2_btree_node_unlock_write(trans, path, path->l[l].b);
+}
+
int __bch2_trans_mutex_lock(struct btree_trans *trans,
struct mutex *lock)
{
@@ -856,6 +867,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
(want == BTREE_NODE_UNLOCKED ||
have != BTREE_NODE_WRITE_LOCKED) &&
want != have);
+
+ BUG_ON(btree_node_locked(path, l) &&
+ path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
}
}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 7c07f9fa9add..b54ef48eb8cc 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -16,6 +16,7 @@
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
void bch2_trans_unlock_noassert(struct btree_trans *);
+void bch2_trans_unlock_write(struct btree_trans *);
static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
@@ -75,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path,
path->nodes_locked |= (type + 1) << (level << 1);
}
-static inline void mark_btree_node_unlocked(struct btree_path *path,
- unsigned level)
-{
- EBUG_ON(btree_node_write_locked(path, level));
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
-}
-
static inline void mark_btree_node_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level,
@@ -124,19 +118,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
/* unlock: */
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
static inline void btree_node_unlock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED);
if (lock_type != BTREE_NODE_UNLOCKED) {
+ if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[level].b);
+ lock_type = BTREE_NODE_INTENT_LOCKED;
+ }
six_unlock_type(&path->l[level].b->c.lock, lock_type);
btree_trans_lock_hold_time_update(trans, path, level);
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}
- mark_btree_node_unlocked(path, level);
}
static inline int btree_path_lowest_level_locked(struct btree_path *path)
@@ -162,36 +162,40 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
* Updates the saved lock sequence number, so that bch2_btree_node_relock() will
* succeed:
*/
+static inline void
+__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
+{
+ if (!b->c.lock.write_lock_recurse) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ linked->l[b->c.level].lock_seq++;
+ }
+
+ six_unlock_write(&b->c.lock);
+}
+
static inline void
bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
struct btree *b)
{
- struct btree_path *linked;
- unsigned i;
-
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- linked->l[b->c.level].lock_seq++;
-
- six_unlock_write(&b->c.lock);
+ __bch2_btree_node_unlock_write(trans, b);
}
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
/* lock: */
-static inline void trans_set_locked(struct btree_trans *trans)
+static inline void trans_set_locked(struct btree_trans *trans, bool try)
{
if (!trans->locked) {
- lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_);
+ lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
trans->locked = true;
trans->last_unlock_ip = 0;
@@ -282,7 +286,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 30131c3bdd97..a7f06deee13c 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -12,6 +12,7 @@
#include "recovery_passes.h"
#include <linux/kthread.h>
+#include <linux/min_heap.h>
#include <linux/sort.h>
struct find_btree_nodes_worker {
@@ -22,17 +23,15 @@ struct find_btree_nodes_worker {
static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
- prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ",
- bch2_btree_id_str(n->btree_id), n->level, n->seq,
- n->journal_seq, n->cookie);
+ bch2_btree_id_level_to_text(out, n->btree_id, n->level);
+ prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
+ n->seq, n->journal_seq, n->cookie);
bch2_bpos_to_text(out, n->min_key);
prt_str(out, "-");
bch2_bpos_to_text(out, n->max_key);
if (n->range_updated)
prt_str(out, " range updated");
- if (n->overwritten)
- prt_str(out, " overwritten");
for (unsigned i = 0; i < n->nr_ptrs; i++) {
prt_char(out, ' ');
@@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-found_btree_node_cmp_time(l, r);
}
+static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
+{
+ return found_btree_node_cmp_pos(l, r) < 0;
+}
+
+static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
+{
+ struct found_btree_node *l = _l;
+ struct found_btree_node *r = _r;
+
+ swap(*l, *r);
+}
+
+static const struct min_heap_callbacks found_btree_node_heap_cbs = {
+ .less = found_btree_node_cmp_pos_less,
+ .swp = found_btree_node_swap,
+};
+
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
struct bio *bio, struct btree_node *bn, u64 offset)
{
@@ -159,6 +176,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
return;
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ if (!c->chacha20)
+ return;
+
struct nonce nonce = btree_nonce(&bn->keys, 0);
unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
@@ -292,55 +312,48 @@ static int read_btree_nodes(struct find_btree_nodes *f)
return f->ret ?: ret;
}
-static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+static bool nodes_overlap(const struct found_btree_node *l,
+ const struct found_btree_node *r)
{
- while (n + 1 < end &&
- found_btree_node_cmp_pos(n, n + 1) > 0) {
- swap(n[0], n[1]);
- n++;
- }
+ return (l->btree_id == r->btree_id &&
+ l->level == r->level &&
+ bpos_gt(l->max_key, r->min_key));
}
static int handle_overwrites(struct bch_fs *c,
- struct found_btree_node *start,
- struct found_btree_node *end)
+ struct found_btree_node *l,
+ found_btree_nodes *nodes_heap)
{
- struct found_btree_node *n;
-again:
- for (n = start + 1;
- n < end &&
- n->btree_id == start->btree_id &&
- n->level == start->level &&
- bpos_lt(n->min_key, start->max_key);
- n++) {
- int cmp = found_btree_node_cmp_time(start, n);
+ struct found_btree_node *r;
+
+ while ((r = min_heap_peek(nodes_heap)) &&
+ nodes_overlap(l, r)) {
+ int cmp = found_btree_node_cmp_time(l, r);
if (cmp > 0) {
- if (bpos_cmp(start->max_key, n->max_key) >= 0)
- n->overwritten = true;
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
else {
- n->range_updated = true;
- n->min_key = bpos_successor(start->max_key);
- n->range_updated = true;
- bubble_up(n, end);
- goto again;
+ r->range_updated = true;
+ r->min_key = bpos_successor(l->max_key);
+ r->range_updated = true;
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
}
} else if (cmp < 0) {
- BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+ BUG_ON(bpos_eq(l->min_key, r->min_key));
- start->max_key = bpos_predecessor(n->min_key);
- start->range_updated = true;
- } else if (n->level) {
- n->overwritten = true;
+ l->max_key = bpos_predecessor(r->min_key);
+ l->range_updated = true;
+ } else if (r->level) {
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
} else {
- if (bpos_cmp(start->max_key, n->max_key) >= 0)
- n->overwritten = true;
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
else {
- n->range_updated = true;
- n->min_key = bpos_successor(start->max_key);
- n->range_updated = true;
- bubble_up(n, end);
- goto again;
+ r->range_updated = true;
+ r->min_key = bpos_successor(l->max_key);
+ r->range_updated = true;
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
}
}
}
@@ -352,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
struct find_btree_nodes *f = &c->found_btree_nodes;
struct printbuf buf = PRINTBUF;
+ found_btree_nodes nodes_heap = {};
size_t dst;
int ret = 0;
@@ -406,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
bch2_print_string_as_lines(KERN_INFO, buf.buf);
}
- dst = 0;
- darray_for_each(f->nodes, i) {
- if (i->overwritten)
- continue;
+ swap(nodes_heap, f->nodes);
+
+ {
+ /* darray must have same layout as a heap */
+ min_heap_char real_heap;
+ BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
+ BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
+ BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
+ BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
+ }
+
+ min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ if (nodes_heap.nr) {
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
if (ret)
goto err;
- BUG_ON(i->overwritten);
- f->nodes.data[dst++] = *i;
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
}
- f->nodes.nr = dst;
- if (c->opts.verbose) {
+ while (true) {
+ ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
+ if (ret)
+ goto err;
+
+ if (!nodes_heap.nr)
+ break;
+
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
+ if (ret)
+ goto err;
+
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
+ }
+
+ for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
+ BUG_ON(nodes_overlap(n, n + 1));
+
+ if (0 && c->opts.verbose) {
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ } else {
+ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
}
eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
+ darray_exit(&nodes_heap);
printbuf_exit(&buf);
return ret;
}
@@ -499,7 +541,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
if (c->opts.verbose) {
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+ prt_str(&buf, "recovery ");
+ bch2_btree_id_level_to_text(&buf, btree, level);
+ prt_str(&buf, " ");
bch2_bpos_to_text(&buf, node_min);
prt_str(&buf, " - ");
bch2_bpos_to_text(&buf, node_max);
@@ -533,7 +577,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
printbuf_exit(&buf);
- BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0));
+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = level + 1,
+ .btree = btree,
+ }));
ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
if (ret)
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
index b6c36c45d0be..2811b6857c97 100644
--- a/fs/bcachefs/btree_node_scan_types.h
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -6,7 +6,6 @@
struct found_btree_node {
bool range_updated:1;
- bool overwritten:1;
u8 btree_id;
u8 level;
unsigned sectors_written;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 9bf471fa4361..6b79b672e0b1 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
return 0;
}
-static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
trans_for_each_update(trans, i)
@@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new |= 1 << BTREE_NODE_need_write;
} while (!try_cmpxchg(&b->flags, &old, new));
- btree_node_write_if_need(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
bch2_trans_put(trans);
@@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct bkey_i *new_k;
int ret;
- bch2_trans_unlock_write(trans);
+ bch2_trans_unlock_updates_write(trans);
bch2_trans_unlock(trans);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
@@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
old, flags);
}
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
- bool overwrite)
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
{
verify_update_old_key(trans, i);
@@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
BTREE_TRIGGER_insert|
BTREE_TRIGGER_overwrite|flags) ?: 1;
- } else if (overwrite && !i->overwrite_trigger_run) {
+ } else if (!i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
- } else if (!overwrite && !i->insert_trigger_run) {
+ } else if (!i->insert_trigger_run) {
i->insert_trigger_run = true;
return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
} else {
@@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- unsigned btree_id_start)
+ unsigned *btree_id_updates_start)
{
- for (int overwrite = 1; overwrite >= 0; --overwrite) {
- bool trans_trigger_run;
+ bool trans_trigger_run;
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (unsigned i = btree_id_start;
- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
- i++) {
- if (trans->updates[i].btree_id != btree_id)
- continue;
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
- int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
+ for (unsigned i = *btree_id_updates_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
+ i++) {
+ if (trans->updates[i].btree_id < btree_id) {
+ *btree_id_updates_start = i;
+ continue;
}
- } while (trans_trigger_run);
- }
+
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
+ i->btree_id == btree_id &&
+ btree_node_type_has_trans_triggers(i->bkey_type) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
return 0;
}
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- unsigned btree_id = 0, btree_id_start = 0;
+ unsigned btree_id = 0, btree_id_updates_start = 0;
int ret = 0;
/*
@@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->nr_updates &&
- trans->updates[btree_id_start].btree_id < btree_id)
- btree_id_start++;
-
- ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
if (ret)
return ret;
}
- for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
- struct btree_insert_entry *i = trans->updates + idx;
-
- if (i->btree_id > BTREE_ID_alloc)
- break;
- if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
- if (ret)
- return ret;
- break;
- }
- }
+ btree_id_updates_start = 0;
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
+ if (ret)
+ return ret;
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
@@ -609,14 +602,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
return 0;
}
-static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
-{
- return (struct bversion) {
- .hi = res->seq >> 32,
- .lo = (res->seq << 32) | (res->offset + offset),
- };
-}
-
static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct btree_insert_entry **stopped_at,
@@ -627,12 +612,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
unsigned u64s = 0;
int ret = 0;
- bch2_trans_verify_not_unlocked(trans);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (race_fault()) {
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
- return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
}
/*
@@ -701,25 +685,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct jset_entry *entry = trans->journal_entries;
percpu_down_read(&c->mark_lock);
-
for (entry = trans->journal_entries;
entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
entry = vstruct_next(entry))
if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
entry->start->k.type == KEY_TYPE_accounting) {
- BUG_ON(!trans->journal_res.ref);
-
- struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
-
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) entry - (u64 *) trans->journal_entries);
- BUG_ON(bversion_zero(a->k.bversion));
-
- if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
- if (ret)
- goto revert_fs_usage;
- }
+ ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
+ if (ret)
+ goto revert_fs_usage;
}
percpu_up_read(&c->mark_lock);
@@ -739,33 +712,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
- trans_for_each_update(trans, i) {
- enum bch_validate_flags invalid_flags = 0;
+ struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags);
- if (unlikely(ret)){
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- goto fatal_err;
- }
- btree_insert_entry_checks(trans, i);
- }
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
for (struct jset_entry *i = trans->journal_entries;
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
i = vstruct_next(i)) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
ret = bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, invalid_flags);
+ CPU_BIG_ENDIAN, validate_context);
if (unlikely(ret)) {
bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
trans->fn);
@@ -773,6 +730,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
}
+ trans_for_each_update(trans, i) {
+ validate_context.level = i->level;
+ validate_context.btree = i->btree_id;
+
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
+ goto fatal_err;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -833,13 +803,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
entry2 != entry;
entry2 = vstruct_next(entry2))
if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry2->start->k.type == KEY_TYPE_accounting) {
- struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
-
- bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
- bch2_accounting_neg(a);
- }
+ entry2->start->k.type == KEY_TYPE_accounting)
+ bch2_accounting_trans_commit_revert(trans,
+ bkey_i_to_accounting(entry2->start), flags);
percpu_up_read(&c->mark_lock);
return ret;
}
@@ -902,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
- bch2_trans_unlock_write(trans);
+ bch2_trans_unlock_updates_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -994,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
return ret;
}
-static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
- test_bit(BCH_FS_started, &c->flags))
- return -BCH_ERR_erofs_trans_commit;
-
- ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
- if (ret)
- return ret;
-
- bch2_write_ref_get(c, BCH_WRITE_REF_trans);
- return 0;
-}
-
/*
* This is for updates done in the early part of fsck - btree_gc - before we've
* gone RW. we only add the new key to the list of keys for journal replay to
@@ -1022,6 +970,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ BUG_ON(current != c->recovery_task);
+
trans_for_each_update(trans, i) {
int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
@@ -1047,8 +997,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret = 0;
- bch2_trans_verify_not_unlocked(trans);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
@@ -1058,16 +1007,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (ret)
goto out_reset;
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
- ret = do_bch2_trans_commit_to_journal_replay(trans);
- goto out_reset;
- }
-
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
- ret = bch2_trans_commit_get_rw_cold(trans, flags);
- if (ret)
- goto out_reset;
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ else
+ ret = -BCH_ERR_erofs_trans_commit;
+ goto out_reset;
}
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
@@ -1112,8 +1058,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
}
retry:
errored_at = NULL;
- bch2_trans_verify_not_unlocked(trans);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 4568a41fefaf..a6f251eb4164 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -513,6 +513,9 @@ struct btree_trans {
u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ bch_stacktrace last_restarted_trace;
+#endif
unsigned long last_unlock_ip;
unsigned long srcu_lock_time;
@@ -787,53 +790,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type)
return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
}
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
+static inline bool btree_id_is_extents(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(type) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_id_is_extents(enum btree_id btree)
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- return btree_node_type_is_extents(__btree_node_type(0, btree));
+ return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
+}
+
+static inline bool btree_type_has_snapshots(enum btree_id btree)
+{
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_snapshots(enum btree_id id)
+static inline bool btree_type_has_snapshot_field(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(id) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_snapshot_field(enum btree_id id)
+static inline bool btree_type_has_ptrs(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(id) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_ptrs(enum btree_id id)
+static inline bool btree_type_uses_write_buffer(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(id) & mask;
+ return BIT_ULL(btree) & mask;
}
struct btree_root {
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 5d809e8bd170..13d794f201a5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);;
+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
@@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_intent|
BTREE_ITER_with_updates|
BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto out;
next:
bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
- k = bch2_btree_iter_prev(iter);
- ret = bkey_err(k);
+ bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ int ret = bkey_err(k);
if (ret)
goto err;
@@ -672,27 +669,19 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
bch2_btree_insert_trans(trans, id, k, iter_flags));
}
-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
- unsigned len, unsigned update_flags)
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
bkey_init(&k->k);
k->k.p = iter->pos;
- bch2_key_resize(&k->k, len);
return bch2_trans_update(trans, iter, k, update_flags);
}
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
-{
- return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
-}
-
int bch2_btree_delete(struct btree_trans *trans,
enum btree_id btree, struct bpos pos,
unsigned update_flags)
@@ -721,7 +710,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+ while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
@@ -794,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
return ret;
}
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
+int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
{
struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
int ret = PTR_ERR_OR_ZERO(k);
@@ -804,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
bkey_init(&k->k);
k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = pos;
+ k->k.p = iter->pos;
+ if (iter->flags & BTREE_ITER_is_extents)
+ bch2_key_resize(&k->k, 1);
+ return bch2_trans_update(trans, iter, k, 0);
+}
+
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, 0);
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_bit_mod_iter(trans, &iter, set);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -827,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
return bch2_trans_update_buffered(trans, btree, &k);
}
-static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
+int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
{
+ unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
+ prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
+
+ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ return ret;
+
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- int ret = PTR_ERR_OR_ZERO(e);
+ ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
@@ -865,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
memcpy(l->d, buf.buf, buf.pos);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
- ret = bch2_trans_commit_do(c, NULL, NULL,
- BCH_TRANS_COMMIT_lazy_rw|commit_flags,
- __bch2_trans_log_msg(trans, &buf, u64s));
+ ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
+ bch2_trans_log_msg(trans, &buf));
}
err:
printbuf_exit(&buf);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 70b3c989fac2..8f22ef9a7651 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
#define BCH_TRANS_COMMIT_FLAGS() \
x(no_enospc, "don't check for enospc") \
x(no_check_rw, "don't attempt to take a ref on c->writes") \
- x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
x(no_journal_res, "don't take a journal reservation, instead " \
"pin journal entry referred to by trans->journal_res.seq") \
x(journal_reclaim, "operation required for journal reclaim; may return error" \
@@ -47,8 +46,6 @@ enum bch_trans_commit_flags {
void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
-int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
- unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
@@ -66,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
+int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
@@ -161,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);
+int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
@@ -244,7 +243,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra
KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k, unsigned flags,
+ struct bkey_s_c *k,
+ enum btree_iter_update_trigger_flags flags,
unsigned type, unsigned min_bytes)
{
struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
@@ -261,8 +261,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str
return mut;
}
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k, unsigned flags)
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c *k,
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
}
@@ -274,7 +275,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc
static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type, unsigned min_bytes)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
{
struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
btree_id, pos, flags|BTREE_ITER_intent, type);
@@ -289,7 +291,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr
static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
}
@@ -297,7 +299,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran
static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type, unsigned min_bytes)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
{
struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
@@ -318,7 +321,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned min_bytes)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned min_bytes)
{
return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
}
@@ -326,7 +330,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans
static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
}
@@ -337,7 +341,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
- unsigned flags, unsigned type, unsigned val_size)
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned val_size)
{
struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
int ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index d596ef93239f..f4aeadbe53c1 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -58,11 +58,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
b->data->min_key));
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+
if (b == btree_node_root(c, b)) {
if (!bpos_eq(b->data->min_key, POS_MIN)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
- need_fsck_err(trans, btree_root_bad_min_key,
+ log_fsck_err(trans, btree_root_bad_min_key,
"btree root with incorrect min_key: %s", buf.buf);
goto topology_repair;
}
@@ -70,18 +74,14 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->max_key);
- need_fsck_err(trans, btree_root_bad_max_key,
+ log_fsck_err(trans, btree_root_bad_max_key,
"btree root with incorrect max_key: %s", buf.buf);
goto topology_repair;
}
}
if (!b->c.level)
- return 0;
-
- bch2_bkey_buf_init(&prev);
- bkey_init(&prev.k->k);
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ goto out;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
@@ -97,16 +97,16 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
bch2_topology_error(c);
printbuf_reset(&buf);
- prt_str(&buf, "end of prev node doesn't match start of next node\n"),
- prt_printf(&buf, " in btree %s level %u node ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, "\n prev ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_str(&buf, "\n next ");
bch2_bkey_val_to_text(&buf, c, k);
- need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
+ log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
goto topology_repair;
}
@@ -118,25 +118,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
bch2_topology_error(c);
printbuf_reset(&buf);
- prt_str(&buf, "empty interior node\n");
- prt_printf(&buf, " in btree %s level %u node ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ prt_str(&buf, "empty interior node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
goto topology_repair;
} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
bch2_topology_error(c);
printbuf_reset(&buf);
- prt_str(&buf, "last child node doesn't end at end of parent node\n");
- prt_printf(&buf, " in btree %s level %u node ",
- bch2_btree_id_str(b->c.btree_id), b->c.level);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, "\n last key ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
+ log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
goto topology_repair;
}
out:
@@ -146,13 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
printbuf_exit(&buf);
return ret;
topology_repair:
- if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
- bch2_inconsistent_error(c);
- ret = -BCH_ERR_btree_need_topology_repair;
- } else {
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- }
+ ret = bch2_topology_error(c);
goto out;
}
@@ -244,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = trans->c;
- unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
@@ -255,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&b->c.lock);
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
+ bch2_trans_node_drop(trans, b);
}
static void bch2_btree_node_free_never_used(struct btree_update *as,
@@ -270,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
- struct btree_path *path;
- unsigned i, level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -293,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
six_unlock_intent(&b->c.lock);
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
+ bch2_trans_node_drop(trans, b);
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
@@ -809,7 +792,7 @@ static void btree_update_nodes_written(struct btree_update *as)
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
six_unlock_write(&b->c.lock);
- btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ btree_node_write_if_need(trans, b, SIX_LOCK_intent);
btree_node_unlock(trans, path, b->c.level);
bch2_path_put(trans, path_idx, true);
}
@@ -830,7 +813,7 @@ static void btree_update_nodes_written(struct btree_update *as)
b = as->new_nodes[i];
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- btree_node_write_if_need(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@@ -1366,9 +1349,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
- if (bch2_bkey_validate(c, bkey_i_to_s_c(insert),
- btree_node_type(b), BCH_VALIDATE_write) ?:
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) {
+ struct bkey_validate_context from = (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_commit,
+ };
+ if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
dump_stack();
}
@@ -1418,15 +1406,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
(bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
;
- while (!bch2_keylist_empty(keys)) {
- insert = bch2_keylist_front(keys);
+ for (;
+ insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
+ insert = bkey_next(insert))
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
- if (bpos_gt(insert->k.p, b->key.k.p))
- break;
+ if (bch2_btree_node_check_topology(trans, b)) {
+ struct printbuf buf = PRINTBUF;
- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
- bch2_keylist_pop_front(keys);
+ for (struct bkey_i *k = keys->keys;
+ k != insert;
+ k = bkey_next(k)) {
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ }
+
+ panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
}
+
+ memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
+ keys->top_p -= insert->_data - keys->keys_p;
}
static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
@@ -1575,8 +1574,6 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
-
- BUG_ON(bch2_btree_node_check_topology(trans, b));
}
}
@@ -1599,8 +1596,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (ret)
return ret;
- bch2_btree_interior_update_will_free_node(as, b);
-
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
@@ -1699,16 +1694,18 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (ret)
goto err;
+ bch2_btree_interior_update_will_free_node(as, b);
+
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
- bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
}
if (n2) {
bch2_btree_update_get_open_buckets(as, n2);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
}
bch2_btree_update_get_open_buckets(as, n1);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
/*
* The old node must be freed (in memory) _before_ unlocking the new
@@ -1827,8 +1824,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
-
- BUG_ON(bch2_btree_node_check_topology(trans, b));
return 0;
split:
/*
@@ -1905,7 +1900,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
BUG_ON(ret);
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_trans_node_add(trans, path, n);
six_unlock_intent(&n->c.lock);
@@ -1953,8 +1948,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
- bch2_trans_verify_not_in_restart(trans);
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
@@ -2058,9 +2052,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
trace_and_count(c, btree_node_merge, trans, b);
- bch2_btree_interior_update_will_free_node(as, b);
- bch2_btree_interior_update_will_free_node(as, m);
-
n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data,
@@ -2096,10 +2087,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err_free_update;
+ bch2_btree_interior_update_will_free_node(as, b);
+ bch2_btree_interior_update_will_free_node(as, m);
+
bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, trans->paths + path, b);
bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
@@ -2150,8 +2144,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (ret)
goto out;
- bch2_btree_interior_update_will_free_node(as, b);
-
n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_build_aux_trees(n);
@@ -2175,8 +2167,10 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (ret)
goto err;
+ bch2_btree_interior_update_will_free_node(as, b);
+
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
@@ -2201,42 +2195,50 @@ struct async_btree_rewrite {
struct list_head list;
enum btree_id btree_id;
unsigned level;
- struct bpos pos;
- __le64 seq;
+ struct bkey_buf key;
};
static int async_btree_node_rewrite_trans(struct btree_trans *trans,
struct async_btree_rewrite *a)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct btree *b;
- int ret;
-
- bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+ bch2_trans_node_iter_init(trans, &iter,
+ a->btree_id, a->key.k->k.p,
BTREE_MAX_DEPTH, a->level, 0);
- b = bch2_btree_iter_peek_node(&iter);
- ret = PTR_ERR_OR_ZERO(b);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
- if (!b || b->data->keys.seq != a->seq) {
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
+ ret = found
+ ? bch2_btree_node_rewrite(trans, &iter, b, 0)
+ : -ENOENT;
+
+#if 0
+ /* Tracepoint... */
+ if (!ret || ret == -ENOENT) {
+ struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- if (b)
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- else
- prt_str(&buf, "(null");
- bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
- __func__, a->seq, buf.buf);
+ if (!ret) {
+ prt_printf(&buf, "rewrite node:\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
+ } else {
+ prt_printf(&buf, "node to rewrite not found:\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
+ prt_printf(&buf, "\n got: ");
+ if (b)
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ else
+ prt_str(&buf, "(null)");
+ }
+ bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
- goto out;
}
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+#endif
out:
bch2_trans_iter_exit(trans, &iter);
-
return ret;
}
@@ -2247,81 +2249,97 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
struct bch_fs *c = a->c;
int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
- bch_err_fn_ratelimited(c, ret);
+ if (ret != -ENOENT)
+ bch_err_fn_ratelimited(c, ret);
+
+ spin_lock(&c->btree_node_rewrites_lock);
+ list_del(&a->list);
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ closure_wake_up(&c->btree_node_rewrites_wait);
+
+ bch2_bkey_buf_exit(&a->key, c);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
- struct async_btree_rewrite *a;
- int ret;
-
- a = kmalloc(sizeof(*a), GFP_NOFS);
- if (!a) {
- bch_err(c, "%s: error allocating memory", __func__);
+ struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
+ if (!a)
return;
- }
a->c = c;
a->btree_id = b->c.btree_id;
a->level = b->c.level;
- a->pos = b->key.k.p;
- a->seq = b->data->keys.seq;
INIT_WORK(&a->work, async_btree_node_rewrite_work);
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
- mutex_lock(&c->pending_node_rewrites_lock);
- list_add(&a->list, &c->pending_node_rewrites);
- mutex_unlock(&c->pending_node_rewrites_lock);
- return;
- }
+ bch2_bkey_buf_init(&a->key);
+ bch2_bkey_buf_copy(&a->key, c, &b->key);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
- if (test_bit(BCH_FS_started, &c->flags)) {
- bch_err(c, "%s: error getting c->writes ref", __func__);
- kfree(a);
- return;
- }
+ bool now = false, pending = false;
- ret = bch2_fs_read_write_early(c);
- bch_err_msg(c, ret, "going read-write");
- if (ret) {
- kfree(a);
- return;
- }
+ spin_lock(&c->btree_node_rewrites_lock);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ list_add(&a->list, &c->btree_node_rewrites);
+ now = true;
+ } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
+ list_add(&a->list, &c->btree_node_rewrites_pending);
+ pending = true;
+ }
+ spin_unlock(&c->btree_node_rewrites_lock);
- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ if (now) {
+ queue_work(c->btree_node_rewrite_worker, &a->work);
+ } else if (pending) {
+ /* bch2_do_pending_node_rewrites will execute */
+ } else {
+ bch2_bkey_buf_exit(&a->key, c);
+ kfree(a);
}
+}
- queue_work(c->btree_node_rewrite_worker, &a->work);
+void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
+{
+ closure_wait_event(&c->btree_node_rewrites_wait,
+ list_empty(&c->btree_node_rewrites));
}
void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
- struct async_btree_rewrite *a, *n;
-
- mutex_lock(&c->pending_node_rewrites_lock);
- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
- list_del(&a->list);
+ while (1) {
+ spin_lock(&c->btree_node_rewrites_lock);
+ struct async_btree_rewrite *a =
+ list_pop_entry(&c->btree_node_rewrites_pending,
+ struct async_btree_rewrite, list);
+ if (a)
+ list_add(&a->list, &c->btree_node_rewrites);
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ if (!a)
+ break;
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
queue_work(c->btree_node_rewrite_worker, &a->work);
}
- mutex_unlock(&c->pending_node_rewrites_lock);
}
void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
- struct async_btree_rewrite *a, *n;
+ while (1) {
+ spin_lock(&c->btree_node_rewrites_lock);
+ struct async_btree_rewrite *a =
+ list_pop_entry(&c->btree_node_rewrites_pending,
+ struct async_btree_rewrite, list);
+ spin_unlock(&c->btree_node_rewrites_lock);
- mutex_lock(&c->pending_node_rewrites_lock);
- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
- list_del(&a->list);
+ if (!a)
+ break;
+ bch2_bkey_buf_exit(&a->key, c);
kfree(a);
}
- mutex_unlock(&c->pending_node_rewrites_lock);
}
static int __bch2_btree_node_update_key(struct btree_trans *trans,
@@ -2575,8 +2593,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update
prt_printf(out, "%ps: ", (void *) as->ip_started);
bch2_trans_commit_flags_to_text(out, as->flags);
- prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- bch2_btree_id_str(as->btree_id),
+ prt_str(out, " ");
+ bch2_btree_id_to_text(out, as->btree_id);
+ prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
as->update_level_start,
as->update_level_end,
bch2_btree_update_modes[as->mode],
@@ -2677,6 +2696,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
+ WARN_ON(!list_empty(&c->btree_node_rewrites));
+ WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
+
if (c->btree_node_rewrite_worker)
destroy_workqueue(c->btree_node_rewrite_worker);
if (c->btree_interior_update_worker)
@@ -2692,8 +2714,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
mutex_init(&c->btree_interior_update_lock);
INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
- INIT_LIST_HEAD(&c->pending_node_rewrites);
- mutex_init(&c->pending_node_rewrites_lock);
+ INIT_LIST_HEAD(&c->btree_node_rewrites);
+ INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
+ spin_lock_init(&c->btree_node_rewrites_lock);
}
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 10f400957f21..7930ffea3075 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
@@ -334,6 +334,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, unsigned long);
+void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 1639c60dffa0..b56c4987b8c9 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -19,8 +19,6 @@
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
-
static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
return (cmp_int(l->hi, r->hi) ?:
@@ -314,6 +312,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+ BUG_ON(!btree_type_uses_write_buffer(k->btree));
+
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
@@ -481,21 +481,55 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
return ret;
}
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
+
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+out:
+ ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
+ return ret;
+}
+
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
{
struct journal *j = &c->journal;
struct journal_buf *buf;
+ bool blocked;
int ret = 0;
- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
ret = bch2_journal_keys_to_write_buffer(c, buf);
+
+ if (!blocked && !ret) {
+ spin_lock(&j->lock);
+ buf->need_flush_to_write_buffer = false;
+ spin_unlock(&j->lock);
+ }
+
mutex_unlock(&j->buf_lock);
+
+ if (blocked) {
+ bch2_journal_unblock(j);
+ break;
+ }
}
return ret;
}
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
bool *did_work)
{
struct bch_fs *c = trans->c;
@@ -505,7 +539,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
do {
bch2_trans_unlock(trans);
- fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
*did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
@@ -518,8 +552,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
mutex_unlock(&wb->flushing.lock);
} while (!ret &&
(fetch_from_journal_err ||
- (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
- (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
+ (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
return ret;
}
@@ -600,6 +634,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
+ if (trace_write_buffer_maybe_flush_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, referring_k);
+ trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
bch2_bkey_buf_reassemble(&tmp, c, referring_k);
if (bkey_is_btree_ptr(referring_k.k)) {
@@ -771,31 +813,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_
return ret;
}
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
- struct journal_keys_to_wb dst;
- int ret = 0;
-
- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(entry, k) {
- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
- if (ret)
- goto out;
- }
-
- entry->type = BCH_JSET_ENTRY_btree_keys;
- }
-
- spin_lock(&c->journal.lock);
- buf->need_flush_to_write_buffer = false;
- spin_unlock(&c->journal.lock);
-out:
- ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
- return ret;
-}
-
static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
{
if (wb->keys.size >= new_size)
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index ec7d9a59bea9..345b117a4a4a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -18,7 +18,9 @@
#include "error.h"
#include "inode.h"
#include "movinggc.h"
+#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
@@ -260,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
- percpu_down_read(&c->mark_lock);
-
bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
if (ret)
@@ -362,7 +362,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
bch_info(c, "new key %s", buf.buf);
}
- percpu_up_read(&c->mark_lock);
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
@@ -371,8 +370,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
BTREE_UPDATE_internal_snapshot_node|
BTREE_TRIGGER_norun);
bch2_trans_iter_exit(trans, &iter);
- percpu_down_read(&c->mark_lock);
-
if (ret)
goto err;
@@ -380,7 +377,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
}
err:
- percpu_up_read(&c->mark_lock);
printbuf_exit(&buf);
return ret;
}
@@ -401,8 +397,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
BUG_ON(!sectors);
if (gen_after(ptr->gen, b_gen)) {
- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- ptr_gen_newer_than_bucket_gen,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -415,8 +411,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- ptr_too_stale,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_too_stale,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -435,8 +431,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if (b_gen != ptr->gen) {
- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- stale_dirty_ptr,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -451,8 +447,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- ptr_bucket_data_type_mismatch,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -466,8 +462,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if ((u64) *bucket_sectors + sectors > U32_MAX) {
- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- bucket_sector_count_overflow,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -485,7 +481,9 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
printbuf_exit(&buf);
return ret;
err:
+fsck_err:
bch2_dump_trans_updates(trans);
+ bch2_inconsistent_error(c);
ret = -BCH_ERR_bucket_ref_update;
goto out;
}
@@ -543,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
const struct extent_ptr_decoded *p,
s64 sectors, enum bch_data_type ptr_data_type,
- struct bch_alloc_v4 *a)
+ struct bch_alloc_v4 *a,
+ bool insert)
{
u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
!p->ptr.cached ? &a->dirty_sectors :
@@ -553,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
if (ret)
return ret;
-
- alloc_data_type_set(a, ptr_data_type);
+ if (insert)
+ alloc_data_type_set(a, ptr_data_type);
return 0;
}
@@ -570,8 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
- u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
- *sectors = insert ? abs_sectors : -abs_sectors;
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);
+
+ *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
@@ -580,41 +581,36 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
goto err;
}
- struct bpos bucket;
- struct bch_backpointer bp;
- __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors);
+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v);
+ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
if (ret)
goto err;
if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
if (ret)
goto err;
}
}
if (flags & BTREE_TRIGGER_gc) {
- percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = -BCH_ERR_trigger_pointer;
- goto err_unlock;
+ goto err;
}
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new);
+ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
alloc_to_bucket(g, new);
bucket_unlock(g);
-err_unlock:
- percpu_up_read(&c->mark_lock);
if (!ret)
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
@@ -951,6 +947,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
enum bch_data_type type,
unsigned sectors)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
int ret = 0;
@@ -960,8 +957,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
return PTR_ERR(a);
if (a->v.data_type && type && a->v.data_type != type) {
- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- bucket_metadata_type_mismatch,
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
@@ -979,6 +976,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
}
err:
+fsck_err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -990,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
struct bch_fs *c = trans->c;
int ret = 0;
- percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, b);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
ca->dev_idx, bch2_data_type_str(data_type)))
- goto err_unlock;
+ goto err;
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
@@ -1004,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type)))
- goto err;
+ goto err_unlock;
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors))
- goto err;
+ goto err_unlock;
g->data_type = data_type;
g->dirty_sectors += sectors;
struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
return ret;
-err:
- bucket_unlock(g);
err_unlock:
- percpu_up_read(&c->mark_lock);
+ bucket_unlock(g);
+err:
return -BCH_ERR_metadata_bucket_inconsistency;
}
@@ -1155,6 +1150,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
}
+bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++)
+ if (b == ca->journal.buckets[i])
+ return true;
+
+ return false;
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
@@ -1238,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
for_each_member_device(c, ca) {
BUG_ON(ca->buckets_nouse);
- ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
+ ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets_nouse) {
@@ -1264,10 +1284,15 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
- BUG_ON(resize && ca->buckets_nouse);
+ if (resize)
+ lockdep_assert_held(&c->state_lock);
+
+ if (resize && ca->buckets_nouse)
+ return -BCH_ERR_no_resize_with_buckets_nouse;
- if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
- GFP_KERNEL|__GFP_ZERO))) {
+ bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!bucket_gens) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
@@ -1277,19 +1302,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->nbuckets_minus_first =
bucket_gens->nbuckets - bucket_gens->first_bucket;
- if (resize) {
- down_write(&ca->bucket_lock);
- percpu_down_write(&c->mark_lock);
- }
-
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
- size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
-
+ bucket_gens->nbuckets = min(bucket_gens->nbuckets,
+ old_bucket_gens->nbuckets);
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
memcpy(bucket_gens->b,
old_bucket_gens->b,
- n);
+ bucket_gens->nbuckets);
}
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
@@ -1297,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
nbuckets = ca->mi.nbuckets;
- if (resize) {
- percpu_up_write(&c->mark_lock);
- up_write(&ca->bucket_lock);
- }
-
ret = 0;
err:
if (bucket_gens)
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index ccc78bfe2fd4..a9acdd6c0c86 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b)
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- return genradix_ptr(&ca->buckets_gc, b);
+ return bucket_valid(ca, b)
+ ? genradix_ptr(&ca->buckets_gc, b)
+ : NULL;
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
return rcu_dereference_check(ca->bucket_gens,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->state_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ lockdep_is_held(&ca->fs->state_lock));
}
static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
@@ -308,26 +307,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
enum btree_iter_update_trigger_flags);
int bch2_trans_mark_dev_sbs(struct bch_fs *);
-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- u64 b_offset = bucket_to_sector(ca, b);
- u64 b_end = bucket_to_sector(ca, b + 1);
- unsigned i;
-
- if (!b)
- return true;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
- u64 end = offset + (1 << layout->sb_max_size_bits);
-
- if (!(offset >= b_end || end <= b_offset))
- return true;
- }
-
- return false;
-}
+bool bch2_is_superblock_bucket(struct bch_dev *, u64);
static inline const char *bch2_data_type_str(enum bch_data_type type)
{
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 28bd09a253c8..7174047b8e92 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -24,7 +24,7 @@ struct bucket_gens {
u16 first_bucket;
size_t nbuckets;
size_t nbuckets_minus_first;
- u8 b[];
+ u8 b[] __counted_by(nbuckets);
};
struct bch_dev_usage {
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 2182b555c112..46e9e32105a9 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -6,11 +6,11 @@
#include "buckets.h"
#include "chardev.h"
#include "disk_accounting.h"
+#include "fsck.h"
#include "journal.h"
#include "move.h"
#include "recovery_passes.h"
#include "replicas.h"
-#include "super.h"
#include "super-io.h"
#include "thread_with_file.h"
@@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
}
#endif
-struct fsck_thread {
- struct thread_with_stdio thr;
- struct bch_fs *c;
- struct bch_opts opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- int ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- return ret;
-
- ret = bch2_fs_start(thr->c);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- ret |= 1;
- }
- if (test_bit(BCH_FS_error, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
- ret |= 4;
- }
-err:
- bch2_fs_stop(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_offline_thread_fn,
-};
-
-static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
- struct bch_ioctl_fsck_offline arg;
- struct fsck_thread *thr = NULL;
- darray_str(devs) = {};
- long ret = 0;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- for (size_t i = 0; i < arg.nr_devs; i++) {
- u64 dev_u64;
- ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
- if (ret)
- goto err;
-
- char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(dev_str);
- if (ret)
- goto err;
-
- ret = darray_push(&devs, dev_str);
- if (ret) {
- kfree(dev_str);
- goto err;
- }
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- opt_set(thr->opts, read_only, 1);
- opt_set(thr->opts, ratelimit_errors, 0);
-
- /* We need request_key() to be called before we punt to kthread: */
- opt_set(thr->opts, nostart, true);
-
- bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
-
- if (!IS_ERR(thr->c) &&
- thr->c->opts.errors == BCH_ON_ERROR_panic)
- thr->c->opts.errors = BCH_ON_ERROR_ro;
-
- ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
- darray_for_each(devs, i)
- kfree(*i);
- darray_exit(&devs);
- return ret;
-err:
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- goto out;
-}
-
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
long ret;
@@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- c->stdio_filter = current;
- c->stdio = &thr->thr.stdio;
-
- /*
- * XXX: can we figure out a way to do this without mucking with c->opts?
- */
- unsigned old_fix_errors = c->opts.fix_errors;
- if (opt_defined(thr->opts, fix_errors))
- c->opts.fix_errors = thr->opts.fix_errors;
- else
- c->opts.fix_errors = FSCK_FIX_ask;
-
- c->opts.fsck = true;
- set_bit(BCH_FS_fsck_running, &c->flags);
-
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
- int ret = bch2_run_online_recovery_passes(c);
-
- clear_bit(BCH_FS_fsck_running, &c->flags);
- bch_err_fn(c, ret);
-
- c->stdio = NULL;
- c->stdio_filter = NULL;
- c->opts.fix_errors = old_fix_errors;
-
- up(&c->online_fsck_mutex);
- bch2_ro_ref_put(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_online_thread_fn,
-};
-
-static long bch2_ioctl_fsck_online(struct bch_fs *c,
- struct bch_ioctl_fsck_online arg)
-{
- struct fsck_thread *thr = NULL;
- long ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!bch2_ro_ref_tryget(c))
- return -EROFS;
-
- if (down_trylock(&c->online_fsck_mutex)) {
- bch2_ro_ref_put(c);
- return -EAGAIN;
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->c = c;
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
- if (ret < 0) {
- bch_err_fn(c, ret);
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- up(&c->online_fsck_mutex);
- bch2_ro_ref_put(c);
- }
- return ret;
-}
-
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index ce8fc677bef9..23a383577d4c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "checksum.h"
#include "errcode.h"
+#include "error.h"
#include "super.h"
#include "super-io.h"
@@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (!bch2_csum_type_is_encryption(type))
return 0;
+ if (bch2_fs_inconsistent_on(!c->chacha20,
+ c, "attempting to encrypt without encryption key"))
+ return -BCH_ERR_no_encryption_key;
+
return do_encrypt(c->chacha20, nonce, data, len);
}
@@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
size_t sgl_len = 0;
int ret = 0;
- if (!bch2_csum_type_is_encryption(type))
- return 0;
+ if (bch2_fs_inconsistent_on(!c->chacha20,
+ c, "attempting to encrypt without encryption key"))
+ return -BCH_ERR_no_encryption_key;
darray_init(&sgl);
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index e40499fde9a4..43b9d71f2f2b 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
bool data)
{
switch (type) {
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 1410365a8891..f99ff1819597 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -2,13 +2,33 @@
#include "bcachefs.h"
#include "checksum.h"
#include "compress.h"
+#include "error.h"
#include "extents.h"
+#include "opts.h"
#include "super-io.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
#include <linux/zstd.h>
+static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
+{
+ switch (type) {
+ case BCH_COMPRESSION_TYPE_none:
+ case BCH_COMPRESSION_TYPE_incompressible:
+ return BCH_COMPRESSION_OPT_none;
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
+ return BCH_COMPRESSION_OPT_lz4;
+ case BCH_COMPRESSION_TYPE_gzip:
+ return BCH_COMPRESSION_OPT_gzip;
+ case BCH_COMPRESSION_TYPE_zstd:
+ return BCH_COMPRESSION_OPT_zstd;
+ default:
+ BUG();
+ }
+}
+
/* Bounce buffer: */
struct bbuf {
void *b;
@@ -158,6 +178,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *workspace;
int ret;
+ enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
+ mempool_t *workspace_pool = &c->compress_workspace[opt];
+ if (unlikely(!mempool_initialized(workspace_pool))) {
+ if (fsck_err(c, compression_type_not_marked_in_sb,
+ "compression type %s set but not marked in superblock",
+ __bch2_compression_types[crc.compression_type]))
+ ret = bch2_check_set_has_compressed_data(c, opt);
+ else
+ ret = -BCH_ERR_compression_workspace_not_initialized;
+ if (ret)
+ goto out;
+ }
+
src_data = bio_map_or_bounce(c, src, READ);
switch (crc.compression_type) {
@@ -176,13 +209,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
.avail_out = dst_len,
};
- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
ret = zlib_inflate(&strm, Z_FINISH);
- mempool_free(workspace, &c->decompress_workspace);
+ mempool_free(workspace, workspace_pool);
if (ret != Z_STREAM_END)
goto err;
@@ -195,14 +228,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
if (real_src_len > src_len - 4)
goto err;
- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
- mempool_free(workspace, &c->decompress_workspace);
+ mempool_free(workspace, workspace_pool);
if (ret != dst_len)
goto err;
@@ -212,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
BUG();
}
ret = 0;
+fsck_err:
out:
bio_unmap_or_unbounce(c, src_data);
return ret;
@@ -394,8 +428,21 @@ static unsigned __bio_compress(struct bch_fs *c,
unsigned pad;
int ret = 0;
- BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
- BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+ /* bch2_compression_decode catches unknown compression types: */
+ BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
+
+ mempool_t *workspace_pool = &c->compress_workspace[compression.type];
+ if (unlikely(!mempool_initialized(workspace_pool))) {
+ if (fsck_err(c, compression_opt_not_marked_in_sb,
+ "compression opt %s set but not marked in superblock",
+ bch2_compression_opts[compression.type])) {
+ ret = bch2_check_set_has_compressed_data(c, compression.type);
+ if (ret) /* memory allocation failure, don't compress */
+ return 0;
+ } else {
+ return 0;
+ }
+ }
/* If it's only one block, don't bother trying to compress: */
if (src->bi_iter.bi_size <= c->opts.block_size)
@@ -404,7 +451,7 @@ static unsigned __bio_compress(struct bch_fs *c,
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
- workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
*src_len = src->bi_iter.bi_size;
*dst_len = dst->bi_iter.bi_size;
@@ -447,7 +494,7 @@ static unsigned __bio_compress(struct bch_fs *c,
*src_len = round_down(*src_len, block_bytes(c));
}
- mempool_free(workspace, &c->compress_workspace[compression_type]);
+ mempool_free(workspace, workspace_pool);
if (ret)
goto err;
@@ -477,6 +524,9 @@ static unsigned __bio_compress(struct bch_fs *c,
err:
ret = BCH_COMPRESSION_TYPE_incompressible;
goto out;
+fsck_err:
+ ret = 0;
+ goto out;
}
unsigned bch2_bio_compress(struct bch_fs *c,
@@ -559,7 +609,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
{
unsigned i;
- mempool_exit(&c->decompress_workspace);
for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
mempool_exit(&c->compress_workspace[i]);
mempool_exit(&c->compression_bounce[WRITE]);
@@ -568,7 +617,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
- size_t decompress_workspace_size = 0;
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
@@ -576,19 +624,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
struct {
unsigned feature;
- enum bch_compression_type type;
+ enum bch_compression_opts type;
size_t compress_workspace;
- size_t decompress_workspace;
} compression_types[] = {
- { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
- 0 },
- { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
- zlib_inflate_workspacesize(), },
- { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
- c->zstd_workspace_size,
- zstd_dctx_workspace_bound() },
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
+ max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize()) },
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
+ max(c->zstd_workspace_size,
+ zstd_dctx_workspace_bound()) },
}, *i;
bool have_compressed = false;
@@ -613,9 +659,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++) {
- decompress_workspace_size =
- max(decompress_workspace_size, i->decompress_workspace);
-
if (!(features & (1 << i->feature)))
continue;
@@ -628,11 +671,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
- if (!mempool_initialized(&c->decompress_workspace) &&
- mempool_init_kvmalloc_pool(&c->decompress_workspace,
- 1, decompress_workspace_size))
- return -BCH_ERR_ENOMEM_decompression_workspace_init;
-
return 0;
}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 8f4c3f0665c4..c6151495985f 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each_reverse(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
#define darray_init(_d) \
do { \
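[Editor's note] The darray.h change above adds `&& (_d).nr` to the reverse-iteration condition: on an empty darray, `data` may still be NULL, so `data + nr - 1` compares as >= `data` and the old form could enter the loop body. Below is a standalone sketch of the same guard, using a plain int array rather than the bcachefs darray types.

  #include <stdio.h>
  #include <stddef.h>

  struct int_darray {
          size_t  nr;
          int     *data;  /* NULL until the first push */
  };

  /*
   * Reverse iteration; the "&& (d)->nr" term keeps the body from running when
   * the array is empty, where data + nr - 1 would point before the array and
   * the pointer comparison alone is not a reliable stop condition.
   */
  #define darray_for_each_reverse(d, i)                            \
          for (int *i = (d)->data + (d)->nr - 1;                   \
               i >= (d)->data && (d)->nr; --i)

  int main(void)
  {
          int vals[] = { 1, 2, 3 };
          struct int_darray full  = { .nr = 3, .data = vals };
          struct int_darray empty = { 0 };

          darray_for_each_reverse(&full, i)
                  printf("%d\n", *i);     /* prints 3 2 1 */

          darray_for_each_reverse(&empty, i)
                  printf("never reached\n");
          return 0;
  }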
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 8e75a852b358..585214931e05 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -110,11 +110,8 @@ static void trace_move_extent_fail2(struct data_update *m,
{
struct bch_fs *c = m->op.c;
struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- const union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct extent_ptr_decoded p;
struct printbuf buf = PRINTBUF;
- unsigned i, rewrites_found = 0;
+ unsigned rewrites_found = 0;
if (!trace_move_extent_fail_enabled())
return;
@@ -122,27 +119,25 @@ static void trace_move_extent_fail2(struct data_update *m,
prt_str(&buf, msg);
if (insert) {
- i = 0;
+ const union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
+
+ unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
- if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached)
- rewrites_found |= 1U << i;
- i++;
+ rewrites_found |= ptr_bit;
+ ptr_bit <<= 1;
}
}
- prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u",
- (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
- (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
- (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
- (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
+ prt_str(&buf, "rewrites found:\t");
+ bch2_prt_u64_base2(&buf, rewrites_found);
+ prt_newline(&buf);
- prt_printf(&buf, "\nrewrites found: %u%u%u%u",
- (rewrites_found & (1 << 0)) != 0,
- (rewrites_found & (1 << 1)) != 0,
- (rewrites_found & (1 << 2)) != 0,
- (rewrites_found & (1 << 3)) != 0);
+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
@@ -194,7 +189,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
struct bpos next_pos;
bool should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- unsigned rewrites_found = 0, durability, i;
+ unsigned rewrites_found = 0, durability, ptr_bit;
bch2_trans_begin(trans);
@@ -231,16 +226,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
*
* First, drop rewrite_ptrs from @new:
*/
- i = 0;
+ ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
- if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
bch2_extent_ptr_set_cached(c, &m->op.opts,
bkey_i_to_s(insert), ptr);
- rewrites_found |= 1U << i;
+ rewrites_found |= ptr_bit;
}
- i++;
+ ptr_bit <<= 1;
}
if (m->data_opts.rewrite_ptrs &&
@@ -323,8 +318,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
- int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id),
- BCH_VALIDATE_commit);
+ int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
+ (struct bkey_validate_context) {
+ .btree = m->btree_id,
+ .flags = BCH_VALIDATE_commit,
+ });
if (invalid) {
struct printbuf buf = PRINTBUF;
@@ -362,7 +360,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
+ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, &op->res,
@@ -540,7 +538,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_newline(out);
prt_str(out, "compression:\t");
- bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ bch2_compression_opt_to_text(out, io_opts->background_compression);
prt_newline(out);
prt_str(out, "opts.replicas:\t");
@@ -614,7 +612,7 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
int ret = 0;
/*
@@ -622,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* and we have to check for this because we go rw before repairing the
* snapshots table - just skip it, we can move it later.
*/
- if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
return -BCH_ERR_data_update_done;
if (!bkey_get_dev_refs(c, k))
@@ -652,22 +650,22 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = background_compression(io_opts);
+ m->op.compression_opt = io_opts.background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
unsigned durability_have = 0, durability_removing = 0;
- i = 0;
+ unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached) {
rcu_read_lock();
- if (BIT(i) & m->data_opts.rewrite_ptrs) {
+ if (ptr_bit & m->data_opts.rewrite_ptrs) {
if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size;
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
durability_removing += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!(BIT(i) & m->data_opts.kill_ptrs)) {
+ } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
@@ -687,7 +685,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
- i++;
+ ptr_bit <<= 1;
}
unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
@@ -750,14 +748,14 @@ int bch2_data_update_init(struct btree_trans *trans,
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i = 0;
+ unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
- if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
- opts->kill_ptrs |= 1U << i;
- opts->rewrite_ptrs ^= 1U << i;
+ if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
+ opts->kill_ptrs |= ptr_bit;
+ opts->rewrite_ptrs ^= ptr_bit;
}
- i++;
+ ptr_bit <<= 1;
}
}
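[Editor's note] The data_update.c hunks above replace an index `i` (tested as `1U << i` / `BIT(i)`) with a walking `ptr_bit` that is shifted once per extent pointer and ANDed directly against the `rewrite_ptrs`/`kill_ptrs` masks. The behaviour is unchanged; the shifted bit just avoids recomputing the mask at every use. Below is a minimal standalone sketch of the normalize step, with illustrative types rather than the bcachefs ones.

  #include <stdio.h>

  struct opts {
          unsigned rewrite_ptrs;  /* bit i set => rewrite pointer i */
          unsigned kill_ptrs;     /* bit i set => drop pointer i    */
  };

  /* Demote cached pointers from "rewrite" to "kill", walking one bit per entry. */
  static void normalize(const int *cached, unsigned nr, struct opts *o)
  {
          unsigned ptr_bit = 1;

          for (unsigned i = 0; i < nr; i++) {
                  if ((o->rewrite_ptrs & ptr_bit) && cached[i]) {
                          o->kill_ptrs    |= ptr_bit;
                          o->rewrite_ptrs ^= ptr_bit;
                  }
                  ptr_bit <<= 1;
          }
  }

  int main(void)
  {
          int cached[] = { 0, 1, 0 };
          struct opts o = { .rewrite_ptrs = 0x7 };

          normalize(cached, 3, &o);
          printf("rewrite=%x kill=%x\n", o.rewrite_ptrs, o.kill_ptrs); /* rewrite=5 kill=2 */
          return 0;
  }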
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 45aec1afdb0e..b5de52a50d10 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -472,7 +472,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level);
+ prt_printf(out, "%px ", b);
+ bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
+ prt_printf(out, "\n");
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index faffc98d5605..600eee936f13 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
};
int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
@@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
- bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
c, dirent_name_too_long,
"dirent name too big (%u > %u)",
d_name.len, BCH_NAME_MAX);
@@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
} else {
target->subvol = le32_to_cpu(d.v->d_child_subvol);
- ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s);
+ ret = bch2_subvolume_get(trans, target->subvol, true, &s);
target->inum = le64_to_cpu(s.inode);
}
@@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32
struct bkey_s_c k;
int ret;
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
SPOS(dir, 0, snapshot),
POS(dir, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
@@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
bch2_bkey_buf_init(&sk);
int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
POS(inum.inum, ctx->pos),
POS(inum.inum, U64_MAX),
inum.subvol, 0, k, ({
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 53ad99666022..362b3b2f2f2e 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -4,10 +4,10 @@
#include "str_hash.h"
-enum bch_validate_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 07eb8fa1b026..b32e91ba8be8 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_
memcpy_u64s_small(acc->v.d, d, nr);
}
+static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
+
int bch2_disk_accounting_mod(struct btree_trans *trans,
struct disk_accounting_pos *k,
s64 *d, unsigned nr, bool gc)
@@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans,
accounting_key_init(&k_i.k, k, d, nr);
- return likely(!gc)
- ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k)
- : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ if (unlikely(gc)) {
+ int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
+ bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ return ret;
+ } else {
+ return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
+ }
}
int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
@@ -127,14 +136,15 @@ static inline bool is_zero(char *start, char *end)
#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, k.k->p);
void *end = &acc_k + 1;
int ret = 0;
- bkey_fsck_err_on(bversion_zero(k.k->bversion),
+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
+ bversion_zero(k.k->bversion),
c, accounting_key_version_0,
"accounting key with version=0");
@@ -217,7 +227,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
prt_printf(out, "id=%u", k->snapshot.id);
break;
case BCH_DISK_ACCOUNTING_btree:
- prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id));
+ prt_str(out, "btree=");
+ bch2_btree_id_to_text(out, k->btree.id);
break;
}
}
@@ -243,10 +254,10 @@ void bch2_accounting_swab(struct bkey_s k)
}
static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
- struct disk_accounting_pos acc)
+ struct disk_accounting_pos *acc)
{
- unsafe_memcpy(r, &acc.replicas,
- replicas_entry_bytes(&acc.replicas),
+ unsafe_memcpy(r, &acc->replicas,
+ replicas_entry_bytes(&acc->replicas),
"variable length struct");
}
@@ -257,7 +268,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_replicas:
- __accounting_to_replicas(r, acc_k);
+ __accounting_to_replicas(r, &acc_k);
return true;
default:
return false;
@@ -322,6 +333,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, NULL);
+
+ if (trace_accounting_mem_insert_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_accounting_to_text(&buf, c, a.s_c);
+ trace_accounting_mem_insert(c, buf.buf);
+ printbuf_exit(&buf);
+ }
return 0;
err:
free_percpu(n.v[1]);
@@ -461,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
return ret;
}
-void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- percpu_down_read(&c->mark_lock);
- out->atomic++;
-
- eytzinger0_for_each(i, acc->k.nr) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos);
-
- bch2_accounting_key_to_text(out, &acc_k);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
- prt_str(out, ":");
- for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
- prt_printf(out, " %llu", v[j]);
- prt_newline(out);
- }
-
- --out->atomic;
- percpu_up_read(&c->mark_lock);
-}
-
static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
{
darray_for_each(acc->k, e) {
@@ -625,7 +618,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
switch (acc.type) {
case BCH_DISK_ACCOUNTING_replicas: {
struct bch_replicas_padded r;
- __accounting_to_replicas(&r.e, acc);
+ __accounting_to_replicas(&r.e, &acc);
for (unsigned i = 0; i < r.e.nr_devs; i++)
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
@@ -699,11 +692,45 @@ int bch2_accounting_read(struct bch_fs *c)
struct btree_trans *trans = bch2_trans_get(c);
struct printbuf buf = PRINTBUF;
- int ret = for_each_btree_key(trans, iter,
- BTREE_ID_accounting, POS_MIN,
+ /*
+ * We might run more than once if we rewind to start topology repair or
+ * btree node scan - and those might cause us to get different results,
+ * so we can't just skip if we've already run.
+ *
+ * Instead, zero out any accounting we have:
+ */
+ percpu_down_write(&c->mark_lock);
+ darray_for_each(acc->k, e)
+ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+ for_each_member_device(c, ca)
+ percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+ percpu_memset(c->usage, 0, sizeof(*c->usage));
+ percpu_up_write(&c->mark_lock);
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ int ret = for_each_btree_key_continue(trans, iter,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
struct bkey u;
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
+
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
+
+ if (!bch2_accounting_is_mem(acc_k)) {
+ struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ continue;
+ }
+
accounting_read_key(trans, k);
}));
if (ret)
@@ -715,6 +742,12 @@ int bch2_accounting_read(struct bch_fs *c)
darray_for_each(*keys, i) {
if (i->k->k.type == KEY_TYPE_accounting) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+
+ if (!bch2_accounting_is_mem(acc_k))
+ continue;
+
struct bkey_s_c k = bkey_i_to_s_c(i->k);
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
sizeof(acc->k.data[0]),
@@ -748,15 +781,16 @@ int bch2_accounting_read(struct bch_fs *c)
keys->gap = keys->nr = dst - keys->data;
percpu_down_write(&c->mark_lock);
- unsigned i = 0;
- while (i < acc->k.nr) {
- unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
+ darray_for_each_reverse(acc->k, i) {
struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
+ bpos_to_disk_accounting_pos(&acc_k, i->pos);
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
+ memset(v, 0, sizeof(v));
+
+ for (unsigned j = 0; j < i->nr_counters; j++)
+ v[j] = percpu_u64_get(i->v[0] + j);
/*
* If the entry counters are zeroed, it should be treated as
@@ -765,26 +799,25 @@ int bch2_accounting_read(struct bch_fs *c)
* Remove it, so that if it's re-added it gets re-marked in the
* superblock:
*/
- ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, acc_k,
- v, acc->k.data[idx].nr_counters);
+ : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
- free_percpu(acc->k.data[idx].v[0]);
- free_percpu(acc->k.data[idx].v[1]);
- darray_remove_item(&acc->k, &acc->k.data[idx]);
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
+ free_percpu(i->v[0]);
+ free_percpu(i->v[1]);
+ darray_remove_item(&acc->k, i);
ret = 0;
continue;
}
if (ret)
goto fsck_err;
- i++;
}
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+
preempt_disable();
struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
@@ -804,7 +837,7 @@ int bch2_accounting_read(struct bch_fs *c)
break;
case BCH_DISK_ACCOUNTING_dev_data_type:
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
if (ca) {
struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
percpu_u64_set(&d->buckets, v[0]);
@@ -881,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
bpos_to_disk_accounting_pos(&acc_k, k.k->p);
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- continue;
+ break;
- if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+ if (!bch2_accounting_is_mem(acc_k)) {
+ struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
continue;
+ }
bch2_accounting_mem_read(c, k.k->p, v, nr);
@@ -910,7 +946,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
break;
case BCH_DISK_ACCOUNTING_dev_data_type: {
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (!ca) {
rcu_read_unlock();
continue;
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index 4ea6c8a092bc..5360cbb3ec29 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_DISK_ACCOUNTING_H
#define _BCACHEFS_DISK_ACCOUNTING_H
+#include "btree_update.h"
#include "eytzinger.h"
#include "sb-members.h"
@@ -62,27 +63,32 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage
static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
{
- acc->_pad = p;
+ BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- bch2_bpos_swab(&acc->_pad);
+ acc->_pad = p;
+#else
+ memcpy_swab(acc, &p, sizeof(p));
#endif
}
-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k)
+static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
{
- struct bpos ret = k->_pad;
-
+ struct bpos p;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- bch2_bpos_swab(&ret);
+ p = acc->_pad;
+#else
+ memcpy_swab(&p, acc, sizeof(p));
#endif
- return ret;
+ return p;
}
int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
s64 *, unsigned, bool);
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(struct bkey_s);
@@ -112,6 +118,12 @@ enum bch_accounting_mode {
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
+{
+ return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
+ acc.type != BCH_DISK_ACCOUNTING_inum;
+}
+
/*
* Update in memory counters so they match the btree update we're doing; called
* from transaction commit path
@@ -126,9 +138,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
bpos_to_disk_accounting_pos(&acc_k, a.k->p);
bool gc = mode == BCH_ACCOUNTING_gc;
- EBUG_ON(gc && !acc->gc_running);
+ if (gc && !acc->gc_running)
+ return 0;
- if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+ if (!bch2_accounting_is_mem(acc_k))
return 0;
if (mode == BCH_ACCOUNTING_normal) {
@@ -141,7 +154,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
break;
case BCH_DISK_ACCOUNTING_dev_data_type:
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (ca) {
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
@@ -204,9 +217,45 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
}
+static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
+{
+ EBUG_ON(!res->ref);
+
+ return (struct bversion) {
+ .hi = res->seq >> 32,
+ .lo = (res->seq << 32) | (res->offset + offset),
+ };
+}
+
+static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
+ struct bkey_i_accounting *a,
+ unsigned commit_flags)
+{
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) a - (u64 *) trans->journal_entries);
+
+ EBUG_ON(bversion_zero(a->k.bversion));
+
+ return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+ : 0;
+}
+
+static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
+ struct bkey_i_accounting *a_i,
+ unsigned commit_flags)
+{
+ if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ struct bkey_s_accounting a = accounting_i_to_s(a_i);
+
+ bch2_accounting_neg(a);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+ bch2_accounting_neg(a);
+ }
+}
+
int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *);
int bch2_gc_accounting_start(struct bch_fs *);
int bch2_gc_accounting_done(struct bch_fs *);
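[Editor's note] `journal_pos_to_bversion()` above derives a unique, monotonically increasing key version for each accounting update from its position in the journal: the upper 32 bits of the journal sequence go into `hi`, and `lo` carries the low 32 bits of the sequence in its top half plus the entry's offset within the journal reservation in its bottom half. Below is a self-contained pack/unpack sketch of that layout; the struct and field order are illustrative and differ from the on-disk bversion.

  #include <stdint.h>
  #include <stdio.h>
  #include <assert.h>

  struct version96 { uint32_t hi; uint64_t lo; };  /* 96-bit version, sketch only */

  static struct version96 pack(uint64_t seq, uint32_t offset)
  {
          return (struct version96) {
                  .hi = seq >> 32,
                  .lo = (seq << 32) | offset,
          };
  }

  static uint64_t unpack_seq(struct version96 v)
  {
          return ((uint64_t) v.hi << 32) | (v.lo >> 32);
  }

  static uint32_t unpack_offset(struct version96 v)
  {
          return (uint32_t) v.lo;
  }

  int main(void)
  {
          struct version96 v = pack(0x123456789abcdefULL, 42);

          assert(unpack_seq(v)    == 0x123456789abcdefULL);
          assert(unpack_offset(v) == 42);
          printf("hi=%x lo=%llx\n", (unsigned) v.hi, (unsigned long long) v.lo);
          return 0;
  }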
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 749dcf368841..b211e90ac54e 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -26,6 +26,7 @@
#include "util.h"
#include <linux/sort.h>
+#include <linux/string_choices.h>
#ifdef __KERNEL__
@@ -109,7 +110,7 @@ struct ec_bio {
/* Stripes btree keys: */
int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
int ret = 0;
@@ -129,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
"invalid csum granularity (%u >= 64)",
s->csum_granularity_bits);
- ret = bch2_bkey_ptrs_validate(c, k, flags);
+ ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
return ret;
}
@@ -304,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
}
if (flags & BTREE_TRIGGER_gc) {
- percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -BCH_ERR_mark_stripe;
- goto err_unlock;
+ goto err;
}
bucket_lock(g);
@@ -318,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
alloc_to_bucket(g, new);
bucket_unlock(g);
-err_unlock:
- percpu_up_read(&c->mark_lock);
+
if (!ret)
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
}
@@ -732,7 +731,7 @@ static void ec_block_endio(struct bio *bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"erasure coding %s error: %s",
- bio_data_dir(bio) ? "write" : "read",
+ str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
@@ -909,7 +908,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
bch2_bkey_val_to_text(&msgbuf, c, orig_k);
bch_err_ratelimited(c,
"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
- printbuf_exit(&msgbuf);;
+ printbuf_exit(&msgbuf);
ret = -BCH_ERR_stripe_reconstruct;
goto out;
}
@@ -1275,11 +1274,11 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct bch_dev *ca,
struct bpos bucket, u8 gen,
struct ec_stripe_buf *s,
- struct bpos *bp_pos)
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
{
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
struct bch_fs *c = trans->c;
- struct bch_backpointer bp;
struct btree_iter iter;
struct bkey_s_c k;
const struct bch_extent_ptr *ptr_c;
@@ -1288,33 +1287,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct bkey_i *n;
int ret, dev, block;
- ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
- bp_pos, &bp, BTREE_ITER_cached);
- if (ret)
- return ret;
- if (bpos_eq(*bp_pos, SPOS_MAX))
- return 0;
-
- if (bp.level) {
+ if (bp.v->level) {
struct printbuf buf = PRINTBUF;
struct btree_iter node_iter;
struct btree *b;
- b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+ b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
bch2_trans_iter_exit(trans, &node_iter);
if (!b)
return 0;
prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
- bch2_backpointer_to_text(&buf, &bp);
+ bch2_bkey_val_to_text(&buf, c, bp.s_c);
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
return -EIO;
}
- k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
+ k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
ret = bkey_err(k);
if (ret)
return ret;
@@ -1373,7 +1365,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_fs *c = trans->c;
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
struct bch_extent_ptr ptr = v->ptrs[block];
- struct bpos bp_pos = POS_MIN;
int ret = 0;
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
@@ -1382,19 +1373,27 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
- while (1) {
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
- if (ret)
- break;
- if (bkey_eq(bp_pos, POS_MAX))
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket_pos),
+ bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc, ({
+ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
break;
- bp_pos = bpos_nosnap_successor(bp_pos);
- }
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ continue;
+ ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
+ bkey_s_c_to_backpointer(bp_k), &last_flushed);
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch2_dev_put(ca);
return ret;
}
@@ -1716,7 +1715,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
set_bkey_val_u64s(&s->k, u64s);
}
-static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s;
@@ -1724,7 +1723,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s)
- return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ return NULL;
mutex_init(&s->lock);
closure_init(&s->iodone, NULL);
@@ -1739,10 +1738,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
ec_stripe_key_init(c, &s->new_stripe.key,
s->nr_data, s->nr_parity,
h->blocksize, h->disk_label);
-
- h->s = s;
- h->nr_created++;
- return 0;
+ return s;
}
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
@@ -1887,25 +1883,26 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
return h;
}
-static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+static int new_stripe_alloc_buckets(struct btree_trans *trans,
+ struct ec_stripe_head *h, struct ec_stripe_new *s,
enum bch_watermark watermark, struct closure *cl)
{
struct bch_fs *c = trans->c;
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
- struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
int ret = 0;
- BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
- BUG_ON(v->nr_redundant != h->s->nr_parity);
+ BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
+ BUG_ON(v->nr_redundant != s->nr_parity);
/* * We bypass the sector allocator which normally does this: */
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
- for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
/*
* Note: we don't yet repair invalid blocks (failed/removed
* devices) when reusing stripes - we still need a codepath to
@@ -1915,21 +1912,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
__clear_bit(v->ptrs[i].dev, devs.d);
- if (i < h->s->nr_data)
+ if (i < s->nr_data)
nr_have_data++;
else
nr_have_parity++;
}
- BUG_ON(nr_have_data > h->s->nr_data);
- BUG_ON(nr_have_parity > h->s->nr_parity);
+ BUG_ON(nr_have_data > s->nr_data);
+ BUG_ON(nr_have_parity > s->nr_parity);
buckets.nr = 0;
- if (nr_have_parity < h->s->nr_parity) {
+ if (nr_have_parity < s->nr_parity) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->parity_stripe,
&devs,
- h->s->nr_parity,
+ s->nr_parity,
&nr_have_parity,
&have_cache, 0,
BCH_DATA_parity,
@@ -1937,14 +1934,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(h->s->blocks_gotten,
- h->s->nr_data + h->s->nr_parity,
- h->s->nr_data);
- BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data + s->nr_parity,
+ s->nr_data);
+ BUG_ON(j >= s->nr_data + s->nr_parity);
- h->s->blocks[j] = buckets.v[i];
+ s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, h->s->blocks_gotten);
+ __set_bit(j, s->blocks_gotten);
}
if (ret)
@@ -1952,11 +1949,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
}
buckets.nr = 0;
- if (nr_have_data < h->s->nr_data) {
+ if (nr_have_data < s->nr_data) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->block_stripe,
&devs,
- h->s->nr_data,
+ s->nr_data,
&nr_have_data,
&have_cache, 0,
BCH_DATA_user,
@@ -1964,13 +1961,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(h->s->blocks_gotten,
- h->s->nr_data, 0);
- BUG_ON(j >= h->s->nr_data);
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data, 0);
+ BUG_ON(j >= s->nr_data);
- h->s->blocks[j] = buckets.v[i];
+ s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, h->s->blocks_gotten);
+ __set_bit(j, s->blocks_gotten);
}
if (ret)
@@ -2016,73 +2013,78 @@ static s64 get_existing_stripe(struct bch_fs *c,
return ret;
}
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
{
- struct bch_fs *c = trans->c;
- struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
- struct bch_stripe *existing_v;
+ struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
unsigned i;
- s64 idx;
- int ret;
-
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
-
- ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
- "reading stripe key: %s", bch2_err_str(ret));
- if (ret) {
- bch2_stripe_close(c, h->s);
- return ret;
- }
- existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
-
- BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
- h->s->nr_data = existing_v->nr_blocks -
+ BUG_ON(existing_v->nr_redundant != s->nr_parity);
+ s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
- ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+ int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
- bch2_stripe_close(c, h->s);
+ bch2_stripe_close(c, s);
return ret;
}
- BUG_ON(h->s->existing_stripe.size != h->blocksize);
- BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+ BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
/*
* Free buckets we initially allocated - they might conflict with
* blocks from the stripe we're reusing:
*/
- for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
- h->s->blocks[i] = 0;
+ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
+ s->blocks[i] = 0;
}
- memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
- memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+ memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
+ memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
- for (i = 0; i < existing_v->nr_blocks; i++) {
+ for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, h->s->blocks_gotten);
- __set_bit(i, h->s->blocks_allocated);
+ __set_bit(i, s->blocks_gotten);
+ __set_bit(i, s->blocks_allocated);
}
- ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
}
- bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
- h->s->have_existing_stripe = true;
+ bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
+ s->have_existing_stripe = true;
return 0;
}
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
+ struct ec_stripe_new *s)
+{
+ struct bch_fs *c = trans->c;
+ s64 idx;
+ int ret;
+
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
+ idx = get_existing_stripe(c, h);
+ if (idx < 0)
+ return -BCH_ERR_stripe_alloc_blocked;
+
+ ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+ "reading stripe key: %s", bch2_err_str(ret));
+ if (ret) {
+ bch2_stripe_close(c, s);
+ return ret;
+ }
+
+ return init_new_stripe_from_existing(c, s);
+}
+
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
+ struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -2091,15 +2093,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
- if (!h->s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &h->s->res,
+ if (!s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &s->res,
h->blocksize,
- h->s->nr_parity,
+ s->nr_parity,
BCH_DISK_RESERVATION_NOFAIL);
if (ret)
return ret;
}
+ /*
+ * Allocate stripe slot
+ * XXX: we're going to need a bitrange btree of free stripes
+ */
for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
@@ -2114,7 +2120,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
}
if (bkey_deleted(k.k) &&
- bch2_try_open_stripe(c, h->s, k.k->p.offset))
+ bch2_try_open_stripe(c, s, k.k->p.offset))
break;
}
@@ -2125,16 +2131,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
ret = ec_stripe_mem_alloc(trans, &iter);
if (ret) {
- bch2_stripe_close(c, h->s);
+ bch2_stripe_close(c, s);
goto err;
}
- h->s->new_stripe.key.k.p = iter.pos;
+ s->new_stripe.key.k.p = iter.pos;
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
err:
- bch2_disk_reservation_put(c, &h->s->res);
+ bch2_disk_reservation_put(c, &s->res);
goto out;
}
@@ -2165,22 +2171,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
return h;
if (!h->s) {
- ret = ec_new_stripe_alloc(c, h);
- if (ret) {
+ h->s = ec_new_stripe_alloc(c, h);
+ if (!h->s) {
+ ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
bch_err(c, "failed to allocate new stripe");
goto err;
}
+
+ h->nr_created++;
}
- if (h->s->allocated)
+ struct ec_stripe_new *s = h->s;
+
+ if (s->allocated)
goto allocated;
- if (h->s->have_existing_stripe)
+ if (s->have_existing_stripe)
goto alloc_existing;
/* First, try to allocate a full stripe: */
- ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h);
+ ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h, s);
if (!ret)
goto allocate_buf;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
@@ -2192,15 +2203,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
* existing stripe:
*/
while (1) {
- ret = __bch2_ec_stripe_head_reuse(trans, h);
+ ret = __bch2_ec_stripe_head_reuse(trans, h, s);
if (!ret)
break;
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
if (watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h);
+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h, s);
if (ret)
goto err;
goto allocate_buf;
@@ -2218,19 +2229,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
* Retry allocating buckets, with the watermark for this
* particular write:
*/
- ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
if (ret)
goto err;
allocate_buf:
- ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
+ ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
if (ret)
goto err;
- h->s->allocated = true;
+ s->allocated = true;
allocated:
- BUG_ON(!h->s->idx);
- BUG_ON(!h->s->new_stripe.data[0]);
+ BUG_ON(!s->idx);
+ BUG_ON(!s->new_stripe.data[0]);
BUG_ON(trans->restarted);
return h;
err:
@@ -2295,7 +2306,7 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_
int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
{
return bch2_trans_run(c,
- for_each_btree_key_upto_commit(trans, iter,
+ for_each_btree_key_max_commit(trans, iter,
BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
BTREE_ITER_intent, k,
NULL, NULL, 0, ({
@@ -2458,11 +2469,9 @@ void bch2_fs_ec_exit(struct bch_fs *c)
while (1) {
mutex_lock(&c->ec_stripe_head_lock);
- h = list_first_entry_or_null(&c->ec_stripe_head_list,
- struct ec_stripe_head, list);
- if (h)
- list_del(&h->list);
+ h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
mutex_unlock(&c->ec_stripe_head_lock);
+
if (!h)
break;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 43326370b410..583ca6a226da 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -6,9 +6,8 @@
#include "buckets_types.h"
#include "extents_types.h"
-enum bch_validate_flags;
-
-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 9c4fe5cdbfb7..4590cd0c7c90 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -54,7 +54,8 @@
x(ENOMEM, ENOMEM_compression_bounce_read_init) \
x(ENOMEM, ENOMEM_compression_bounce_write_init) \
x(ENOMEM, ENOMEM_compression_workspace_init) \
- x(ENOMEM, ENOMEM_decompression_workspace_init) \
+ x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \
+ x(EIO, compression_workspace_not_initialized) \
x(ENOMEM, ENOMEM_bucket_gens) \
x(ENOMEM, ENOMEM_buckets_nouse) \
x(ENOMEM, ENOMEM_usage_init) \
@@ -116,6 +117,8 @@
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(ENOENT, ENOENT_inode_no_backpointer) \
+ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
@@ -148,6 +151,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(BCH_ERR_transaction_restart, transaction_restart_commit) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \
x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
@@ -164,7 +168,6 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
x(0, backpointer_to_overwritten_btree_node) \
- x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
x(EINVAL, fsck) \
x(BCH_ERR_fsck, fsck_fix) \
@@ -173,7 +176,9 @@
x(BCH_ERR_fsck, fsck_errors_not_fixed) \
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(0, restart_recovery) \
+ x(EINVAL, restart_recovery) \
+ x(EINVAL, not_in_recovery) \
+ x(EINVAL, cannot_rewind_recovery) \
x(0, data_update_done) \
x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \
@@ -192,7 +197,9 @@
x(EINVAL, opt_parse_error) \
x(EINVAL, remove_with_metadata_missing_unimplemented)\
x(EINVAL, remove_would_lose_data) \
- x(EINVAL, btree_iter_with_journal_not_supported) \
+ x(EINVAL, no_resize_with_buckets_nouse) \
+ x(EINVAL, inode_unpack_error) \
+ x(EINVAL, varint_decode_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -241,7 +248,10 @@
x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+ x(EIO, journal_shutdown) \
+ x(EIO, journal_flush_err) \
x(EIO, btree_node_read_err) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
x(EIO, sb_not_downgraded) \
x(EIO, btree_node_write_all_failed) \
x(EIO, btree_node_read_error) \
@@ -257,6 +267,8 @@
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
+ x(EIO, no_encryption_key) \
+ x(EIO, insufficient_journal_devices) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -305,6 +317,7 @@ static inline long bch2_err_class(long err)
#define BLK_STS_REMOVED ((__force blk_status_t)128)
+#include <linux/blk_types.h>
const char *bch2_blk_status_to_str(blk_status_t);
#endif /* _BCACHFES_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index b679def8fb98..038da6a61f6b 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "btree_iter.h"
#include "error.h"
+#include "fs-common.h"
#include "journal.h"
#include "recovery_passes.h"
#include "super.h"
@@ -33,7 +35,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
int bch2_topology_error(struct bch_fs *c)
{
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
bch2_inconsistent_error(c);
return -BCH_ERR_btree_need_topology_repair;
} else {
@@ -218,6 +220,30 @@ static const u8 fsck_flags_extra[] = {
#undef x
};
+static int do_fsck_ask_yn(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct printbuf *question,
+ const char *action)
+{
+ prt_str(question, ", ");
+ prt_str(question, action);
+
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s", question->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, question->buf);
+
+ int ask = bch2_fsck_ask_yn(c, trans);
+
+ if (trans) {
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ret;
+ }
+
+ return ask;
+}
+
int __bch2_fsck_err(struct bch_fs *c,
struct btree_trans *trans,
enum bch_fsck_flags flags,
@@ -226,7 +252,7 @@ int __bch2_fsck_err(struct bch_fs *c,
{
struct fsck_err_state *s = NULL;
va_list args;
- bool print = true, suppressing = false, inconsistent = false;
+ bool print = true, suppressing = false, inconsistent = false, exiting = false;
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
const char *action_orig = "fix?", *action = action_orig;
@@ -256,9 +282,10 @@ int __bch2_fsck_err(struct bch_fs *c,
!trans &&
bch2_current_has_btree_trans(c));
- if ((flags & FSCK_CAN_FIX) &&
- test_bit(err, c->sb.errors_silent))
- return -BCH_ERR_fsck_fix;
+ if (test_bit(err, c->sb.errors_silent))
+ return flags & FSCK_CAN_FIX
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
bch2_sb_error_count(c, err);
@@ -289,16 +316,14 @@ int __bch2_fsck_err(struct bch_fs *c,
*/
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
- mutex_unlock(&c->fsck_error_msgs_lock);
- goto err;
+ goto err_unlock;
}
kfree(s->last_msg);
s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
if (!s->last_msg) {
- mutex_unlock(&c->fsck_error_msgs_lock);
ret = -ENOMEM;
- goto err;
+ goto err_unlock;
}
if (c->opts.ratelimit_errors &&
@@ -318,13 +343,19 @@ int __bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if ((flags & FSCK_CAN_FIX) &&
- (flags & FSCK_AUTOFIX) &&
+ if ((flags & FSCK_AUTOFIX) &&
(c->opts.errors == BCH_ON_ERROR_continue ||
c->opts.errors == BCH_ON_ERROR_fix_safe)) {
prt_str(out, ", ");
- prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ if (flags & FSCK_CAN_FIX) {
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+
+ goto print;
} else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
@@ -348,31 +379,18 @@ int __bch2_fsck_err(struct bch_fs *c,
: c->opts.fix_errors;
if (fix == FSCK_FIX_ask) {
- prt_str(out, ", ");
- prt_str(out, action);
-
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", out->buf);
- else
- bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- int ask = bch2_fsck_ask_yn(c, trans);
-
- if (trans) {
- ret = bch2_trans_relock(trans);
- if (ret) {
- mutex_unlock(&c->fsck_error_msgs_lock);
- goto err;
- }
- }
+ ret = do_fsck_ask_yn(c, trans, out, action);
+ if (ret < 0)
+ goto err_unlock;
- if (ask >= YN_ALLNO && s)
- s->fix = ask == YN_ALLNO
+ if (ret >= YN_ALLNO && s)
+ s->fix = ret == YN_ALLNO
? FSCK_FIX_no
: FSCK_FIX_yes;
- ret = ask & 1
+ ret = ret & 1
? -BCH_ERR_fsck_fix
: -BCH_ERR_fsck_ignore;
} else if (fix == FSCK_FIX_yes ||
@@ -385,9 +403,7 @@ int __bch2_fsck_err(struct bch_fs *c,
prt_str(out, ", not ");
prt_actioning(out, action);
}
- } else if (flags & FSCK_NEED_FSCK) {
- prt_str(out, " (run fsck to correct)");
- } else {
+ } else if (!(flags & FSCK_CAN_IGNORE)) {
prt_str(out, " (repair unimplemented)");
}
@@ -396,14 +412,13 @@ int __bch2_fsck_err(struct bch_fs *c,
!(flags & FSCK_CAN_IGNORE)))
ret = -BCH_ERR_fsck_errors_not_fixed;
- bool exiting =
- test_bit(BCH_FS_fsck_running, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore);
-
- if (exiting)
+ if (test_bit(BCH_FS_fsck_running, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)) {
+ exiting = true;
print = true;
-
+ }
+print:
if (print) {
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s\n", out->buf);
@@ -419,17 +434,24 @@ int __bch2_fsck_err(struct bch_fs *c,
if (s)
s->ret = ret;
- mutex_unlock(&c->fsck_error_msgs_lock);
-
if (inconsistent)
bch2_inconsistent_error(c);
- if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
+ /*
+ * We don't yet track whether the filesystem currently has errors, for
+ * log_fsck_err()s: that would require us to track for every error type
+ * which recovery pass corrects it, to get the fsck exit status correct:
+ */
+ if (flags & FSCK_CAN_FIX) {
+ if (ret == -BCH_ERR_fsck_fix) {
+ set_bit(BCH_FS_errors_fixed, &c->flags);
+ } else {
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
+ }
}
+err_unlock:
+ mutex_unlock(&c->fsck_error_msgs_lock);
err:
if (action != action_orig)
kfree(action);
@@ -437,28 +459,52 @@ int __bch2_fsck_err(struct bch_fs *c,
return ret;
}
+static const char * const bch2_bkey_validate_contexts[] = {
+#define x(n) #n,
+ BKEY_VALIDATE_CONTEXTS()
+#undef x
+ NULL
+};
+
int __bch2_bkey_fsck_err(struct bch_fs *c,
struct bkey_s_c k,
- enum bch_validate_flags validate_flags,
+ struct bkey_validate_context from,
enum bch_sb_error_id err,
const char *fmt, ...)
{
- if (validate_flags & BCH_VALIDATE_silent)
+ if (from.flags & BCH_VALIDATE_silent)
return -BCH_ERR_fsck_delete_bkey;
unsigned fsck_flags = 0;
- if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)))
+ if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
+ if (test_bit(err, c->sb.errors_silent))
+ return -BCH_ERR_fsck_delete_bkey;
+
fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
+ }
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ fsck_flags |= fsck_flags_extra[err];
struct printbuf buf = PRINTBUF;
- va_list args;
+ prt_printf(&buf, "invalid bkey in %s",
+ bch2_bkey_validate_contexts[from.from]);
+
+ if (from.from == BKEY_VALIDATE_journal)
+ prt_printf(&buf, " journal seq=%llu offset=%u",
+ from.journal_seq, from.journal_offset);
+
+ prt_str(&buf, " btree=");
+ bch2_btree_id_to_text(&buf, from.btree);
+ prt_printf(&buf, " level=%u: ", from.level);
- prt_str(&buf, "invalid bkey ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\n ");
+
+ va_list args;
va_start(args, fmt);
prt_vprintf(&buf, fmt, args);
va_end(args);
+
prt_str(&buf, ": delete?");
int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
@@ -483,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
+
+int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
+{
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ /* XXX: we don't yet attempt to print paths when we don't know the subvol */
+ if (inum.subvol)
+ ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
+ if (!inum.subvol || ret)
+ prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
+
+ return trans_was_restarted(trans, restart_count);
+}
+
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ subvol_inum inum, u64 offset)
+{
+ int ret = bch2_inum_err_msg_trans(trans, out, inum);
+ prt_printf(out, " offset %llu: ", offset);
+ return ret;
+}
+
+void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
+{
+ bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
+}
+
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ subvol_inum inum, u64 offset)
+{
+ bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 6551ada926b6..7acf2a27ca28 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -45,32 +45,11 @@ int bch2_topology_error(struct bch_fs *);
bch2_inconsistent_error(c); \
})
-#define bch2_fs_inconsistent_on(cond, c, ...) \
+#define bch2_fs_inconsistent_on(cond, ...) \
({ \
bool _ret = unlikely(!!(cond)); \
- \
- if (_ret) \
- bch2_fs_inconsistent(c, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * Later we might want to mark only the particular device inconsistent, not the
- * entire filesystem:
- */
-
-#define bch2_dev_inconsistent(ca, ...) \
-do { \
- bch_err(ca, __VA_ARGS__); \
- bch2_inconsistent_error((ca)->fs); \
-} while (0)
-
-#define bch2_dev_inconsistent_on(cond, ca, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- \
if (_ret) \
- bch2_dev_inconsistent(ca, __VA_ARGS__); \
+ bch2_fs_inconsistent(__VA_ARGS__); \
_ret; \
})
@@ -123,9 +102,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
void bch2_flush_fsck_errs(struct bch_fs *);
-#define __fsck_err(c, _flags, _err_type, ...) \
+#define fsck_err_wrap(_do) \
({ \
- int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \
+ int _ret = _do; \
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
ret = _ret; \
@@ -135,6 +114,8 @@ void bch2_flush_fsck_errs(struct bch_fs *);
_ret == -BCH_ERR_fsck_fix; \
})
+#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
+
/* These macros return true if error should be fixed: */
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
@@ -149,12 +130,6 @@ void bch2_flush_fsck_errs(struct bch_fs *);
(unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
})
-#define need_fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
-
-#define need_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
-
#define mustfix_fsck_err(c, _err_type, ...) \
__fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
@@ -167,11 +142,22 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
+#define log_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
+
+#define log_fsck_err_on(cond, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ if (_ret) \
+ log_fsck_err(__VA_ARGS__); \
+ _ret; \
+})
+
enum bch_validate_flags;
__printf(5, 6)
int __bch2_bkey_fsck_err(struct bch_fs *,
struct bkey_s_c,
- enum bch_validate_flags,
+ struct bkey_validate_context from,
enum bch_sb_error_id,
const char *, ...);
@@ -181,7 +167,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *,
*/
#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
do { \
- int _ret = __bch2_bkey_fsck_err(c, k, flags, \
+ int _ret = __bch2_bkey_fsck_err(c, k, from, \
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
@@ -252,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
_ret; \
})
+int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
+int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
+
+void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
+void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
+
#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 5f4fecb358da..6aac579a692a 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
break;
case KEY_TYPE_reflink_p: {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = le64_to_cpu(p.v->idx);
+ u64 idx = REFLINK_P_IDX(p.v);
unsigned sectors = bpos_min(*end, p.k->p).offset -
bkey_start_offset(p.k);
struct btree_iter iter;
@@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
bch2_trans_copy_iter(&copy, iter);
- for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 37e3d69bec06..05d5f71a7ca9 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -21,6 +21,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -88,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c,
u64 l1 = dev_latency(c, p1.ptr.dev);
u64 l2 = dev_latency(c, p2.ptr.dev);
+ /*
+ * Square the latencies, to bias more in favor of the faster
+ * device - we never want to stop issuing reads to the slower
+ * device altogether, so that we can update our latency numbers:
+ */
+ l1 *= l1;
+ l2 *= l2;
+
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
@@ -169,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -177,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
c, btree_ptr_val_too_big,
"value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- ret = bch2_bkey_ptrs_validate(c, k, flags);
+ ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
return ret;
}
@@ -189,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
@@ -203,12 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
c, btree_ptr_v2_min_key_bad,
"min_key > key");
- if (flags & BCH_VALIDATE_write)
+ if ((from.flags & BCH_VALIDATE_write) &&
+ c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
bkey_fsck_err_on(!bp.v->sectors_written,
c, btree_ptr_v2_written_0,
"sectors_written == 0");
- ret = bch2_bkey_ptrs_validate(c, k, flags);
+ ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
return ret;
}
@@ -395,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
int ret = 0;
@@ -1120,6 +1130,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr
bch2_prt_compression_type(out, crc->compression_type);
}
+static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct bch_extent_rebalance *r)
+{
+ prt_str(out, "rebalance:");
+
+ prt_printf(out, " replicas=%u", r->data_replicas);
+ if (r->data_replicas_from_inode)
+ prt_str(out, " (inode)");
+
+ prt_str(out, " checksum=");
+ bch2_prt_csum_opt(out, r->data_checksum);
+ if (r->data_checksum_from_inode)
+ prt_str(out, " (inode)");
+
+ if (r->background_compression || r->background_compression_from_inode) {
+ prt_str(out, " background_compression=");
+ bch2_compression_opt_to_text(out, r->background_compression);
+
+ if (r->background_compression_from_inode)
+ prt_str(out, " (inode)");
+ }
+
+ if (r->background_target || r->background_target_from_inode) {
+ prt_str(out, " background_target=");
+ if (c)
+ bch2_target_to_text(out, c, r->background_target);
+ else
+ prt_printf(out, "%u", r->background_target);
+
+ if (r->background_target_from_inode)
+ prt_str(out, " (inode)");
+ }
+
+ if (r->promote_target || r->promote_target_from_inode) {
+ prt_str(out, " promote_target=");
+ if (c)
+ bch2_target_to_text(out, c, r->promote_target);
+ else
+ prt_printf(out, "%u", r->promote_target);
+
+ if (r->promote_target_from_inode)
+ prt_str(out, " (inode)");
+ }
+
+ if (r->erasure_code || r->erasure_code_from_inode) {
+ prt_printf(out, " ec=%u", r->erasure_code);
+ if (r->erasure_code_from_inode)
+ prt_str(out, " (inode)");
+ }
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -1155,18 +1216,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
(u64) ec->idx, ec->block);
break;
}
- case BCH_EXTENT_ENTRY_rebalance: {
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- prt_str(out, "rebalance: target ");
- if (c)
- bch2_target_to_text(out, c, r->target);
- else
- prt_printf(out, "%u", r->target);
- prt_str(out, " compression ");
- bch2_compression_opt_to_text(out, r->compression);
+ case BCH_EXTENT_ENTRY_rebalance:
+ bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
break;
- }
+
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1178,13 +1231,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
static int extent_ptr_validate(struct bch_fs *c,
struct bkey_s_c k,
- enum bch_validate_flags flags,
+ struct bkey_validate_context from,
const struct bch_extent_ptr *ptr,
unsigned size_ondisk,
bool metadata)
{
int ret = 0;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr2)
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
+ c, ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
+
/* bad pointers are repaired by check_fix_ptrs(): */
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
@@ -1199,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c,
unsigned bucket_size = ca->mi.bucket_size;
rcu_read_unlock();
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr2)
- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
- c, ptr_to_duplicate_device,
- "multiple pointers to same device (%u)", ptr->dev);
-
-
bkey_fsck_err_on(bucket >= nbuckets,
c, ptr_after_last_bucket,
"pointer past last bucket (%llu > %llu)", bucket, nbuckets);
@@ -1221,7 +1273,7 @@ static int extent_ptr_validate(struct bch_fs *c,
}
int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1248,7 +1300,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false);
+ ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
if (ret)
return ret;
@@ -1270,9 +1322,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_too_small,
- "checksum offset + key size > uncompressed size");
bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
c, ptr_crc_csum_type_unknown,
"invalid checksum type");
@@ -1280,6 +1329,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
c, ptr_crc_compression_type_unknown,
"invalid compression type");
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
+ c, ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+ bkey_fsck_err_on(!crc_is_compressed(crc) &&
+ crc.compressed_size != crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_mismatch,
+ "not compressed but compressed != uncompressed size");
+
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
@@ -1293,12 +1355,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
"redundant crc entry");
crc_since_last_ptr = true;
- bkey_fsck_err_on(crc_is_encoded(crc) &&
- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
- c, ptr_crc_uncompressed_size_too_big,
- "too large encoded extent");
-
size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
@@ -1391,166 +1447,6 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
- return &entry->rebalance;
-
- return NULL;
-}
-
-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
- unsigned target, unsigned compression)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned rewrite_ptrs = 0;
-
- if (compression) {
- unsigned compression_type = bch2_compression_opt_to_type(compression);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned i = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- rewrite_ptrs = 0;
- goto incompressible;
- }
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- rewrite_ptrs |= 1U << i;
- i++;
- }
- }
-incompressible:
- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
- unsigned i = 0;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
- rewrite_ptrs |= 1U << i;
- i++;
- }
- }
-
- return rewrite_ptrs;
-}
-
-bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-
- /*
- * If it's an indirect extent, we don't delete the rebalance entry when
- * done so that we know what options were applied - check if it still
- * needs work done:
- */
- if (r &&
- k.k->type == KEY_TYPE_reflink_v &&
- !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
- r = NULL;
-
- return r != NULL;
-}
-
-static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
- unsigned target, unsigned compression)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 sectors = 0;
-
- if (compression) {
- unsigned compression_type = bch2_compression_opt_to_type(compression);
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- sectors = 0;
- goto incompressible;
- }
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- sectors += p.crc.compressed_size;
- }
- }
-incompressible:
- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
- sectors += p.crc.compressed_size;
- }
-
- return sectors;
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-
- return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0;
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- struct bch_io_opts *opts)
-{
- struct bkey_s k = bkey_i_to_s(_k);
- struct bch_extent_rebalance *r;
- unsigned target = opts->background_target;
- unsigned compression = background_compression(*opts);
- bool needs_rebalance;
-
- if (!bkey_extent_is_direct_data(k.k))
- return 0;
-
- /* get existing rebalance entry: */
- r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- if (r) {
- if (k.k->type == KEY_TYPE_reflink_v) {
- /*
- * indirect extents: existing options take precedence,
- * so that we don't move extents back and forth if
- * they're referenced by different inodes with different
- * options:
- */
- if (r->target)
- target = r->target;
- if (r->compression)
- compression = r->compression;
- }
-
- r->target = target;
- r->compression = compression;
- }
-
- needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
-
- if (needs_rebalance && !r) {
- union bch_extent_entry *new = bkey_val_end(k);
-
- new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
- new->rebalance.compression = compression;
- new->rebalance.target = target;
- new->rebalance.unused = 0;
- k.k->u64s += extent_entry_u64s(new);
- } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
- /*
- * For indirect extents, don't delete the rebalance entry when
- * we're finished so that we know we specifically moved it or
- * compressed it to its current location/compression type
- */
- extent_entry_drop(k, (union bch_extent_entry *) r);
- }
-
- return 0;
-}
-
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1610,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
case KEY_TYPE_reflink_p: {
struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
- le64_add_cpu(&p.v->idx, sub);
+ SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
break;
}
case KEY_TYPE_inline_data:
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index bcffcf60aaaf..620b284aa34f 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -8,7 +8,6 @@
struct bch_fs;
struct btree_trans;
-enum bch_validate_flags;
/* extent entries: */
@@ -410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
@@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
struct bch_extent_ptr ptr2)
@@ -710,15 +709,6 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
void bch2_ptr_swab(struct bkey_s);
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
- unsigned, unsigned);
-bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- struct bch_io_opts *);
-
/* Generic extent code: */
enum bch_extent_overlap {
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
index 3bd2fdbb0817..c198dfc376d6 100644
--- a/fs/bcachefs/extents_format.h
+++ b/fs/bcachefs/extents_format.h
@@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr {
#endif
};
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
+/* bch_extent_rebalance: */
+#include "rebalance_format.h"
union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 7e10a9ddcfd9..2c3d46ac70c6 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans,
if (!snapshot_src.inum) {
/* Inode wasn't specified, just snapshot: */
struct bch_subvolume s;
-
- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
- BTREE_ITER_cached, &s);
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
if (ret)
goto err;
@@ -172,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_dir_offset = dir_offset;
}
+ if (S_ISDIR(mode) &&
+ !new_inode->bi_subvol)
+ new_inode->bi_depth = dir_u->bi_depth + 1;
+
inode_iter.flags &= ~BTREE_ITER_all_snapshots;
bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
@@ -512,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans,
dst_dir_u->bi_nlink++;
}
+ if (S_ISDIR(src_inode_u->bi_mode) &&
+ !src_inode_u->bi_subvol)
+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ S_ISDIR(dst_inode_u->bi_mode) &&
+ !dst_inode_u->bi_subvol)
+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
+
if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
@@ -548,3 +559,94 @@ int bch2_rename_trans(struct btree_trans *trans,
bch2_trans_iter_exit(trans, &src_dir_iter);
return ret;
}
+
+static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+
+ unsigned can_print = min(n, printbuf_remaining(out));
+
+ b += n;
+
+ for (unsigned i = 0; i < can_print; i++)
+ out->buf[out->pos++] = *((char *) --b);
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str_reversed(struct printbuf *out, const char *s)
+{
+ prt_bytes_reversed(out, s, strlen(s));
+}
+
+static inline void reverse_bytes(void *b, size_t n)
+{
+ char *e = b + n, *s = b;
+
+ while (s < e) {
+ --e;
+ swap(*s, *e);
+ s++;
+ }
+}
+
+/* XXX: we don't yet attempt to print paths when we don't know the subvol */
+int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
+{
+ unsigned orig_pos = path->pos;
+ int ret = 0;
+
+ while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
+ inum.inum == BCACHEFS_ROOT_INO)) {
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
+ if (ret)
+ goto disconnected;
+
+ if (!inode.bi_dir && !inode.bi_dir_offset) {
+ ret = -BCH_ERR_ENOENT_inode_no_backpointer;
+ goto disconnected;
+ }
+
+ inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
+ inum.inum = inode.bi_dir;
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto disconnected;
+
+ struct btree_iter d_iter;
+ struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
+ BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
+ 0, dirent);
+ ret = bkey_err(d.s_c);
+ if (ret)
+ goto disconnected;
+
+ struct qstr dirent_name = bch2_dirent_get_name(d);
+ prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
+
+ prt_char(path, '/');
+
+ bch2_trans_iter_exit(trans, &d_iter);
+ }
+
+ if (orig_pos == path->pos)
+ prt_char(path, '/');
+out:
+ ret = path->allocation_failure ? -ENOMEM : 0;
+ if (ret)
+ goto err;
+
+ reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
+ return 0;
+err:
+ return ret;
+disconnected:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ prt_str_reversed(path, "(disconnected)");
+ goto out;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index c934e807b380..2b59210bb5e8 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *,
bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
+int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
+
#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 95972809e76d..ab1d5db2fa56 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans,
BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
- unsigned bytes, sectors, offset_into_extent;
+ unsigned bytes, sectors;
+ s64 offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans,
k = bkey_i_to_s_c(sk.k);
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
if (readpages_iter) {
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
@@ -230,10 +231,12 @@ static void bchfs_read(struct btree_trans *trans,
bch2_trans_iter_exit(trans, &iter);
if (ret) {
- bch_err_inum_offset_ratelimited(c,
- iter.pos.inode,
- iter.pos.offset << 9,
- "read error %i from btree lookup", ret);
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
}
@@ -248,6 +251,7 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_io_opts opts;
struct folio *folio;
struct readpages_iter readpages_iter;
+ struct blk_plug plug;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
@@ -255,6 +259,16 @@ void bch2_readahead(struct readahead_control *ractl)
if (ret)
return;
+ /*
+ * Besides being a general performance optimization, plugging helps with
+ * avoiding btree transaction srcu warnings - submitting a bio can
+	 * block, and we don't want to do that with the transaction locked.
+ *
+ * However, plugged bios are submitted when we schedule; we ideally
+ * would have our own scheduler hook to call unlock_long() before
+ * scheduling.
+ */
+ blk_start_plug(&plug);
bch2_pagecache_add_get(inode);
struct btree_trans *trans = bch2_trans_get(c);
@@ -281,7 +295,7 @@ void bch2_readahead(struct readahead_control *ractl)
bch2_trans_put(trans);
bch2_pagecache_add_put(inode);
-
+ blk_finish_plug(&plug);
darray_exit(&readpages_iter.folios);
}
@@ -296,9 +310,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_read_bio *rbio;
struct bch_io_opts opts;
+ struct blk_plug plug;
int ret;
DECLARE_COMPLETION_ONSTACK(done);
+ BUG_ON(folio_test_uptodate(folio));
+ BUG_ON(folio_test_dirty(folio));
+
if (!bch2_folio_create(folio, GFP_KERNEL))
return -ENOMEM;
@@ -313,7 +331,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+ blk_start_plug(&plug);
bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
+ blk_finish_plug(&plug);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -605,15 +625,6 @@ static int __bch2_writepage(struct folio *folio,
BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
sectors << 9, offset << 9));
- /* Check for writing past i_size: */
- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags),
- "writing past i_size: %llu > %llu (unrounded %llu)\n",
- bio_end_sector(&w->io->op.wbio.bio) << 9,
- round_up(i_size, block_bytes(c)),
- i_size);
-
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
w->io->op.new_i_size = i_size;
@@ -669,7 +680,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
FGP_WRITEBEGIN | fgf_set_order(len),
mapping_gfp_mask(mapping));
- if (IS_ERR_OR_NULL(folio))
+ if (IS_ERR(folio))
goto err_unlock;
offset = pos - folio_pos(folio);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 6d3a05ae5da8..2089c36b5866 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct bch_io_opts opts;
struct dio_read *dio;
struct bio *bio;
+ struct blk_plug plug;
loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
size_t shorten;
@@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
*/
dio->should_dirty = iter_is_iovec(iter);
+ blk_start_plug(&plug);
+
goto start;
while (iter->count) {
bio = bio_alloc_bioset(NULL,
@@ -160,6 +163,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
}
+ blk_finish_plug(&plug);
+
iter->count += shorten;
if (sync) {
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index 1d4910ea0f1d..e072900e6a5b 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
break;
f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (IS_ERR_OR_NULL(f))
+ if (IS_ERR(f))
break;
BUG_ON(fs->nr && folio_pos(f) != pos);
@@ -199,7 +199,7 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
unsigned folio_idx = 0;
return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
POS(inum.inum, offset),
POS(inum.inum, U64_MAX),
inum.subvol, BTREE_ITER_slots, k, ({
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 2456c41b215e..94bf34b9b65f 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
/* fsync: */
+static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
+ u64 *seq)
+{
+ struct printbuf buf = PRINTBUF;
+ struct bch_inode_unpacked u;
+ struct btree_iter iter;
+ int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
+ if (ret)
+ return ret;
+
+ u64 cur_seq = journal_cur_seq(&trans->c->journal);
+ *seq = min(cur_seq, u.bi_journal_seq);
+
+ if (fsck_err_on(u.bi_journal_seq > cur_seq,
+ trans, inode_journal_seq_in_future,
+ "inode journal seq in future (currently at %llu)\n%s",
+ cur_seq,
+ (bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_journal_seq = cur_seq;
+ ret = bch2_inode_write(trans, &iter, &u);
+ }
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
/*
* inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
* insert trigger: look up the btree inode instead
@@ -180,9 +208,10 @@ static int bch2_flush_inode(struct bch_fs *c,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
return -EROFS;
- struct bch_inode_unpacked u;
- int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
- bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?:
+ u64 seq;
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
+ bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
return ret;
@@ -222,7 +251,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos end)
{
return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
subvol, 0, k, ({
bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
})));
@@ -256,7 +285,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
folio = __filemap_get_folio(mapping, index,
FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (IS_ERR_OR_NULL(folio)) {
+ if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
}
@@ -806,7 +835,7 @@ static int quota_reserve_range(struct bch_inode_info *inode,
u64 sectors = end - start;
int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter,
+ for_each_btree_key_in_subvolume_max(trans, iter,
BTREE_ID_extents,
POS(inode->v.i_ino, start),
POS(inode->v.i_ino, end - 1),
@@ -877,11 +906,18 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
bch2_mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);
+ /*
+ * XXX: we'd like to be telling bch2_remap_range() if we have
+ * permission to write to the source file, and thus if io path option
+ * changes should be propagated through the copy, but we need mnt_idmap
+ * from the pathwalk, awkward
+ */
ret = bch2_remap_range(c,
inode_inum(dst), pos_dst >> 9,
inode_inum(src), pos_src >> 9,
aligned_len >> 9,
- pos_dst + len, &i_sectors_delta);
+ pos_dst + len, &i_sectors_delta,
+ false);
if (ret < 0)
goto err;
@@ -922,7 +958,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
return -ENXIO;
int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9),
POS(inode->v.i_ino, U64_MAX),
inum.subvol, 0, k, ({
@@ -958,7 +994,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
return -ENXIO;
int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9),
POS(inode->v.i_ino, U64_MAX),
inum.subvol, BTREE_ITER_slots, k, ({
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 405cf08bda34..15725b4ce393 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -406,7 +406,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
sync_inodes_sb(c->vfs_sb);
up_read(&c->vfs_sb->s_umount);
}
-retry:
+
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
(const char __user *)(unsigned long)arg.src_ptr,
@@ -486,11 +486,6 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
err2:
if (arg.src_ptr)
path_put(&src_path);
-
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
err1:
return error;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index a41d0d8a2f7b..3f83f131d0e8 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -23,6 +23,7 @@
#include "journal.h"
#include "keylist.h"
#include "quota.h"
+#include "rebalance.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
@@ -38,6 +39,7 @@
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/seq_file.h>
+#include <linux/siphash.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>
@@ -89,10 +91,25 @@ int __must_check bch2_write_inode(struct bch_fs *c,
retry:
bch2_trans_begin(trans);
- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent) ?:
- (set ? set(trans, inode, &inode_u, p) : 0) ?:
- bch2_inode_write(trans, &iter, &inode_u) ?:
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+
+ ret = (set ? set(trans, inode, &inode_u, p) : 0);
+ if (ret)
+ goto err;
+
+ struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+
+ if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+ ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_inode_write(trans, &iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
@@ -101,7 +118,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
*/
if (!ret)
bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-
+err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -160,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
const subvol_inum *inum = data;
+ siphash_key_t k = { .key[0] = seed };
- return jhash(&inum->inum, sizeof(inum->inum), seed);
+ return siphash_2u64(inum->subvol, inum->inum, &k);
}
static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
@@ -190,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
.automatic_shrinking = true,
};
+static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
+ .head_offset = offsetof(struct bch_inode_info, by_inum_hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum.inum),
+ .key_len = sizeof(u64),
+ .automatic_shrinking = true,
+};
+
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
struct bch_fs *c = trans->c;
- struct rhashtable *ht = &c->vfs_inodes_table;
- subvol_inum inum = (subvol_inum) { .inum = p.offset };
+ struct rhltable *ht = &c->vfs_inodes_by_inum_table;
+ u64 inum = p.offset;
DARRAY(u32) subvols;
int ret = 0;
@@ -219,15 +244,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
struct rhash_lock_head __rcu *const *bkt;
struct rhash_head *he;
unsigned int hash;
- struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+ struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
restart:
- hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
+ hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
bkt = rht_bucket(tbl, hash);
do {
struct bch_inode_info *inode;
rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
- if (inode->ei_inum.inum == inum.inum) {
+ if (inode->ei_inum.inum == inum) {
ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
GFP_NOWAIT|__GFP_NOWARN);
if (ret) {
@@ -248,7 +273,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
/* Ensure we see any new tables. */
smp_rmb();
- tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
if (unlikely(tbl))
goto restart;
rcu_read_unlock();
@@ -327,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod
spin_unlock(&inode->v.i_lock);
if (remove) {
- int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+ int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
+ &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
+ BUG_ON(ret);
+
+ ret = rhashtable_remove_fast(&c->vfs_inodes_table,
&inode->hash, bch2_vfs_inodes_params);
BUG_ON(ret);
inode->v.i_hash.pprev = NULL;
@@ -372,6 +401,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
discard_new_inode(&inode->v);
return old;
} else {
+ int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
+ &inode->by_inum_hash,
+ bch2_vfs_inodes_by_inum_params);
+ BUG_ON(ret);
+
inode_fake_hash(&inode->v);
inode_sb_list_add(&inode->v);
@@ -465,7 +499,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
struct bch_inode_unpacked inode_u;
struct bch_subvolume subvol;
int ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_trans_put(trans);
@@ -535,8 +569,7 @@ __bch2_create(struct mnt_idmap *idmap,
inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum;
- ret = bch2_subvolume_get(trans, inum.subvol, true,
- BTREE_ITER_with_updates, &subvol) ?:
+ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_trans_commit(trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
@@ -617,7 +650,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
struct bch_subvolume subvol;
struct bch_inode_unpacked inode_u;
- ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
@@ -628,7 +661,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
goto err;
/* regular files may have hardlinks: */
- if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
+ if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
!bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
c,
"dirent points to inode that does not point back:\n %s",
@@ -1245,7 +1278,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- unsigned offset_into_extent, sectors;
bool have_extent = false;
int ret = 0;
@@ -1278,7 +1310,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_btree_iter_set_snapshot(&iter, snapshot);
- k = bch2_btree_iter_peek_upto(&iter, end);
+ k = bch2_btree_iter_peek_max(&iter, end);
ret = bkey_err(k);
if (ret)
continue;
@@ -1292,9 +1324,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
continue;
}
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
+ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&cur, c, k);
@@ -1306,7 +1337,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
bch2_cut_front(POS(k.k->p.inode,
bkey_start_offset(k.k) +
@@ -1736,7 +1767,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans,
bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors;
- inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
@@ -2200,7 +2230,8 @@ static int bch2_fs_get_tree(struct fs_context *fc)
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
- sb->s_uuid = c->sb.user_uuid;
+ super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
+ super_set_sysfs_name_uuid(sb);
sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -2345,13 +2376,16 @@ static int bch2_init_fs_context(struct fs_context *fc)
void bch2_fs_vfs_exit(struct bch_fs *c)
{
+ if (c->vfs_inodes_by_inum_table.ht.tbl)
+ rhltable_destroy(&c->vfs_inodes_by_inum_table);
if (c->vfs_inodes_table.tbl)
rhashtable_destroy(&c->vfs_inodes_table);
}
int bch2_fs_vfs_init(struct bch_fs *c)
{
- return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
+ rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
}
static struct file_system_type bcache_fs_type = {
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 59f9f7ae728d..dd2198541455 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -14,6 +14,7 @@
struct bch_inode_info {
struct inode v;
struct rhash_head hash;
+ struct rhlist_head by_inum_hash;
subvol_inum ei_inum;
struct list_head ei_vfs_inode_list;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 75c8a97a6954..3917d75f3c98 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
@@ -16,6 +17,7 @@
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
+#include "thread_with_file.h"
#include "xattr.h"
#include <linux/bsearch.h>
@@ -73,7 +75,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
{
u64 sectors = 0;
- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
0, k, ({
@@ -90,7 +92,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
{
u64 subdirs = 0;
- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
0, k, ({
@@ -107,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -170,7 +172,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
if (ret)
return ret;
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
*target = le64_to_cpu(d.v->d_inum);
*type = d.v->d_type;
bch2_trans_iter_exit(trans, &iter);
@@ -203,6 +205,36 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
return ret;
}
+/*
+ * Find any subvolume associated with a tree of snapshots
+ * We can't rely on master_subvol - it might have been deleted.
+ */
+static int find_snapshot_tree_subvol(struct btree_trans *trans,
+ u32 tree_id, u32 *subvol)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+ if (le32_to_cpu(s.v->tree) != tree_id)
+ continue;
+
+ if (s.v->subvol) {
+ *subvol = le32_to_cpu(s.v->subvol);
+ goto found;
+ }
+ }
+ ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked *lostfound,
@@ -210,6 +242,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
+ struct btree_iter lostfound_iter = { NULL };
u64 inum = 0;
unsigned d_type = 0;
int ret;
@@ -220,20 +253,24 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
if (ret)
return ret;
- subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
+ u32 subvolid;
+ ret = find_snapshot_tree_subvol(trans,
+ bch2_snapshot_tree(c, snapshot), &subvolid);
+ bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
+ bch2_snapshot_tree(c, snapshot));
+ if (ret)
+ return ret;
struct bch_subvolume subvol;
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
- false, 0, &subvol);
- bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
- le32_to_cpu(st.master_subvol), snapshot);
+ ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
+ bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
if (ret)
return ret;
if (!subvol.inode) {
struct btree_iter iter;
struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ BTREE_ID_subvolumes, POS(0, subvolid),
0, subvolume);
ret = PTR_ERR_OR_ZERO(subvol);
if (ret)
@@ -243,13 +280,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
bch2_trans_iter_exit(trans, &iter);
}
- root_inum.inum = le64_to_cpu(subvol.inode);
+ subvol_inum root_inum = {
+ .subvol = subvolid,
+ .inum = le64_to_cpu(subvol.inode)
+ };
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
- root_inum.inum, le32_to_cpu(st.master_subvol));
+ root_inum.inum, subvolid);
if (ret)
return ret;
@@ -288,11 +328,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
* XXX: we could have a nicer log message here if we had a nice way to
* walk backpointers to print a path
*/
- bch_notice(c, "creating lost+found in subvol %llu snapshot %u",
- root_inum.subvol, le32_to_cpu(st.root_snapshot));
+ struct printbuf path = PRINTBUF;
+ ret = bch2_inum_to_path(trans, root_inum, &path);
+ if (ret)
+ goto err;
+
+ bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
+ path.buf, root_inum.subvol, snapshot);
+ printbuf_exit(&path);
u64 now = bch2_current_time(c);
- struct btree_iter lostfound_iter = { NULL };
u64 cpu = raw_smp_processor_id();
bch2_inode_init_early(c, lostfound);
@@ -451,7 +496,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
continue;
struct bch_inode_unpacked child_inode;
- bch2_inode_unpack(k, &child_inode);
+ ret = bch2_inode_unpack(k, &child_inode);
+ if (ret)
+ break;
if (!inode_should_reattach(&child_inode)) {
ret = maybe_delete_dirent(trans,
@@ -482,6 +529,13 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
return ret;
}
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
@@ -490,13 +544,11 @@ static int remove_backpointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c_dirent d =
- bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0,
- dirent);
- int ret = bkey_err(d) ?:
- dirent_points_to_inode(c, d, inode) ?:
- __remove_dirent(trans, d.k->p);
+ struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
+ int ret = bkey_err(d) ?:
+ dirent_points_to_inode(c, d, inode) ?:
+ __remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -613,7 +665,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct btree_iter iter = {};
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
@@ -780,11 +832,13 @@ struct inode_walker {
struct bpos last_pos;
DARRAY(struct inode_walker_entry) inodes;
+ snapshot_id_list deletes;
};
static void inode_walker_exit(struct inode_walker *w)
{
darray_exit(&w->inodes);
+ darray_exit(&w->deletes);
}
static struct inode_walker inode_walker_init(void)
@@ -797,9 +851,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
{
struct bch_inode_unpacked u;
- BUG_ON(bch2_inode_unpack(inode, &u));
-
- return darray_push(&w->inodes, ((struct inode_walker_entry) {
+ return bch2_inode_unpack(inode, &u) ?:
+ darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
.snapshot = inode.k->p.snapshot,
}));
@@ -909,8 +962,9 @@ static int get_visible_inodes(struct btree_trans *trans,
int ret;
w->inodes.nr = 0;
+ w->deletes.nr = 0;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -918,10 +972,13 @@ static int get_visible_inodes(struct btree_trans *trans,
if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
continue;
- if (bkey_is_inode(k.k))
- add_inode(c, w, k);
+ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
+ continue;
- if (k.k->p.snapshot >= s->pos.snapshot)
+ ret = bkey_is_inode(k.k)
+ ? add_inode(c, w, k)
+ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
+ if (ret)
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -929,69 +986,16 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
-{
- if (d.v->d_type == DT_SUBVOL) {
- u32 snap;
- u64 inum;
- int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
- return !ret;
- } else {
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-}
-
/*
* Prefer to delete the first one, since that will be the one at the wrong
* offset:
* return value: 0 -> delete k1, 1 -> delete k2
*/
-static int hash_pick_winner(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c k1,
- struct bkey_s_c k2)
-{
- if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
- !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
- return 0;
-
- switch (desc.btree_id) {
- case BTREE_ID_dirents: {
- int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1));
- if (ret < 0)
- return ret;
- if (!ret)
- return 0;
-
- ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2));
- if (ret < 0)
- return ret;
- if (!ret)
- return 1;
- return 2;
- }
- default:
- return 0;
- }
-}
-
-static int fsck_update_backpointers(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_i *new)
+int bch2_fsck_update_backpointers(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_i *new)
{
if (new->k.type != KEY_TYPE_dirent)
return 0;
@@ -1019,160 +1023,6 @@ static int fsck_update_backpointers(struct btree_trans *trans,
return ret;
}
-static int fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old)
-{
- struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_dirent_init(&new->k_i);
- dirent_copy_target(new, old);
- new->k.p = old.k->p;
-
- for (unsigned i = 0; i < 1000; i++) {
- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
-
- if (u64s > U8_MAX)
- return -EINVAL;
-
- new->k.u64s = u64s;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- (subvol_inum) { 0, old.k->p.inode },
- old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (!bch2_err_matches(ret, EEXIST))
- break;
- }
-
- if (ret)
- return ret;
-
- return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
-}
-
-static int hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
- struct printbuf buf = PRINTBUF;
- struct bkey_s_c k;
- u64 hash;
- int ret = 0;
-
- if (hash_k.k->type != desc.key_type)
- return 0;
-
- hash = desc.hash_bkey(hash_info, hash_k);
-
- if (likely(hash == hash_k.k->p.offset))
- return 0;
-
- if (hash_k.k->p.offset < hash)
- goto bad_hash;
-
- for_each_btree_key_norestart(trans, iter, desc.btree_id,
- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots, k, ret) {
- if (bkey_eq(k.k->p, hash_k.k->p))
- break;
-
- if (k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k))
- goto duplicate_entries;
-
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
- goto bad_hash;
- }
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-bad_hash:
- if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
- bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info,
- (subvol_inum) { 0, hash_k.k->p.inode },
- hash_k.k->p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(k);
- if (ret)
- goto out;
- if (k.k)
- goto duplicate_entries;
-
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- fsck_update_backpointers(trans, s, desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-fsck_err:
- goto out;
-duplicate_entries:
- ret = hash_pick_winner(trans, desc, hash_info, hash_k, k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0);
- break;
- case 2:
- ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
- bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
-}
-
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
@@ -1260,7 +1110,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans,
goto err;
BUG();
found_root:
- BUG_ON(bch2_inode_unpack(k, root));
+ ret = bch2_inode_unpack(k, root);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1291,7 +1141,9 @@ static int check_inode(struct btree_trans *trans,
if (!bkey_is_inode(k.k))
return 0;
- BUG_ON(bch2_inode_unpack(k, &u));
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ goto err;
if (snapshot_root->bi_inum != u.bi_inum) {
ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
@@ -1302,7 +1154,7 @@ static int check_inode(struct btree_trans *trans,
if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
trans, inode_snapshot_mismatch,
- "inodes in different snapshots don't match")) {
+ "inode hash info in different snapshots don't match")) {
u.bi_hash_seed = snapshot_root->bi_hash_seed;
SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
do_update = true;
@@ -1392,7 +1244,7 @@ static int check_inode(struct btree_trans *trans,
if (fsck_err_on(!ret,
trans, inode_unlinked_and_not_open,
- "inode %llu%u unlinked and not open",
+ "inode %llu:%u unlinked and not open",
u.bi_inum, u.bi_snapshot)) {
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck deleting inode");
@@ -1415,7 +1267,7 @@ static int check_inode(struct btree_trans *trans,
if (u.bi_subvol) {
struct bch_subvolume s;
- ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1441,6 +1293,17 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
}
+
+ if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
+ trans, inode_journal_seq_in_future,
+ "inode journal seq in future (currently at %llu)\n%s",
+ journal_cur_seq(&c->journal),
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_journal_seq = journal_cur_seq(&c->journal);
+ do_update = true;
+ }
do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u);
@@ -1502,7 +1365,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
break;
struct bch_inode_unpacked parent_inode;
- bch2_inode_unpack(k, &parent_inode);
+ ret = bch2_inode_unpack(k, &parent_inode);
+ if (ret)
+ break;
if (!inode_should_reattach(&parent_inode))
break;
@@ -1525,7 +1390,9 @@ static int check_unreachable_inode(struct btree_trans *trans,
return 0;
struct bch_inode_unpacked inode;
- BUG_ON(bch2_inode_unpack(k, &inode));
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ return ret;
if (!inode_should_reattach(&inode))
return 0;
@@ -1649,7 +1516,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
w->last_pos.inode, i->snapshot, i->count, count2);
- return -BCH_ERR_internal_fsck_err;
+ i->count = count2;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
@@ -1753,7 +1620,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter1, btree, pos1,
BTREE_ITER_all_snapshots|
BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
+ k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
goto err;
@@ -1778,7 +1645,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
while (1) {
bch2_btree_iter_advance(&iter2);
- k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
+ k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
ret = bkey_err(k2);
if (ret)
goto err;
@@ -2156,7 +2023,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target);
}
- if (bch2_inode_should_have_bp(target) &&
+ if (bch2_inode_should_have_single_bp(target) &&
!fsck_err(trans, inode_wrong_backpointer,
"dirent points to inode that does not point back:\n %s",
(bch2_bkey_val_to_text(&buf, c, d.s_c),
@@ -2480,7 +2347,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
goto err;
if (ret) {
@@ -2519,6 +2386,30 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
+
+ darray_for_each(target->deletes, i)
+ if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
+ trans, dirent_to_overwritten_inode,
+ "dirent points to inode overwritten in snapshot %u:\n%s",
+ *i,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ struct btree_iter delete_iter;
+ bch2_trans_iter_init(trans, &delete_iter,
+ BTREE_ID_dirents,
+ SPOS(k.k->p.inode, k.k->p.offset, *i),
+ BTREE_ITER_intent);
+ ret = bch2_btree_iter_traverse(&delete_iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ hash_info,
+ &delete_iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &delete_iter);
+ if (ret)
+ goto err;
+
+ }
}
ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
@@ -2594,7 +2485,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k);
+ ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
bch_err_fn(c, ret);
return ret;
}
@@ -2774,6 +2665,48 @@ struct pathbuf_entry {
typedef DARRAY(struct pathbuf_entry) pathbuf;
+static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
+ u32 new_depth)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, p->inum, p->snapshot), 0);
+
+ struct bch_inode_unpacked inode;
+ int ret = bkey_err(k) ?:
+ !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto err;
+
+ if (inode.bi_depth != new_depth) {
+ inode.bi_depth = new_depth;
+ ret = __bch2_fsck_write_inode(trans, &inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
+{
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ darray_for_each_reverse(*path, i) {
+ ret = nested_lockrestart_do(trans,
+ bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
+ bch_err_fn(trans->c, ret);
+ if (ret)
+ break;
+
+ new_bi_depth++;
+ }
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
darray_for_each(*p, i)
@@ -2783,21 +2716,21 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
+static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
struct btree_iter inode_iter = {};
- struct bch_inode_unpacked inode;
+ pathbuf path = {};
struct printbuf buf = PRINTBUF;
u32 snapshot = inode_k.k->p.snapshot;
+ bool redo_bi_depth = false;
+ u32 min_bi_depth = U32_MAX;
int ret = 0;
- p->nr = 0;
-
- BUG_ON(bch2_inode_unpack(inode_k, &inode));
-
- if (!S_ISDIR(inode.bi_mode))
- return 0;
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(inode_k, &inode);
+ if (ret)
+ return ret;
while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
@@ -2807,7 +2740,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
- break;
+ goto out;
if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
bch2_trans_iter_exit(trans, &dirent_iter);
@@ -2822,7 +2755,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = darray_push(p, ((struct pathbuf_entry) {
+ ret = darray_push(&path, ((struct pathbuf_entry) {
.inum = inode.bi_inum,
.snapshot = snapshot,
}));
@@ -2834,22 +2767,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
bch2_trans_iter_exit(trans, &inode_iter);
inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, inode.bi_dir, snapshot), 0);
+
+ struct bch_inode_unpacked parent_inode;
ret = bkey_err(inode_k) ?:
!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(inode_k, &inode);
+ : bch2_inode_unpack(inode_k, &parent_inode);
if (ret) {
/* Should have been caught in dirents pass */
bch_err_msg(c, ret, "error looking up parent directory");
- break;
+ goto out;
}
+ min_bi_depth = parent_inode.bi_depth;
+
+ if (parent_inode.bi_depth < inode.bi_depth &&
+ min_bi_depth < U16_MAX)
+ break;
+
+ inode = parent_inode;
snapshot = inode_k.k->p.snapshot;
+ redo_bi_depth = true;
- if (path_is_dup(p, inode.bi_inum, snapshot)) {
+ if (path_is_dup(&path, inode.bi_inum, snapshot)) {
/* XXX print path */
bch_err(c, "directory structure loop");
- darray_for_each(*p, i)
+ darray_for_each(path, i)
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode.bi_inum, snapshot);
@@ -2862,12 +2805,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
ret = reattach_inode(trans, &inode);
bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
}
- break;
+
+ goto out;
}
}
+
+ if (inode.bi_subvol)
+ min_bi_depth = 0;
+
+ if (redo_bi_depth)
+ ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
+ darray_exit(&path);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -2879,24 +2830,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- pathbuf path = { 0, };
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_intent|
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (!bkey_is_inode(k.k))
+ if (!S_ISDIR(bkey_inode_mode(k)))
continue;
if (bch2_inode_flags(k) & BCH_INODE_unlinked)
continue;
- check_path(trans, &path, k);
+ check_path_loop(trans, k);
})));
- darray_exit(&path);
bch_err_fn(c, ret);
return ret;
@@ -2994,7 +2941,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
/* Should never fail, checked by bch2_inode_invalid: */
struct bch_inode_unpacked u;
- BUG_ON(bch2_inode_unpack(k, &u));
+ _ret3 = bch2_inode_unpack(k, &u);
+ if (_ret3)
+ break;
/*
* Backpointer and directory structure checks are sufficient for
@@ -3072,7 +3021,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
if (!bkey_is_inode(k.k))
return 0;
- BUG_ON(bch2_inode_unpack(k, &u));
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return ret;
if (S_ISDIR(u.bi_mode))
return 0;
@@ -3194,3 +3145,223 @@ int bch2_fix_reflink_p(struct bch_fs *c)
bch_err_fn(c, ret);
return ret;
}
+
+#ifndef NO_BCACHEFS_CHARDEV
+
+struct fsck_thread {
+ struct thread_with_stdio thr;
+ struct bch_fs *c;
+ struct bch_opts opts;
+};
+
+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
+{
+ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
+ kfree(thr);
+}
+
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
+{
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
+
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
+ return ret;
+}
+
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_offline_thread_fn,
+};
+
+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
+{
+ struct bch_ioctl_fsck_offline arg;
+ struct fsck_thread *thr = NULL;
+ darray_str(devs) = {};
+ long ret = 0;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
+
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
+
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+ opt_set(thr->opts, read_only, 1);
+ opt_set(thr->opts, ratelimit_errors, 0);
+
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = __bch2_run_thread_with_stdio(&thr->thr);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
+ return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
+}
+
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
+{
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ c->stdio_filter = current;
+ c->stdio = &thr->thr.stdio;
+
+ /*
+ * XXX: can we figure out a way to do this without mucking with c->opts?
+ */
+ unsigned old_fix_errors = c->opts.fix_errors;
+ if (opt_defined(thr->opts, fix_errors))
+ c->opts.fix_errors = thr->opts.fix_errors;
+ else
+ c->opts.fix_errors = FSCK_FIX_ask;
+
+ c->opts.fsck = true;
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+ int ret = bch2_run_online_recovery_passes(c);
+
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ bch_err_fn(c, ret);
+
+ c->stdio = NULL;
+ c->stdio_filter = NULL;
+ c->opts.fix_errors = old_fix_errors;
+
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ return ret;
+}
+
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_online_thread_fn,
+};
+
+long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
+{
+ struct fsck_thread *thr = NULL;
+ long ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bch2_ro_ref_tryget(c))
+ return -EROFS;
+
+ if (down_trylock(&c->online_fsck_mutex)) {
+ bch2_ro_ref_put(c);
+ return -EAGAIN;
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->c = c;
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
+err:
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ }
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index 1cca31011530..574948278cd4 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -2,6 +2,14 @@
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H
+#include "str_hash.h"
+
+int bch2_fsck_update_backpointers(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc,
+ struct bch_hash_info *,
+ struct bkey_i *);
+
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
int bch2_check_indirect_extents(struct bch_fs *);
@@ -14,4 +22,7 @@ int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
+long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
+
#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 039cb7a22244..04ec05206f8c 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -14,6 +14,7 @@
#include "extent_update.h"
#include "fs.h"
#include "inode.h"
+#include "opts.h"
#include "str_hash.h"
#include "snapshot.h"
#include "subvolume.h"
@@ -47,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end,
u8 *p;
if (in >= end)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
if (!*in)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
/*
* position of highest set bit indicates number of bytes:
@@ -60,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end,
bytes = byte_table[shift - 1];
if (in + bytes > end)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
p = (u8 *) be + 16 - bytes;
memcpy(p, in, bytes);
@@ -176,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
return ret; \
\
if (field_bits > sizeof(unpacked->_name) * 8) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
\
unpacked->_name = field[1]; \
in += ret;
@@ -217,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
\
unpacked->_name = v[0]; \
if (v[1] || v[0] != unpacked->_name) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
fieldnr++;
BCH_INODE_FIELDS_v2()
@@ -268,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
\
unpacked->_name = v[0]; \
if (v[1] || v[0] != unpacked->_name) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
fieldnr++;
BCH_INODE_FIELDS_v3()
@@ -428,7 +429,7 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
}
static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bch_inode_unpacked unpacked;
int ret = 0;
@@ -468,7 +469,7 @@ static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
}
int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
@@ -478,13 +479,13 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
"invalid str hash type (%llu >= %u)",
INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_validate(c, k, flags);
+ ret = __bch2_inode_validate(c, k, from);
fsck_err:
return ret;
}
int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
int ret = 0;
@@ -494,13 +495,13 @@ int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
"invalid str hash type (%llu >= %u)",
INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_validate(c, k, flags);
+ ret = __bch2_inode_validate(c, k, from);
fsck_err:
return ret;
}
int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
int ret = 0;
@@ -518,7 +519,7 @@ int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
"invalid str hash type (%llu >= %u)",
INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_validate(c, k, flags);
+ ret = __bch2_inode_validate(c, k, from);
fsck_err:
return ret;
}
@@ -617,7 +618,7 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter
struct bkey_s_c k;
int ret = 0;
- for_each_btree_key_upto_norestart(trans, *iter, btree,
+ for_each_btree_key_max_norestart(trans, *iter, btree,
bpos_successor(pos),
SPOS(pos.inode, pos.offset, U32_MAX),
flags|BTREE_ITER_all_snapshots, k, ret)
@@ -652,7 +653,7 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
struct bkey_s_c k;
int ret = 0;
- for_each_btree_key_upto_norestart(trans, iter,
+ for_each_btree_key_max_norestart(trans, iter,
BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
BTREE_ITER_all_snapshots|
BTREE_ITER_with_updates, k, ret)
@@ -779,7 +780,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -798,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
+int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
+ c, inode_alloc_cursor_inode_bad,
+ "k.p.inode bad");
+fsck_err:
+ return ret;
+}
+
+void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
+
+ prt_printf(out, "idx %llu generation %llu",
+ le64_to_cpu(i.v->idx),
+ le64_to_cpu(i.v->gen));
+}
+
void bch2_inode_init_early(struct bch_fs *c,
struct bch_inode_unpacked *inode_u)
{
@@ -858,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+static struct bkey_i_inode_alloc_cursor *
+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- u64 min, max, start, pos, *hint;
- int ret = 0;
- unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
- if (c->opts.shard_inode_numbers) {
- bits -= c->inode_shard_bits;
+ u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
- min = (cpu << bits);
- max = (cpu << bits) | ~(ULLONG_MAX << bits);
+ cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
- min = max_t(u64, min, BLOCKDEV_INODE_MAX);
- hint = c->unused_inode_hints + cpu;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_logged_ops,
+ POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct bkey_i_inode_alloc_cursor *cursor =
+ k.k->type == KEY_TYPE_inode_alloc_cursor
+ ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
+ : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
+ ret = PTR_ERR_OR_ZERO(cursor);
+ if (ret)
+ goto err;
+
+ if (c->opts.inodes_32bit) {
+ *min = BLOCKDEV_INODE_MAX;
+ *max = INT_MAX;
} else {
- min = BLOCKDEV_INODE_MAX;
- max = ~(ULLONG_MAX << bits);
- hint = c->unused_inode_hints;
+ cursor->v.bits = c->opts.shard_inode_numbers_bits;
+
+ unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
+
+ *min = max(cpu << bits, (u64) INT_MAX + 1);
+ *max = (cpu << bits) | ~(ULLONG_MAX << bits);
}
- start = READ_ONCE(*hint);
+ if (le64_to_cpu(cursor->v.idx) < *min)
+ cursor->v.idx = cpu_to_le64(*min);
- if (start >= max || start < min)
- start = min;
+ if (le64_to_cpu(cursor->v.idx) >= *max) {
+ cursor->v.idx = cpu_to_le64(*min);
+ le32_add_cpu(&cursor->v.gen, 1);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret ? ERR_PTR(ret) : cursor;
+}
+
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
+{
+ u64 min, max;
+ struct bkey_i_inode_alloc_cursor *cursor =
+ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
+ int ret = PTR_ERR_OR_ZERO(cursor);
+ if (ret)
+ return ret;
+
+ u64 start = le64_to_cpu(cursor->v.idx);
+ u64 pos = start;
- pos = start;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
BTREE_ITER_all_snapshots|
BTREE_ITER_intent);
+ struct bkey_s_c k;
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -924,6 +982,7 @@ int bch2_inode_create(struct btree_trans *trans,
/* Retry from start */
pos = start = min;
bch2_btree_iter_set_pos(iter, POS(0, pos));
+ le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
@@ -934,9 +993,9 @@ int bch2_inode_create(struct btree_trans *trans,
return ret;
}
- *hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
- inode_u->bi_generation = bkey_generation(k);
+ inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
+ cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
return 0;
}
@@ -966,7 +1025,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bch2_btree_iter_set_snapshot(&iter, snapshot);
- k = bch2_btree_iter_peek_upto(&iter, end);
+ k = bch2_btree_iter_peek_max(&iter, end);
ret = bkey_err(k);
if (ret)
goto err;
@@ -998,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_i_inode_generation delete;
- struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
u32 snapshot;
int ret;
@@ -1039,13 +1096,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
goto err;
}
- bch2_inode_unpack(k, &inode_u);
-
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter.pos;
- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
err:
@@ -1141,12 +1192,17 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
struct bch_inode_unpacked *inode)
{
-#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name);
+#define x(_name, _bits) \
+ if ((inode)->bi_##_name) { \
+ opts->_name = inode->bi_##_name - 1; \
+ opts->_name##_from_inode = true; \
+ } else { \
+ opts->_name = c->opts._name; \
+ }
BCH_INODE_OPTS()
#undef x
- if (opts->nocow)
- opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
+ bch2_io_opts_fixups(opts);
}
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
@@ -1380,7 +1436,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
if (ret > 0) {
- bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+ bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
+ k.k->p.offset, k.k->p.snapshot);
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
/*
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index eab82b5eb897..d2e134528f0e 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -7,15 +7,14 @@
#include "opts.h"
#include "snapshot.h"
-enum bch_validate_flags;
extern const char * const bch2_inode_opts[];
int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
@@ -60,7 +59,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
}
int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
@@ -69,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk
.min_val_size = 8, \
})
+int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
+ .key_validate = bch2_inode_alloc_cursor_validate, \
+ .val_to_text = bch2_inode_alloc_cursor_to_text, \
+ .min_val_size = 16, \
+})
+
#if 0
typedef struct {
u64 lo;
@@ -220,6 +229,20 @@ static inline u32 bch2_inode_flags(struct bkey_s_c k)
}
}
+static inline unsigned bkey_inode_mode(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode);
+ case KEY_TYPE_inode_v2:
+ return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode);
+ case KEY_TYPE_inode_v3:
+ return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v);
+ default:
+ return 0;
+ }
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
@@ -249,7 +272,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode)
+static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
{
bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
@@ -262,6 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+static inline struct bch_extent_rebalance
+bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
+{
+ struct bch_io_opts io_opts;
+ bch2_inode_opts_get(&io_opts, c, inode);
+ return io_opts_to_rebalance_opts(&io_opts);
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 7928d0c6954f..b99a5bf1a75e 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -101,7 +101,9 @@ struct bch_inode_generation {
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
+ x(bi_nocow, 8) \
+ x(bi_depth, 32) \
+ x(bi_inodes_32bit, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -114,7 +116,8 @@ struct bch_inode_generation {
x(foreground_target, 16) \
x(background_target, 16) \
x(erasure_code, 16) \
- x(nocow, 8)
+ x(nocow, 8) \
+ x(inodes_32bit, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -164,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START,
struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+struct bch_inode_alloc_cursor {
+ struct bch_val v;
+ __u8 bits;
+ __u8 pad;
+ __le32 gen;
+ __le64 idx;
+};
+
#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index f283051758d6..5353979117b0 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans,
err:
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
- if (should_print_err(ret))
- bch_err_inum_offset_ratelimited(c,
- inum.inum,
- iter->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ if (should_print_err(ret)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
+ prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
err_noprint:
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
@@ -164,9 +166,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_snapshot(iter, snapshot);
/*
- * peek_upto() doesn't have ideal semantics for extents:
+ * peek_max() doesn't have ideal semantics for extents:
*/
- k = bch2_btree_iter_peek_upto(iter, end_pos);
+ k = bch2_btree_iter_peek_max(iter, end_pos);
if (!k.k)
break;
@@ -426,8 +428,8 @@ case LOGGED_OP_FINSERT_shift_extents:
bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
k = insert
- ? bch2_btree_iter_peek_prev(&iter)
- : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+ ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
+ : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
if ((ret = bkey_err(k)))
goto btree_err;
@@ -461,7 +463,7 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index b3b934a87c6d..34a3569d085a 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -21,6 +21,7 @@
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
+#include "reflink.h"
#include "subvolume.h"
#include "trace.h"
@@ -231,11 +232,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
update_opts.target = opts.foreground_target;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i = 0;
+ unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev))
- update_opts.rewrite_ptrs |= BIT(i);
- i++;
+ update_opts.rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
}
}
@@ -321,6 +322,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
/* Read */
+static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ return bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { rbio->subvol, read_pos.inode },
+ read_pos.offset << 9);
+}
+
+static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
+}
+
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
@@ -499,6 +514,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
}
}
+static void bch2_read_io_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bio *bio = &rbio->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
+
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+}
+
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
struct bch_read_bio *rbio)
{
@@ -562,6 +600,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
__bch2_rbio_narrow_crcs(trans, rbio));
}
+static void bch2_read_csum_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bio *src = &rbio->bio;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decompress_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decompression error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decrypt_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decrypt error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
@@ -668,33 +773,13 @@ static void __bch2_read_endio(struct work_struct *work)
goto out;
}
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_str(&buf, "data ");
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca) {
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "data %s", buf.buf);
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- }
- printbuf_exit(&buf);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decompression error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decrypt_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decrypt error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
}
@@ -715,16 +800,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bio->bi_status) {
- if (ca) {
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset,
- "data read error: %s",
- bch2_blk_status_to_str(bio->bi_status));
- bch2_io_error(ca, BCH_MEMBER_ERROR_read);
- }
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ if (unlikely(bio->bi_status)) {
+ bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
return;
}
@@ -750,45 +827,6 @@ static void bch2_read_endio(struct bio *bio)
bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
-int __bch2_read_indirect_extent(struct btree_trans *trans,
- unsigned *offset_into_extent,
- struct bkey_buf *orig_k)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 reflink_offset;
- int ret;
-
- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
- *offset_into_extent;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
- POS(0, reflink_offset), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_reflink_v &&
- k.k->type != KEY_TYPE_indirect_inline_data) {
- bch_err_inum_offset_ratelimited(trans->c,
- orig_k->k->k.p.inode,
- orig_k->k->k.p.offset << 9,
- "%llu len %u points to nonexistent indirect extent %llu",
- orig_k->k->k.p.offset,
- orig_k->k->k.size,
- reflink_offset);
- bch2_inconsistent_error(trans->c);
- ret = -BCH_ERR_missing_indirect_extent;
- goto err;
- }
-
- *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
struct bch_dev *ca,
struct bkey_s_c k,
@@ -868,15 +906,24 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
if (!pick_ret)
goto hole;
- if (pick_ret < 0) {
+ if (unlikely(pick_ret < 0)) {
struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
bch2_bkey_val_to_text(&buf, c, k);
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode, read_pos.offset << 9,
- "no device to read from: %s\n %s",
- bch2_err_str(pick_ret),
- buf.buf);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
goto err;
}
@@ -1062,11 +1109,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
}
if (!rbio->pick.idx) {
- if (!rbio->have_ioref) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode,
- read_pos.offset << 9,
- "no device to read from");
+ if (unlikely(!rbio->have_ioref)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
+ prt_printf(&buf, "no device to read from:\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1164,7 +1215,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
BTREE_ITER_slots);
while (1) {
- unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1184,9 +1234,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
if (ret)
goto err;
- offset_into_extent = iter.pos.offset -
+ s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
+ unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&sk, c, k);
@@ -1201,9 +1251,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
*/
- sectors = min(sectors, k.k->size - offset_into_extent);
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
if (bvec_iter.bi_size == bytes)
@@ -1229,16 +1279,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
}
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
if (ret) {
- bch_err_inum_offset_ratelimited(c, inum.inum,
- bvec_iter.bi_sector << 9,
- "read error %i from btree lookup", ret);
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
}
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
}
void bch2_fs_io_read_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index d9c18bb7d403..a82e8a94ccb6 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H
#include "bkey_buf.h"
+#include "reflink.h"
struct bch_read_bio {
struct bch_fs *c;
@@ -79,19 +80,32 @@ struct bch_devs_mask;
struct cache_promote_op;
struct extent_ptr_decoded;
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
- struct bkey_buf *);
-
static inline int bch2_read_indirect_extent(struct btree_trans *trans,
enum btree_id *data_btree,
- unsigned *offset_into_extent,
- struct bkey_buf *k)
+ s64 *offset_into_extent,
+ struct bkey_buf *extent)
{
- if (k->k->k.type != KEY_TYPE_reflink_p)
+ if (extent->k->k.type != KEY_TYPE_reflink_p)
return 0;
*data_btree = BTREE_ID_reflink;
- return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
+ offset_into_extent,
+ bkey_i_to_s_c_reflink_p(extent->k),
+ true, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_missing_indirect_extent;
+ }
+
+ bch2_bkey_buf_reassemble(extent, trans->c, k);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
}
enum bch_read_flags {
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 96720adcfee0..3e71860f66b9 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_upto_continue_norestart(iter,
+ for_each_btree_key_max_continue_norestart(iter,
new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
@@ -216,6 +216,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
+ BTREE_ITER_intent|
BTREE_ITER_cached);
int ret = bkey_err(k);
if (unlikely(ret))
@@ -369,7 +370,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -395,6 +396,21 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
+static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
+ u64 offset)
+{
+ bch2_inum_offset_err_msg(op->c, out,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ prt_printf(out, "write error%s: ",
+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+}
+
+static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
+{
+ __bch2_write_op_error(out, op, op->pos.offset);
+}
+
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
const struct bkey_i *k,
@@ -531,14 +547,14 @@ static void __bch2_write_index(struct bch_write_op *op)
op->written += sectors_start - keylist_sectors(keys);
- if (ret && !bch2_err_matches(ret, EROFS)) {
+ if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch_err_inum_offset_ratelimited(c,
- insert->k.p.inode, insert->k.p.offset << 9,
- "%s write error while doing btree update: %s",
- op->flags & BCH_WRITE_MOVE ? "move" : "user",
- bch2_err_str(ret));
+ struct printbuf buf = PRINTBUF;
+ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
if (ret)
@@ -621,9 +637,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
while (1) {
spin_lock_irq(&wp->writes_lock);
- op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
- if (op)
- list_del(&op->wp_list);
+ op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
wp_update_state(wp, op != NULL);
spin_unlock_irq(&wp->writes_lock);
@@ -1080,11 +1094,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
*_dst = dst;
return more;
csum_err:
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)",
- op->flags & BCH_WRITE_MOVE ? "move" : "user");
+ {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@@ -1165,7 +1182,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
struct btree_trans *trans = bch2_trans_get(c);
for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
@@ -1175,11 +1192,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
if (ret && !bch2_err_matches(ret, EROFS)) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch_err_inum_offset_ratelimited(c,
- insert->k.p.inode, insert->k.p.offset << 9,
- "%s write error while doing btree update: %s",
- op->flags & BCH_WRITE_MOVE ? "move" : "user",
- bch2_err_str(ret));
+ struct printbuf buf = PRINTBUF;
+ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
if (ret) {
@@ -1339,17 +1356,19 @@ static void bch2_nocow_write(struct bch_write_op *op)
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
if (ret) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode, op->pos.offset << 9,
- "%s: btree lookup error %s", __func__, bch2_err_str(ret));
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
op->error = ret;
op->flags |= BCH_WRITE_SUBMITTED;
}
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
/* fallback to cow write path? */
if (!(op->flags & BCH_WRITE_SUBMITTED)) {
closure_sync(&op->cl);
@@ -1462,14 +1481,14 @@ static void __bch2_write(struct bch_write_op *op)
if (ret <= 0) {
op->flags |= BCH_WRITE_SUBMITTED;
- if (ret < 0) {
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s(): %s error: %s", __func__,
- op->flags & BCH_WRITE_MOVE ? "move" : "user",
- bch2_err_str(ret));
+ if (unlikely(ret < 0)) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
op->error = ret;
break;
}
@@ -1595,12 +1614,11 @@ CLOSURE_CALLBACK(bch2_write)
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
- if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s write error: misaligned write",
- op->flags & BCH_WRITE_MOVE ? "move" : "user");
+ if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "misaligned write");
+ printbuf_exit(&buf);
op->error = -EIO;
goto err;
}
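
The io_read.c and io_write.c hunks above all replace bch_err_inum_offset_ratelimited() with the same printbuf-based pattern; a minimal sketch of that shape, illustrative only and using just helpers that appear in this patch, looks like:

static void example_report_write_error(struct bch_write_op *op, int ret)
{
	struct bch_fs *c = op->c;
	struct printbuf buf = PRINTBUF;

	/* prefix: "inum:offset write error[(internal move)]: " */
	bch2_write_op_error(&buf, op);
	prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
	/* emit the whole message as one ratelimited line */
	bch_err_ratelimited(c, "%s", buf.buf);
	/* printbufs allocate as they grow; always pair with printbuf_exit() */
	printbuf_exit(&buf);
}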
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 2dc0d60c1745..2cd20114b74b 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -217,6 +217,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq)
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
bch2_journal_do_writes(j);
+
+ /*
+ * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
+ * open journal entry
+ */
+ wake_up(&j->wait);
}
/*
@@ -251,6 +257,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
if (!__journal_entry_is_open(old))
return;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
+ old.cur_entry_offset = j->cur_entry_offset_if_blocked;
+
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
@@ -373,6 +382,10 @@ static int journal_entry_open(struct journal *j)
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
return JOURNAL_ERR_max_in_flight;
+ if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX,
+ c, "cannot start: journal seq overflow"))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
BUG_ON(!j->cur_entry_sectors);
buf->expires =
@@ -664,7 +677,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
* @seq: seq to flush
* @parent: closure object to wait with
* Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
- * -EIO if @seq will never be flushed
+ * -BCH_ERR_journal_flush_err if @seq will never be flushed
*
* Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
* necessary
@@ -687,7 +700,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
- ret = -EIO;
+ ret = -BCH_ERR_journal_flush_err;
goto out;
}
@@ -794,10 +807,11 @@ int bch2_journal_flush(struct journal *j)
}
/*
- * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the
+ * range [start, end)
* @seq
*/
-bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 unwritten_seq;
@@ -806,15 +820,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
return false;
- if (seq <= c->journal.flushed_seq_ondisk)
+ if (c->journal.flushed_seq_ondisk >= start)
return false;
spin_lock(&j->lock);
- if (seq <= c->journal.flushed_seq_ondisk)
+ if (c->journal.flushed_seq_ondisk >= start)
goto out;
for (unwritten_seq = journal_last_unwritten_seq(j);
- unwritten_seq < seq;
+ unwritten_seq < end;
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
@@ -831,19 +845,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
return ret;
}
-int bch2_journal_meta(struct journal *j)
+static int __bch2_journal_meta(struct journal *j)
{
- struct journal_buf *buf;
- struct journal_res res;
- int ret;
-
- memset(&res, 0, sizeof(res));
-
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ struct journal_res res = {};
+ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
- buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
if (!buf->flush_time) {
@@ -856,27 +865,70 @@ int bch2_journal_meta(struct journal *j)
return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
}
+int bch2_journal_meta(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
+ return -EROFS;
+
+ int ret = __bch2_journal_meta(j);
+ bch2_write_ref_put(c, BCH_WRITE_REF_journal);
+ return ret;
+}
+
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
{
spin_lock(&j->lock);
- j->blocked--;
+ if (!--j->blocked &&
+ j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
+ union journal_res_state old, new;
+
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ new.v = old.v;
+ new.cur_entry_offset = j->cur_entry_offset_if_blocked;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
+ }
spin_unlock(&j->lock);
journal_wake(j);
}
+static void __bch2_journal_block(struct journal *j)
+{
+ if (!j->blocked++) {
+ union journal_res_state old, new;
+
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ j->cur_entry_offset_if_blocked = old.cur_entry_offset;
+
+ if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
+ break;
+
+ new.v = old.v;
+ new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
+
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ }
+}
+
void bch2_journal_block(struct journal *j)
{
spin_lock(&j->lock);
- j->blocked++;
+ __bch2_journal_block(j);
spin_unlock(&j->lock);
journal_quiesce(j);
}
-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
+ u64 max_seq, bool *blocked)
{
struct journal_buf *ret = NULL;
@@ -893,13 +945,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
struct journal_buf *buf = j->buf + idx;
if (buf->need_flush_to_write_buffer) {
- if (seq == journal_cur_seq(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-
union journal_res_state s;
s.v = atomic64_read_acquire(&j->reservations.counter);
- ret = journal_state_count(s, idx)
+ unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
+
+ if (open && !*blocked) {
+ __bch2_journal_block(j);
+ *blocked = true;
+ }
+
+ ret = journal_state_count(s, idx) > open
? ERR_PTR(-EAGAIN)
: buf;
break;
@@ -912,11 +968,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
return ret;
}
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
+ u64 max_seq, bool *blocked)
{
struct journal_buf *ret;
+ *blocked = false;
+
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
+ max_seq, blocked)) != ERR_PTR(-EAGAIN));
+ if (IS_ERR_OR_NULL(ret) && *blocked)
+ bch2_journal_unblock(j);
- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
return ret;
}
@@ -945,19 +1007,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
}
for (nr_got = 0; nr_got < nr_want; nr_got++) {
- if (new_fs) {
- bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
- if (bu[nr_got] < 0) {
- ret = -BCH_ERR_ENOSPC_bucket_alloc;
- break;
- }
- } else {
- ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
- BCH_DATA_journal, cl);
- ret = PTR_ERR_OR_ZERO(ob[nr_got]);
- if (ret)
- break;
+ enum bch_watermark watermark = new_fs
+ ? BCH_WATERMARK_btree
+ : BCH_WATERMARK_normal;
+ ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
+ BCH_DATA_journal, cl);
+ ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+ if (ret)
+ break;
+
+ if (!new_fs) {
ret = bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
ob[nr_got]->bucket, BCH_DATA_journal,
@@ -967,9 +1027,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bch_err_msg(c, ret, "marking new journal buckets");
break;
}
-
- bu[nr_got] = ob[nr_got]->bucket;
}
+
+ bu[nr_got] = ob[nr_got]->bucket;
}
if (!nr_got)
@@ -1009,8 +1069,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (ret)
goto err_unblock;
- if (!new_fs)
- bch2_write_super(c);
+ bch2_write_super(c);
/* Commit: */
if (c)
@@ -1044,9 +1103,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bu[i], BCH_DATA_free, 0,
BTREE_TRIGGER_transactional));
err_free:
- if (!new_fs)
- for (i = 0; i < nr_got; i++)
- bch2_open_bucket_put(c, ob[i]);
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
kfree(new_bucket_seq);
kfree(new_buckets);
@@ -1193,7 +1251,7 @@ void bch2_fs_journal_stop(struct journal *j)
* Always write a new journal entry, to make sure the clock hands are up
* to date (and match the superblock)
*/
- bch2_journal_meta(j);
+ __bch2_journal_meta(j);
journal_quiesce(j);
cancel_delayed_work_sync(&j->write_work);
@@ -1217,6 +1275,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
bool had_entries = false;
u64 last_seq = cur_seq, nr, seq;
+ if (cur_seq >= JOURNAL_SEQ_MAX) {
+ bch_err(c, "cannot start: journal seq overflow");
+ return -EINVAL;
+ }
+
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
@@ -1474,6 +1537,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
case JOURNAL_ENTRY_CLOSED_VAL:
prt_printf(out, "closed\n");
break;
+ case JOURNAL_ENTRY_BLOCKED_VAL:
+ prt_printf(out, "blocked\n");
+ break;
default:
prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
break;
@@ -1499,6 +1565,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
printbuf_indent_sub(out, 2);
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->mi.durability)
+ continue;
+
struct journal_device *ja = &ca->journal;
if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
@@ -1508,6 +1577,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
continue;
prt_printf(out, "dev %u:\n", ca->dev_idx);
+ prt_printf(out, "durability %u:\n", ca->mi.durability);
printbuf_indent_add(out, 2);
prt_printf(out, "nr\t%u\n", ja->nr);
prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
@@ -1519,6 +1589,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
printbuf_indent_sub(out, 2);
}
+ prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
+
rcu_read_unlock();
--out->atomic;
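
The journal.c hunks above add a JOURNAL_ENTRY_BLOCKED_VAL state and give bch2_next_write_buffer_flush_journal_buf() a blocked out-parameter; the caller side is not part of this excerpt, so the following is only a hypothetical sketch of how a caller might use it (max_flush_seq and the success-path unblock are assumptions, not taken from this patch):

	bool blocked;
	struct journal_buf *buf =
		bch2_next_write_buffer_flush_journal_buf(j, max_flush_seq, &blocked);

	if (!IS_ERR_OR_NULL(buf)) {
		/* ... flush keys from buf into the btree write buffer ... */

		/*
		 * on an error/NULL return the helper above has already
		 * unblocked the journal itself; on success the caller is
		 * assumed to do it once the open entry no longer needs to
		 * stay quiesced:
		 */
		if (blocked)
			bch2_journal_unblock(j);
	}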
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 2762be6f9814..cb0df0663946 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
spin_lock(&j->lock);
bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
- }
+ } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
+ wake_up(&j->wait);
}
/*
@@ -403,7 +404,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64, unsigned);
int bch2_journal_flush(struct journal *);
-bool bch2_journal_noflush_seq(struct journal *, u64);
+bool bch2_journal_noflush_seq(struct journal *, u64, u64);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
@@ -411,7 +412,7 @@ void bch2_journal_halt(struct journal *);
static inline int bch2_journal_error(struct journal *j)
{
return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -EIO : 0;
+ ? -BCH_ERR_journal_shutdown : 0;
}
struct bch_dev;
@@ -424,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index fb35dd336331..e1773ac27824 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,8 @@
#include "sb-clean.h"
#include "trace.h"
+#include <linux/string_choices.h>
+
void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
{
lockdep_assert_held(&c->sb_lock);
@@ -299,7 +301,7 @@ static void journal_entry_err_msg(struct printbuf *out,
journal_entry_err_msg(&_buf, version, jset, entry); \
prt_printf(&_buf, msg, ##__VA_ARGS__); \
\
- switch (flags & BCH_VALIDATE_write) { \
+ switch (from.flags & BCH_VALIDATE_write) { \
case READ: \
mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
@@ -325,11 +327,11 @@ static void journal_entry_err_msg(struct printbuf *out,
static int journal_validate_key(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
- unsigned level, enum btree_id btree_id,
struct bkey_i *k,
- unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from,
+ unsigned version, int big_endian)
{
+ enum bch_validate_flags flags = from.flags;
int write = flags & BCH_VALIDATE_write;
void *next = vstruct_next(entry);
int ret = 0;
@@ -364,11 +366,10 @@ static int journal_validate_key(struct bch_fs *c,
}
if (!write)
- bch2_bkey_compat(level, btree_id, version, big_endian,
+ bch2_bkey_compat(from.level, from.btree, version, big_endian,
write, NULL, bkey_to_packed(k));
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id), write);
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
if (ret == -BCH_ERR_fsck_delete_bkey) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -379,7 +380,7 @@ static int journal_validate_key(struct bch_fs *c,
goto fsck_err;
if (write)
- bch2_bkey_compat(level, btree_id, version, big_endian,
+ bch2_bkey_compat(from.level, from.btree, version, big_endian,
write, NULL, bkey_to_packed(k));
fsck_err:
return ret;
@@ -389,16 +390,15 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_i *k = entry->start;
+ from.level = entry->level;
+ from.btree = entry->btree_id;
+
while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, jset, entry,
- entry->level,
- entry->btree_id,
- k, version, big_endian,
- flags|BCH_VALIDATE_journal);
+ int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
if (ret == FSCK_DELETED_KEY)
continue;
else if (ret)
@@ -421,7 +421,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
bch2_prt_jset_entry_type(out, entry->type);
prt_str(out, ": ");
}
- prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
+ bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
+ prt_char(out, ' ');
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
@@ -431,11 +432,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_i *k = entry->start;
int ret = 0;
+ from.root = true;
+ from.level = entry->level + 1;
+ from.btree = entry->btree_id;
+
if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s,
c, version, jset, entry,
@@ -452,8 +457,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
return 0;
}
- ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
- version, big_endian, flags);
+ ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
if (ret == FSCK_DELETED_KEY)
ret = 0;
fsck_err:
@@ -470,7 +474,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
/* obsolete, don't care: */
return 0;
@@ -485,7 +489,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -512,7 +516,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
@@ -554,7 +558,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
@@ -588,7 +592,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
@@ -632,7 +636,7 @@ static int journal_entry_clock_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
@@ -665,14 +669,14 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
- prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+ prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
}
static int journal_entry_dev_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
@@ -729,7 +733,7 @@ static int journal_entry_log_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
return 0;
}
@@ -738,19 +742,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
- unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
- prt_printf(out, "%.*s", bytes, l->d);
+ prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
}
static int journal_entry_overwrite_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
+ from.flags = 0;
return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, READ);
+ version, big_endian, from);
}
static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
@@ -763,10 +767,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, READ);
+ version, big_endian, from);
}
static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
@@ -779,7 +783,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
unsigned bytes = vstruct_bytes(entry);
unsigned expected = 16;
@@ -809,7 +813,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
@@ -827,11 +831,11 @@ int bch2_journal_entry_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
return entry->type < BCH_JSET_ENTRY_NR
? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
- version, big_endian, flags)
+ version, big_endian, from)
: 0;
}
@@ -849,10 +853,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
enum bch_validate_flags flags)
{
+ struct bkey_validate_context from = {
+ .flags = flags,
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(jset->seq),
+ };
+
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
vstruct_for_each(jset, entry) {
+ from.journal_offset = (u64 *) entry - jset->_data;
+
if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
c, version, jset, entry,
journal_entry_past_jset_end,
@@ -861,8 +873,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
break;
}
- ret = bch2_journal_entry_validate(c, jset, entry,
- version, JSET_BIG_ENDIAN(jset), flags);
+ ret = bch2_journal_entry_validate(c, jset, entry, version,
+ JSET_BIG_ENDIAN(jset), from);
if (ret)
break;
}
@@ -875,13 +887,17 @@ static int jset_validate(struct bch_fs *c,
struct jset *jset, u64 sector,
enum bch_validate_flags flags)
{
- unsigned version;
+ struct bkey_validate_context from = {
+ .flags = flags,
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(jset->seq),
+ };
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
- version = le32_to_cpu(jset->version);
+ unsigned version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
jset_unsupported_version,
@@ -926,15 +942,16 @@ static int jset_validate_early(struct bch_fs *c,
unsigned bucket_sectors_left,
unsigned sectors_read)
{
- size_t bytes = vstruct_bytes(jset);
- unsigned version;
- enum bch_validate_flags flags = BCH_VALIDATE_journal;
+ struct bkey_validate_context from = {
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(jset->seq),
+ };
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
- version = le32_to_cpu(jset->version);
+ unsigned version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
jset_unsupported_version,
@@ -947,6 +964,7 @@ static int jset_validate_early(struct bch_fs *c,
return -EINVAL;
}
+ size_t bytes = vstruct_bytes(jset);
if (bytes > (sectors_read << 9) &&
sectors_read < bucket_sectors_left)
return JOURNAL_ENTRY_REREAD;
@@ -1096,8 +1114,10 @@ static int journal_read_bucket(struct bch_dev *ca,
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf)))
+ err.buf))) {
saw_bad = true;
+ bch2_fatal_error(c);
+ }
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@@ -1231,8 +1251,6 @@ int bch2_journal_read(struct bch_fs *c,
* those entries will be blacklisted:
*/
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- enum bch_validate_flags flags = BCH_VALIDATE_journal;
-
i = *_i;
if (journal_replay_ignore(i))
@@ -1252,6 +1270,10 @@ int bch2_journal_read(struct bch_fs *c,
continue;
}
+ struct bkey_validate_context from = {
+ .from = BKEY_VALIDATE_journal,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ };
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
c, le32_to_cpu(i->j.version), &i->j, NULL,
jset_last_seq_newer_than_seq,
@@ -1411,27 +1433,50 @@ int bch2_journal_read(struct bch_fs *c,
/* journal write: */
+static void journal_advance_devs_to_next_bucket(struct journal *j,
+ struct dev_alloc_list *devs,
+ unsigned sectors, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ darray_for_each(*devs, i) {
+ struct bch_dev *ca = rcu_dereference(c->devs[*i]);
+ if (!ca)
+ continue;
+
+ struct journal_device *ja = &ca->journal;
+
+ if (sectors > ja->sectors_free &&
+ sectors <= ca->mi.bucket_size &&
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->sectors_free = ca->mi.bucket_size;
+
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
+ }
+ }
+}
+
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
- struct dev_alloc_list *devs_sorted,
+ struct dev_alloc_list *devs,
unsigned sectors,
unsigned *replicas,
unsigned replicas_want)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_device *ja;
- struct bch_dev *ca;
- unsigned i;
- if (*replicas >= replicas_want)
- return;
-
- for (i = 0; i < devs_sorted->nr; i++) {
- ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
+ darray_for_each(*devs, i) {
+ struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
continue;
- ja = &ca->journal;
+ struct journal_device *ja = &ca->journal;
/*
* Check that we can use this device, and aren't already using
@@ -1477,65 +1522,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
- struct journal_device *ja;
- struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
- unsigned i, replicas = 0, replicas_want =
+ unsigned replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_need = min_t(unsigned, replicas_want,
READ_ONCE(c->opts.metadata_replicas_required));
+ bool advance_done = false;
rcu_read_lock();
-retry:
- devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+ /* We might run more than once if we have to stop and do discards: */
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
+ bkey_for_each_ptr(ptrs, p) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
+ if (ca)
+ replicas += ca->mi.durability;
+ }
- __journal_write_alloc(j, w, &devs_sorted,
- sectors, &replicas, replicas_want);
+retry_target:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+retry_alloc:
+ __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
- if (replicas >= replicas_want)
+ if (likely(replicas >= replicas_want))
goto done;
- for (i = 0; i < devs_sorted.nr; i++) {
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
- if (!ca)
- continue;
-
- ja = &ca->journal;
-
- if (sectors > ja->sectors_free &&
- sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja,
- journal_space_discarded)) {
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = ca->mi.bucket_size;
-
- /*
- * ja->bucket_seq[ja->cur_idx] must always have
- * something sensible:
- */
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
- }
+ if (!advance_done) {
+ journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
+ advance_done = true;
+ goto retry_alloc;
}
- __journal_write_alloc(j, w, &devs_sorted,
- sectors, &replicas, replicas_want);
-
if (replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
- goto retry;
+ advance_done = false;
+ goto retry_target;
}
done:
rcu_read_unlock();
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= replicas_need ? 0 : -EROFS;
+ return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -2023,19 +2056,21 @@ CLOSURE_CALLBACK(bch2_journal_write)
bch2_journal_do_discards(j);
}
- if (ret) {
+ if (ret && !bch2_journal_error(j)) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"),
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
le64_to_cpu(w->data->seq),
+ vstruct_sectors(w->data, c->block_bits),
bch2_err_str(ret));
__bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
- goto err;
}
+ if (ret)
+ goto err;
/*
* write is allocated, no longer need to account for it in
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 2ca9cde30ea8..12b39fcb4424 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
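
The journal_io.c/.h changes above thread a struct bkey_validate_context through every jset entry validate hook in place of the old loose level/btree_id/flags arguments; condensed from the hunks above into one illustrative fragment (in the real code the bkey validate call happens inside the per-entry hooks, not in the outer loop):

	struct bkey_validate_context from = {
		.from		= BKEY_VALIDATE_journal,
		.flags		= flags,
		.journal_seq	= le64_to_cpu(jset->seq),
	};

	vstruct_for_each(jset, entry) {
		from.journal_offset = (u64 *) entry - jset->_data;

		/* btree-keys entries additionally record their btree position: */
		from.level	= entry->level;
		from.btree	= entry->btree_id;

		struct bkey_i *k = entry->start;
		int ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
		if (ret)
			break;
	}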
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ace291f175dd..3c8242606da7 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja,
enum journal_space_from from)
{
+ if (!ja->nr)
+ return 0;
+
unsigned available = (journal_space_from(ja, from) -
ja->cur_idx - 1 + ja->nr) % ja->nr;
@@ -137,14 +140,18 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned pos, nr_devs = 0;
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+ unsigned min_bucket_size = U32_MAX;
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->journal.nr)
+ if (!ca->journal.nr ||
+ !ca->mi.durability)
continue;
+ min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
+
space = journal_dev_space_available(j, ca, from);
if (!space.next_entry)
continue;
@@ -164,7 +171,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
- return dev_space[nr_devs_want - 1];
+ space = dev_space[nr_devs_want - 1];
+ space.next_entry = min(space.next_entry, min_bucket_size);
+ return space;
}
void bch2_journal_space_available(struct journal *j)
@@ -758,10 +767,12 @@ static int bch2_journal_reclaim_thread(void *arg)
journal_empty = fifo_empty(&j->pin);
spin_unlock(&j->lock);
+ long timeout = j->next_reclaim - jiffies;
+
if (journal_empty)
schedule();
- else if (time_after(j->next_reclaim, jiffies))
- schedule_timeout(j->next_reclaim - jiffies);
+ else if (timeout > 0)
+ schedule_timeout(timeout);
else
break;
}
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 19183fcf7ad7..e9bd716fbb71 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -9,6 +9,9 @@
#include "super_types.h"
#include "fifo.h"
+/* btree write buffer steals 8 bits for its own purposes: */
+#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
+
#define JOURNAL_BUF_BITS 2
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
@@ -112,6 +115,7 @@ union journal_res_state {
*/
#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
@@ -193,6 +197,7 @@ struct journal {
* insufficient devices:
*/
enum journal_errors cur_entry_error;
+ unsigned cur_entry_offset_if_blocked;
unsigned buf_size_want;
/*
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 60e00702d1a4..75f27ec26f85 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -63,8 +63,10 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
int bch2_resume_logged_ops(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_logged_ops, POS_MIN,
+ for_each_btree_key_max(trans, iter,
+ BTREE_ID_logged_ops,
+ POS(LOGGED_OPS_INUM_logged_ops, 0),
+ POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
BTREE_ITER_prefetch, k,
resume_logged_op(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -74,9 +76,8 @@ int bch2_resume_logged_ops(struct bch_fs *c)
static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
struct btree_iter iter;
- int ret;
-
- ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+ int ret = bch2_bkey_get_empty_slot(trans, &iter,
+ BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
if (ret)
return ret;
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
index 6a4bf7129dba..cfb67c95d4c8 100644
--- a/fs/bcachefs/logged_ops_format.h
+++ b/fs/bcachefs/logged_ops_format.h
@@ -2,6 +2,11 @@
#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+enum logged_ops_inums {
+ LOGGED_OPS_INUM_logged_ops,
+ LOGGED_OPS_INUM_inode_cursors,
+};
+
struct bch_logged_op_truncate {
struct bch_val v;
__le32 subvol;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 10857eccdeaf..ce794d55818f 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -12,7 +12,7 @@
/* KEY_TYPE_lru is obsolete: */
int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -192,7 +192,7 @@ int bch2_check_lrus(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_lru_key(trans, &iter, k, &last_flushed)));
bch2_bkey_buf_exit(&last_flushed, c);
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index e6a7d8241bb8..f31a6cf1514c 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -33,7 +33,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
return BCH_LRU_read;
}
-int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 0ef4a86850bb..c493ea625553 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -21,6 +21,8 @@
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
+#include "rebalance.h"
+#include "reflink.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
@@ -196,6 +198,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
+ /*
+ * Generally, releasing a transaction within a transaction restart means
+ * an unhandled transaction restart: but this can happen legitimately
+ * within the move code, e.g. when bch2_move_ratelimit() tells us to
+ * exit before we've retried
+ */
+ bch2_trans_begin(ctxt->trans);
bch2_trans_put(ctxt->trans);
memset(ctxt, 0, sizeof(*ctxt));
}
@@ -379,34 +388,42 @@ int bch2_move_extent(struct moving_context *ctxt,
return ret;
}
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct per_snapshot_io_opts *io_opts,
+ struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
+ struct btree_iter *extent_iter,
struct bkey_s_c extent_k)
{
struct bch_fs *c = trans->c;
u32 restart_count = trans->restart_count;
+ struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
int ret = 0;
- if (io_opts->cur_inum != extent_k.k->p.inode) {
+ if (extent_k.k->type == KEY_TYPE_reflink_v)
+ goto out;
+
+ if (io_opts->cur_inum != extent_pos.inode) {
io_opts->d.nr = 0;
- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
BTREE_ITER_all_snapshots, k, ({
- if (k.k->p.offset != extent_k.k->p.inode)
+ if (k.k->p.offset != extent_pos.inode)
break;
if (!bkey_is_inode(k.k))
continue;
struct bch_inode_unpacked inode;
- BUG_ON(bch2_inode_unpack(k, &inode));
+ _ret3 = bch2_inode_unpack(k, &inode);
+ if (_ret3)
+ break;
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
darray_push(&io_opts->d, e);
}));
- io_opts->cur_inum = extent_k.k->p.inode;
+ io_opts->cur_inum = extent_pos.inode;
}
ret = ret ?: trans_was_restarted(trans, restart_count);
@@ -415,43 +432,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
if (extent_k.k->p.snapshot)
darray_for_each(io_opts->d, i)
- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
- return &i->io_opts;
-
- return &io_opts->fs_io_opts;
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
+ opts_ret = &i->io_opts;
+ break;
+ }
+out:
+ ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
+ if (ret)
+ return ERR_PTR(ret);
+ return opts_ret;
}
int bch2_move_get_io_opts_one(struct btree_trans *trans,
struct bch_io_opts *io_opts,
+ struct btree_iter *extent_iter,
struct bkey_s_c extent_k)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
+ struct bch_fs *c = trans->c;
+
+ *io_opts = bch2_opts_to_inode_opts(c->opts);
/* reflink btree? */
- if (!extent_k.k->p.inode) {
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
- return 0;
- }
+ if (!extent_k.k->p.inode)
+ goto out;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter inode_iter;
+ struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
BTREE_ITER_cached);
- ret = bkey_err(k);
+ int ret = bkey_err(inode_k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- if (!ret && bkey_is_inode(k.k)) {
+ if (!ret && bkey_is_inode(inode_k.k)) {
struct bch_inode_unpacked inode;
- bch2_inode_unpack(k, &inode);
- bch2_inode_opts_get(io_opts, trans->c, &inode);
- } else {
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ bch2_inode_unpack(inode_k, &inode);
+ bch2_inode_opts_get(io_opts, c, &inode);
}
-
- bch2_trans_iter_exit(trans, &iter);
- return 0;
+ bch2_trans_iter_exit(trans, &inode_iter);
+out:
+ return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
}
int bch2_move_ratelimit(struct moving_context *ctxt)
@@ -509,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
struct per_snapshot_io_opts snapshot_io_opts;
struct bch_io_opts *io_opts;
struct bkey_buf sk;
- struct btree_iter iter;
+ struct btree_iter iter, reflink_iter = {};
struct bkey_s_c k;
struct data_update_opts data_opts;
+ /*
+ * If we're moving a single file, also process reflinked data it points
+ * to (this includes propagating changed io_opts from the inode to the
+ * extent):
+ */
+ bool walk_indirect = start.inode == end.inode;
int ret = 0, ret2;
per_snapshot_io_opts_init(&snapshot_io_opts, c);
@@ -531,6 +557,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_ratelimit_reset(ctxt->rate);
while (!bch2_move_ratelimit(ctxt)) {
+ struct btree_iter *extent_iter = &iter;
+
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -549,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+ if (walk_indirect &&
+ k.k->type == KEY_TYPE_reflink_p &&
+ REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+
+ bch2_trans_iter_exit(trans, &reflink_iter);
+ k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_deleted(k.k))
+ goto next_nondata;
+
+ /*
+ * XXX: reflink pointers may point to multiple indirect
+ * extents, so don't advance past the entire reflink
+ * pointer - need to fixup iter->k
+ */
+ extent_iter = &reflink_iter;
+ }
+
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
+ iter.pos, extent_iter, k);
ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
@@ -568,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
@@ -589,6 +643,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &reflink_iter);
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
per_snapshot_io_opts_exit(&snapshot_io_opts);
@@ -654,16 +709,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
bool is_kthread = current->flags & PF_KTHREAD;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_iter iter;
+ struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
- struct bch_backpointer bp;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
struct bkey_s_c k;
struct data_update_opts data_opts;
- unsigned dirty_sectors, bucket_size;
- u64 fragmentation;
- struct bpos bp_pos = POS_MIN;
+ unsigned sectors_moved = 0;
+ struct bkey_buf last_flushed;
int ret = 0;
struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
@@ -672,6 +723,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
trace_bucket_evacuate(c, &bucket);
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
bch2_bkey_buf_init(&sk);
/*
@@ -679,21 +732,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
*/
bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_cached);
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket), 0);
bch_err_msg(c, ret, "looking up alloc key");
if (ret)
goto err;
- a = bch2_alloc_to_v4(k, &a_convert);
- dirty_sectors = bch2_bucket_sectors_dirty(*a);
- bucket_size = ca->mi.bucket_size;
- fragmentation = alloc_lru_idx_fragmentation(*a, ca);
-
ret = bch2_btree_write_buffer_tryflush(trans);
bch_err_msg(c, ret, "flushing btree write buffer");
if (ret)
@@ -705,18 +750,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
- ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
- &bp_pos, &bp,
- BTREE_ITER_cached);
+ k = bch2_btree_iter_peek(&bp_iter);
+ ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
goto err;
- if (bkey_eq(bp_pos, POS_MAX))
+
+ if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
break;
- if (!bp.level) {
- k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+ if (k.k->type != KEY_TYPE_backpointer)
+ goto next;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ if (!bp.v->level) {
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -728,7 +778,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
@@ -738,14 +788,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
data_opts.target = io_opts.background_target;
data_opts.rewrite_ptrs = 0;
+ unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
unsigned i = 0;
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (ptr->dev == bucket.inode) {
- data_opts.rewrite_ptrs |= 1U << i;
- if (ptr->cached) {
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (p.ptr.dev == bucket.inode) {
+ if (p.ptr.cached) {
bch2_trans_iter_exit(trans, &iter);
goto next;
}
+ data_opts.rewrite_ptrs |= 1U << i;
+ break;
}
i++;
}
@@ -765,14 +819,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->stats)
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ sectors_moved += sectors;
} else {
struct btree *b;
- b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
+ b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
ret = PTR_ERR_OR_ZERO(b);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- continue;
+ goto next;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
@@ -796,15 +851,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
atomic64_add(sectors, &ctxt->stats->sectors_seen);
atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
+ sectors_moved += btree_sectors(c);
}
next:
- bp_pos = bpos_nosnap_successor(bp_pos);
+ bch2_btree_iter_advance(&bp_iter);
}
- trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+ trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
err:
+ bch2_trans_iter_exit(trans, &bp_iter);
bch2_dev_put(ca);
bch2_bkey_buf_exit(&sk, c);
+ bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 9baf3093a678..51e0505a8156 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt
darray_exit(&io_opts->d);
}
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
- struct per_snapshot_io_opts *, struct bkey_s_c);
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
+ struct btree_iter *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index d658be90f737..85c361e78ba5 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -167,7 +167,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
bch2_trans_begin(trans);
- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({
@@ -350,9 +350,9 @@ static int bch2_copygc_thread(void *arg)
bch2_trans_unlock_long(ctxt.trans);
cond_resched();
- if (!c->copy_gc_enabled) {
+ if (!c->opts.copygc_enabled) {
move_buckets_wait(&ctxt, buckets, true);
- kthread_wait_freezable(c->copy_gc_enabled ||
+ kthread_wait_freezable(c->opts.copygc_enabled ||
kthread_should_stop());
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 0e2ee262fbd4..6772faf385a5 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/fs_parser.h>
#include "bcachefs.h"
#include "compress.h"
@@ -48,12 +49,12 @@ static const char * const __bch2_csum_types[] = {
NULL
};
-const char * const bch2_csum_opts[] = {
+const char * const __bch2_csum_opts[] = {
BCH_CSUM_OPTS()
NULL
};
-static const char * const __bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -113,6 +114,7 @@ void bch2_prt_##name(struct printbuf *out, type t) \
PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
@@ -333,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c,
switch (opt->type) {
case BCH_OPT_BOOL:
if (val) {
- ret = kstrtou64(val, 10, res);
+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
+ if (ret != -BCH_ERR_option_not_bool) {
+ *res = ret;
+ } else {
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret;
+ }
} else {
- ret = 0;
*res = 1;
}
- if (ret < 0 || (*res != 0 && *res != 1)) {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret < 0 ? ret : -BCH_ERR_option_not_bool;
- }
break;
case BCH_OPT_UINT:
if (!val) {
@@ -710,11 +713,14 @@ void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
- return (struct bch_io_opts) {
+ struct bch_io_opts opts = {
#define x(_name, _bits) ._name = src._name,
BCH_INODE_OPTS()
#undef x
};
+
+ bch2_io_opts_fixups(&opts);
+ return opts;
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 23dda014e331..e763d52e0f38 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,7 +16,8 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_opts[];
+extern const char * const __bch2_csum_opts[];
+extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
@@ -27,6 +28,7 @@ extern const char * const bch2_d_types[];
void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
@@ -171,12 +173,12 @@ enum fsck_err_opts {
"size", "Maximum size of checksummed/compressed extents")\
x(metadata_checksum, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_csum_opts), \
+ OPT_STR(__bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_csum_opts), \
+ OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(compression, u8, \
@@ -220,14 +222,14 @@ enum fsck_err_opts {
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(inodes_32bit, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_SHARD_INUMS, true, \
+ x(shard_inode_numbers_bits, u8, \
+ OPT_FS|OPT_FORMAT, \
+ OPT_UINT(0, 8), \
+ BCH_SB_SHARD_INUMS_NBITS, 0, \
NULL, "Shard new inode numbers by CPU id") \
x(inodes_use_key_cache, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
@@ -473,6 +475,18 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, true, \
NULL, "Enable nocow mode: enables runtime locking in\n"\
"data move path needed if nocow will ever be in use\n")\
+ x(copygc_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable copygc: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
+ x(rebalance_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable rebalance: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
x(no_data_io, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
@@ -488,7 +502,7 @@ enum fsck_err_opts {
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
BCH2_NO_SB_OPT, 0, \
- "size", "Size of filesystem on device") \
+ "size", "Specifies the bucket size; must be greater than the btree node size")\
x(durability, u8, \
OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
@@ -624,14 +638,39 @@ struct bch_io_opts {
#define x(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef x
+#define x(_name, _bits) u64 _name##_from_inode:1;
+ BCH_INODE_OPTS()
+#undef x
};
-static inline unsigned background_compression(struct bch_io_opts opts)
+static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
{
- return opts.background_compression ?: opts.compression;
+ if (!opts->background_target)
+ opts->background_target = opts->foreground_target;
+ if (!opts->background_compression)
+ opts->background_compression = opts->compression;
+ if (opts->nocow) {
+ opts->compression = opts->background_compression = 0;
+ opts->data_checksum = 0;
+ opts->erasure_code = 0;
+ }
}
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
+/* rebalance opts: */
+
+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts)
+{
+ return (struct bch_extent_rebalance) {
+ .type = BIT(BCH_EXTENT_ENTRY_rebalance),
+#define x(_name) \
+ ._name = opts->_name, \
+ ._name##_from_inode = opts->_name##_from_inode,
+ BCH_REBALANCE_OPTS()
+#undef x
+ };
+}
+
#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 1d570387b77f..d0dd398baa2b 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -251,16 +251,23 @@ static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
printbuf_nul_terminate_reserved(out);
}
+static inline void printbuf_reset_keep_tabstops(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->last_newline = 0;
+ buf->last_field = 0;
+ buf->indent = 0;
+ buf->cur_tabstop = 0;
+}
+
/**
* printbuf_reset - re-use a printbuf without freeing and re-initializing it:
*/
static inline void printbuf_reset(struct printbuf *buf)
{
- buf->pos = 0;
- buf->allocation_failure = 0;
- buf->indent = 0;
+ printbuf_reset_keep_tabstops(buf);
buf->nr_tabstops = 0;
- buf->cur_tabstop = 0;
}
/**
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 74f45a8162ad..8b857fc33244 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -60,7 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
};
int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index a62abcc5332a..1551800ff44c 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -5,10 +5,10 @@
#include "inode.h"
#include "quota_types.h"
-enum bch_validate_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_quota ((struct bkey_ops) { \
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
index 40a20192eee8..bef2aa1b8bcd 100644
--- a/fs/bcachefs/rcu_pending.c
+++ b/fs/bcachefs/rcu_pending.c
@@ -25,21 +25,37 @@ enum rcu_pending_special {
#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
-static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
+#ifdef __KERNEL__
+typedef unsigned long rcu_gp_poll_state_t;
+
+static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
+{
+ return l == r;
+}
+#else
+typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;
+
+static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
+{
+ return l.grace_period_id == r.grace_period_id;
+}
+#endif
+
+static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
return ssp
? get_state_synchronize_srcu(ssp)
: get_state_synchronize_rcu();
}
-static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
return ssp
? start_poll_synchronize_srcu(ssp)
: start_poll_synchronize_rcu();
}
-static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
{
return ssp
? poll_state_synchronize_srcu(ssp, cookie)
@@ -71,13 +87,13 @@ struct rcu_pending_seq {
GENRADIX(struct rcu_head *) objs;
size_t nr;
struct rcu_head **cursor;
- unsigned long seq;
+ rcu_gp_poll_state_t seq;
};
struct rcu_pending_list {
struct rcu_head *head;
struct rcu_head *tail;
- unsigned long seq;
+ rcu_gp_poll_state_t seq;
};
struct rcu_pending_pcpu {
@@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu)
}
static __always_inline struct rcu_pending_seq *
-get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
+get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
{
darray_for_each_reverse(p->objs, objs)
- if (objs->seq == seq)
+ if (rcu_gp_poll_cookie_eq(objs->seq, seq))
return objs;
if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
@@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
}
static noinline bool
-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
struct rcu_head *head, void *ptr,
unsigned long *flags)
{
@@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
again:
for (struct rcu_pending_list *i = p->lists;
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (i->seq == seq) {
+ if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
rcu_pending_list_add(i, head);
return false;
}
@@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
struct rcu_pending_pcpu *p;
struct rcu_pending_seq *objs;
struct genradix_node *new_node = NULL;
- unsigned long seq, flags;
+ unsigned long flags;
bool start_gp = false;
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
@@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
local_irq_save(flags);
p = this_cpu_ptr(pending->p);
spin_lock(&p->lock);
- seq = __get_state_synchronize_rcu(pending->srcu);
+ rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
restart:
if (may_sleep &&
unlikely(process_finished_items(pending, p, flags)))
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index cd6647374353..4adc74cd3f70 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -24,6 +24,192 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
+/* bch_extent_rebalance: */
+
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s_c k,
+ struct bkey_ptrs_c ptrs)
+{
+ if (!opts->background_compression)
+ return 0;
+
+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned ptr_bit = 1;
+ unsigned rewrite_ptrs = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten)
+ return 0;
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ return rewrite_ptrs;
+}
+
+static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_ptrs_c ptrs)
+{
+ if (!opts->background_target ||
+ !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
+ return 0;
+
+ unsigned ptr_bit = 1;
+ unsigned rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
+ rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ return rewrite_ptrs;
+}
+
+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
+ bch2_bkey_ptrs_need_move(c, opts, ptrs);
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ if (!opts)
+ return 0;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 sectors = 0;
+
+ if (opts->background_compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ sectors = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ sectors += p.crc.compressed_size;
+ }
+ }
+incompressible:
+ if (opts->background_target &&
+ bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
+ sectors += p.crc.compressed_size;
+ }
+
+ return sectors;
+}
+
+static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bkey_s_c k)
+{
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+
+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts);
+ return old == NULL || memcmp(old, &new, sizeof(new));
+ } else {
+ return old != NULL;
+ }
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bkey_i *_k)
+{
+ if (!bkey_extent_is_direct_data(&_k->k))
+ return 0;
+
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *old =
+ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+
+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
+ if (!old) {
+ old = bkey_val_end(k);
+ k.k->u64s += sizeof(*old) / sizeof(u64);
+ }
+
+ *old = io_opts_to_rebalance_opts(opts);
+ } else {
+ if (old)
+ extent_entry_drop(k, (union bch_extent_entry *) old);
+ }
+
+ return 0;
+}
+
+int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ BUG_ON(iter->flags & BTREE_ITER_is_extents);
+ BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
+
+ const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
+ ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (r) {
+#define x(_name) \
+ if (r->_name##_from_inode) { \
+ io_opts->_name = r->_name; \
+ io_opts->_name##_from_inode = true; \
+ }
+ BCH_REBALANCE_OPTS()
+#undef x
+ }
+
+ if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
+ return 0;
+
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(n, k);
+
+	/* On successful transaction commit, @k was invalidated: */
+
+ return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+}
+
#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
static const char * const bch2_rebalance_state_strs[] = {
@@ -33,7 +219,7 @@ static const char * const bch2_rebalance_state_strs[] = {
#undef x
};
-static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -71,9 +257,8 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
int ret = bch2_trans_commit_do(c, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_lazy_rw,
- __bch2_set_rebalance_needs_scan(trans, inum));
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_set_rebalance_needs_scan_trans(trans, inum));
rebalance_wakeup(c);
return ret;
}
@@ -121,6 +306,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
+ if (!bch2_bkey_rebalance_opts(k))
+ return 0;
+
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
int ret = PTR_ERR_OR_ZERO(n);
if (ret)
@@ -134,31 +322,27 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct bpos work_pos,
struct btree_iter *extent_iter,
+ struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
bch2_trans_iter_exit(trans, extent_iter);
bch2_trans_iter_init(trans, extent_iter,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
BTREE_ITER_all_snapshots);
- k = bch2_btree_iter_peek_slot(extent_iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
if (bkey_err(k))
return k;
- const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
- if (!r) {
- /* raced due to btree write buffer, nothing to do */
- return bkey_s_c_null;
- }
+ int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
memset(data_opts, 0, sizeof(*data_opts));
-
- data_opts->rewrite_ptrs =
- bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
- data_opts->target = r->target;
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
+ data_opts->target = io_opts->background_target;
data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
if (!data_opts->rewrite_ptrs) {
@@ -178,12 +362,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
if (trace_rebalance_extent_enabled()) {
struct printbuf buf = PRINTBUF;
- prt_str(&buf, "target=");
- bch2_target_to_text(&buf, c, r->target);
- prt_str(&buf, " compression=");
- bch2_compression_opt_to_text(&buf, r->compression);
- prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
+ if (p) {
+ prt_str(&buf, "compression=");
+ bch2_compression_opt_to_text(&buf, io_opts->background_compression);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, p);
+ prt_newline(&buf);
+ }
+
+ p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
+ if (p) {
+ prt_str(&buf, "move=");
+ bch2_target_to_text(&buf, c, io_opts->background_target);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, p);
+ prt_newline(&buf);
+ }
trace_rebalance_extent(c, buf.buf);
printbuf_exit(&buf);
@@ -212,14 +412,10 @@ static int do_rebalance_extent(struct moving_context *ctxt,
bch2_bkey_buf_init(&sk);
ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &data_opts));
+ extent_iter, &io_opts, &data_opts));
if (ret || !k.k)
goto out;
- ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
- if (ret)
- goto out;
-
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
/*
@@ -253,20 +449,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- unsigned target, compression;
-
- if (k.k->p.inode) {
- target = io_opts->background_target;
- compression = background_compression(*io_opts);
- } else {
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-
- target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression : background_compression(*io_opts);
- }
-
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
- data_opts->target = target;
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
+ data_opts->target = io_opts->background_target;
data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}
@@ -338,9 +522,9 @@ static int do_rebalance(struct moving_context *ctxt)
BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
- if (!r->enabled) {
+ if (!c->opts.rebalance_enabled) {
bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(r->enabled ||
+ kthread_wait_freezable(c->opts.rebalance_enabled ||
kthread_should_stop());
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 28a52638f16c..0a0821ab895d 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -2,8 +2,18 @@
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H
+#include "compress.h"
+#include "disk_groups.h"
#include "rebalance_types.h"
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
+int bch2_get_update_rebalance_opts(struct btree_trans *,
+ struct bch_io_opts *,
+ struct btree_iter *,
+ struct bkey_s_c);
+
+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
new file mode 100644
index 000000000000..ff9a1342a22b
--- /dev/null
+++ b/fs/bcachefs/rebalance_format.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_FORMAT_H
+#define _BCACHEFS_REBALANCE_FORMAT_H
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:3,
+
+ promote_target_from_inode:1,
+ erasure_code_from_inode:1,
+ data_checksum_from_inode:1,
+ background_compression_from_inode:1,
+ data_replicas_from_inode:1,
+ background_target_from_inode:1,
+
+ promote_target:16,
+ erasure_code:1,
+ data_checksum:4,
+ data_replicas:4,
+ background_compression:8, /* enum bch_compression_opt */
+ background_target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 background_target:16,
+ background_compression:8,
+ data_replicas:4,
+ data_checksum:4,
+ erasure_code:1,
+ promote_target:16,
+
+ background_target_from_inode:1,
+ data_replicas_from_inode:1,
+ background_compression_from_inode:1,
+ data_checksum_from_inode:1,
+ erasure_code_from_inode:1,
+ promote_target_from_inode:1,
+
+ unused:3,
+ type:6;
+#endif
+};
+
+/* subset of BCH_INODE_OPTS */
+#define BCH_REBALANCE_OPTS() \
+ x(data_checksum) \
+ x(background_compression) \
+ x(data_replicas) \
+ x(promote_target) \
+ x(background_target) \
+ x(erasure_code)
+
+#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
+
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 0fffb536c1d0..fe5098c17dfc 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -30,8 +30,6 @@ struct bch_fs_rebalance {
struct bbpos scan_start;
struct bbpos scan_end;
struct bch_move_stats scan_stats;
-
- unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 3c7f941dde39..98825437381c 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -34,21 +34,83 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
+int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
- if (btree >= BTREE_ID_NR_MAX)
- return;
-
u64 b = BIT_ULL(btree);
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!(c->sb.btrees_lost_data & b)) {
- bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_to_text(&buf, btree);
+ bch_err(c, "flagging btree %s lost data", buf.buf);
+ printbuf_exit(&buf);
+ ext->btrees_lost_data |= cpu_to_le64(b);
+ }
- mutex_lock(&c->sb_lock);
- bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ /* Once we have runtime self healing for topology errors we won't need this: */
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
+
+ /* Btree node accounting will be off: */
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ /*
+ * These are much more minor, and don't need to be corrected right away,
+ * but in debug mode we want the next fsck run to be clean:
+ */
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
+#endif
+
+ switch (btree) {
+ case BTREE_ID_alloc:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ goto out;
+ case BTREE_ID_backpointers:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
+ goto out;
+ case BTREE_ID_need_discard:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_freespace:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_bucket_gens:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_lru:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_accounting:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ goto out;
+ default:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
+ goto out;
}
+out:
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+static void kill_btree(struct bch_fs *c, enum btree_id btree)
+{
+ bch2_btree_id_root(c, btree)->alive = false;
+ bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/* for -o reconstruct_alloc: */
@@ -79,6 +141,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
+
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
@@ -99,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
+ if (btree_id_is_alloc(i))
+ kill_btree(c, i);
}
/*
@@ -354,10 +411,13 @@ int bch2_journal_replay(struct bch_fs *c)
? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
: 0),
bch2_journal_replay_key(trans, k));
- bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
- bch2_btree_id_str(k->btree_id), k->level);
- if (ret)
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
+ bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
+ printbuf_exit(&buf);
goto err;
+ }
BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
}
@@ -403,7 +463,9 @@ static int journal_replay_entry_early(struct bch_fs *c,
switch (entry->type) {
case BCH_JSET_ENTRY_btree_root: {
- struct btree_root *r;
+
+ if (unlikely(!entry->u64s))
+ return 0;
if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
c, invalid_btree_id,
@@ -417,15 +479,11 @@ static int journal_replay_entry_early(struct bch_fs *c,
return ret;
}
- r = bch2_btree_id_root(c, entry->btree_id);
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
- if (entry->u64s) {
- r->level = entry->level;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
- r->error = 0;
- } else {
- r->error = -BCH_ERR_btree_node_read_error;
- }
+ r->level = entry->level;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+ r->error = 0;
r->alive = true;
break;
}
@@ -505,6 +563,7 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
+ struct printbuf buf = PRINTBUF;
int ret = 0;
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
@@ -513,33 +572,22 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
- continue;
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, i, r->level);
if (mustfix_fsck_err_on((ret = r->error),
c, btree_root_bkey_invalid,
"invalid btree root %s",
- bch2_btree_id_str(i)) ||
+ buf.buf) ||
mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
c, btree_root_read_error,
- "error reading btree root %s l=%u: %s",
- bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
- if (btree_id_is_alloc(i)) {
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ "error reading btree root %s: %s",
+ buf.buf, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i))
r->error = 0;
- } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
- bch_info(c, "will run btree node scan");
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
- }
- ret = 0;
- bch2_btree_lost_data(c, i);
+ ret = bch2_btree_lost_data(c, i);
+ BUG_ON(ret);
}
}
@@ -553,6 +601,7 @@ static int read_btree_roots(struct bch_fs *c)
}
}
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -563,6 +612,7 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
+ bool ret = false;
if (old_version < bcachefs_metadata_required_upgrade_below) {
if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
@@ -618,14 +668,32 @@ static bool check_version_upgrade(struct bch_fs *c)
}
bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
- bch2_sb_upgrade(c, new_version);
+ ret = true;
+ }
+ if (new_version > c->sb.version_incompat &&
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Now allowing incompatible features up to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_str(&buf, ", previously allowed up to ");
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+ prt_newline(&buf);
+
+ bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
- return true;
+
+ ret = true;
}
- return false;
+ if (ret)
+ bch2_sb_upgrade(c, new_version,
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
+
+ return ret;
}
int bch2_fs_recovery(struct bch_fs *c)
@@ -660,8 +728,13 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.norecovery)
- c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
+ if (c->opts.norecovery) {
+ c->opts.recovery_pass_last = c->opts.recovery_pass_last
+ ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
+ : BCH_RECOVERY_PASS_snapshots_read;
+ c->opts.nochanges = true;
+ c->opts.read_only = true;
+ }
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
@@ -708,17 +781,20 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
+ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
+ write_sb = true;
+ }
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
-
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
if (c->sb.clean)
set_bit(BCH_FS_clean_recovery, &c->flags);
+ set_bit(BCH_FS_recovery_running, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -807,15 +883,15 @@ int bch2_fs_recovery(struct bch_fs *c)
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc)
- bch2_reconstruct_alloc(c);
-
zero_out_btree_mem_ptr(&c->journal_keys);
ret = journal_replay_early(c, clean);
if (ret)
goto err;
+ if (c->opts.reconstruct_alloc)
+ bch2_reconstruct_alloc(c);
+
/*
* After an unclean shutdown, skip then next few journal sequence
* numbers as they may have been referenced by btree writes that
@@ -870,16 +946,17 @@ int bch2_fs_recovery(struct bch_fs *c)
*/
set_bit(BCH_FS_may_go_rw, &c->flags);
clear_bit(BCH_FS_fsck_running, &c->flags);
+ clear_bit(BCH_FS_recovery_running, &c->flags);
/* in case we don't run journal replay, i.e. norecovery mode */
set_bit(BCH_FS_accounting_replay_done, &c->flags);
+ bch2_async_btree_node_rewrites_flush(c);
+
/* fsync if we fixed errors */
- if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_journal_flush_all_pins(&c->journal);
bch2_journal_meta(&c->journal);
- bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
}
/* If we fixed errors, verify that fs is actually clean now: */
@@ -1021,7 +1098,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_check_version_downgrade(c);
if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
- bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+ bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
bch2_write_super(c);
}
@@ -1035,7 +1112,6 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_btree_running, &c->flags);
set_bit(BCH_FS_may_go_rw, &c->flags);
@@ -1076,9 +1152,6 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- for_each_online_member(c, ca)
- ca->new_fs_bucket_idx = 0;
-
ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
@@ -1137,6 +1210,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
return 0;
err:
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4bf818de1f2f..b0d55754b21b 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,7 +2,7 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
+int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
int bch2_journal_replay(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index dff589ddc984..0b3c951c32da 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
+ if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
return bch2_fs_read_write_early(c);
return 0;
}
@@ -100,20 +100,34 @@ u64 bch2_recovery_passes_from_stable(u64 v)
/*
* For when we need to rewind recovery passes and run a pass we skipped:
*/
-int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
+static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
{
- if (c->opts.recovery_passes & BIT_ULL(pass))
+ if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
+ return -BCH_ERR_not_in_recovery;
+
+ if (c->recovery_passes_complete & BIT_ULL(pass))
return 0;
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+ bool print = !(c->opts.recovery_passes & BIT_ULL(pass));
+
+ if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
+ c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
+ if (print)
+ bch_info(c, "need recovery pass %s (%u), but already rw",
+ bch2_recovery_passes[pass], pass);
+ return -BCH_ERR_cannot_rewind_recovery;
+ }
+
+ if (print)
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
c->opts.recovery_passes |= BIT_ULL(pass);
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
+ if (c->curr_recovery_pass > pass) {
+ c->next_recovery_pass = pass;
c->recovery_passes_complete &= (1ULL << pass) >> 1;
return -BCH_ERR_restart_recovery;
} else {
@@ -121,6 +135,27 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
}
}
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&c->recovery_pass_lock, flags);
+ int ret = __bch2_run_explicit_recovery_pass(c, pass);
+ spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
+ return ret;
+}
+
+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
enum bch_recovery_pass pass)
{
@@ -233,31 +268,48 @@ int bch2_run_recovery_passes(struct bch_fs *c)
*/
c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- if (c->opts.recovery_pass_last &&
- c->curr_recovery_pass > c->opts.recovery_pass_last)
- break;
-
- if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
- unsigned pass = c->curr_recovery_pass;
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
+ c->next_recovery_pass = c->curr_recovery_pass + 1;
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?:
- bch2_journal_flush(&c->journal);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
- (ret && c->curr_recovery_pass < pass))
- continue;
- if (ret)
- break;
+ spin_lock_irq(&c->recovery_pass_lock);
+ unsigned pass = c->curr_recovery_pass;
- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last) {
+ spin_unlock_irq(&c->recovery_pass_lock);
+ break;
}
- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
-
- if (!test_bit(BCH_FS_error, &c->flags))
- bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
-
- c->curr_recovery_pass++;
+ if (!should_run_recovery_pass(c, pass)) {
+ c->curr_recovery_pass++;
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ spin_unlock_irq(&c->recovery_pass_lock);
+ continue;
+ }
+ spin_unlock_irq(&c->recovery_pass_lock);
+
+ ret = bch2_run_recovery_pass(c, pass) ?:
+ bch2_journal_flush(&c->journal);
+
+ if (!ret && !test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, pass);
+
+ spin_lock_irq(&c->recovery_pass_lock);
+ if (c->next_recovery_pass < c->curr_recovery_pass) {
+ /*
+ * bch2_run_explicit_recovery_pass() was called: we
+ * can't always catch -BCH_ERR_restart_recovery because
+ * it may have been called from another thread (btree
+ * node read completion)
+ */
+ ret = 0;
+ c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
+ } else {
+ c->recovery_passes_complete |= BIT_ULL(pass);
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ }
+ c->curr_recovery_pass = c->next_recovery_pass;
+ spin_unlock_irq(&c->recovery_pass_lock);
}
return ret;
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index 99b464e127b8..7d7339c8fa29 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v);
u64 bch2_fsck_recovery_passes(void);
int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
int bch2_run_online_recovery_passes(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 94dc20ca2065..418557960ed6 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -8,53 +8,59 @@
#define PASS_ALWAYS BIT(3)
#define PASS_ONLINE BIT(4)
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define PASS_FSCK_DEBUG BIT(1)
+#else
+#define PASS_FSCK_DEBUG 0
+#endif
+
/*
* Passes may be reordered, but the second field is a persistent identifier and
* must never change:
*/
-#define BCH_RECOVERY_PASSES() \
- x(recovery_pass_empty, 41, PASS_SILENT) \
- x(scan_for_btree_nodes, 37, 0) \
- x(check_topology, 4, 0) \
- x(accounting_read, 39, PASS_ALWAYS) \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, PASS_ALWAYS) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_allocations, 5, PASS_FSCK) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(reconstruct_snapshots, 38, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0) \
+#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
+ x(accounting_read, 39, PASS_ALWAYS) \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, PASS_ALWAYS) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_allocations, 5, PASS_FSCK) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0)
/* We normally enumerate recovery passes in the order we run them: */
enum bch_recovery_pass {
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index f457925fa362..93ba4f4e47ca 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -15,6 +15,17 @@
#include <linux/sched/signal.h>
+static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_reflink_v:
+ case KEY_TYPE_indirect_inline_data:
+ return true;
+ default:
+ return false;
+ }
+}
+
static inline unsigned bkey_type_to_indirect(const struct bkey *k)
{
switch (k->type) {
@@ -30,15 +41,15 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
int ret = 0;
- bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad),
+ bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
c, reflink_p_front_pad_bad,
"idx < front_pad (%llu < %u)",
- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+ REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
fsck_err:
return ret;
}
@@ -49,7 +60,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
prt_printf(out, "idx %llu front_pad %u back_pad %u",
- le64_to_cpu(p.v->idx),
+ REFLINK_P_IDX(p.v),
le32_to_cpu(p.v->front_pad),
le32_to_cpu(p.v->back_pad));
}
@@ -65,49 +76,250 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
*/
return false;
- if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
+ return false;
+
+ if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
return false;
bch2_key_resize(l.k, l.k->size + r.k->size);
return true;
}
+/* indirect extents */
+
+int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
+ c, reflink_v_pos_bad,
+ "indirect extent above maximum position 0:%llu",
+ REFLINK_P_IDX_MAX);
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+#if 0
+Currently disabled, needs to be debugged:
+
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+#endif
+
+/* indirect inline data */
+
+int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ return 0;
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
+ le64_to_cpu(d.v->refcount), datalen,
+ min(datalen, 32U), d.v->data);
+}
+
+/* lookup */
+
+static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
+ bool should_commit)
+{
+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ SET_REFLINK_P_ERROR(&new->v, false);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ return ret;
+
+ if (!should_commit)
+ return 0;
+
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+}
+
+static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 missing_start, u64 missing_end,
+ bool should_commit)
+{
+ if (REFLINK_P_ERROR(p.v))
+ return -BCH_ERR_missing_indirect_extent;
+
+ struct bch_fs *c = trans->c;
+ u64 live_start = REFLINK_P_IDX(p.v);
+ u64 live_end = REFLINK_P_IDX(p.v) + p.k->size;
+ u64 refd_start = live_start - le32_to_cpu(p.v->front_pad);
+ u64 refd_end = live_end + le32_to_cpu(p.v->back_pad);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(missing_start < refd_start);
+ BUG_ON(missing_end > refd_end);
+
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ missing_start, missing_end)) {
+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ /*
+ * Is the missing range not actually needed?
+ *
+ * p.v->idx refers to the data that we actually want, but if the
+ * indirect extent we point to was bigger, front_pad and back_pad
+ * indicate the range we took a reference on.
+ */
+
+ if (missing_end <= live_start) {
+ new->v.front_pad = cpu_to_le32(live_start - missing_end);
+ } else if (missing_start >= live_end) {
+ new->v.back_pad = cpu_to_le32(missing_start - live_end);
+ } else {
+ struct bpos new_start = bkey_start_pos(&new->k);
+ struct bpos new_end = new->k.p;
+
+ if (missing_start > live_start)
+ new_start.offset += missing_start - live_start;
+ if (missing_end < live_end)
+ new_end.offset -= live_end - missing_end;
+
+ bch2_cut_front(new_start, &new->k_i);
+ bch2_cut_back(new_end, &new->k_i);
+
+ SET_REFLINK_P_ERROR(&new->v, true);
+ }
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ goto err;
+
+ if (should_commit)
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * This is used from the read path, which doesn't expect to have to do a
+ * transaction commit, and from triggers, which should not be doing a commit:
+ */
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ s64 *offset_into_extent,
+ struct bkey_s_c_reflink_p p,
+ bool should_commit,
+ unsigned iter_flags)
+{
+ BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
+ BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
+
+ u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
+
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
+ POS(0, reflink_offset), iter_flags);
+ if (bkey_err(k))
+ return k;
+
+ if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
+ bch2_trans_iter_exit(trans, iter);
+
+ unsigned size = min((u64) k.k->size,
+ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
+ reflink_offset);
+ bch2_key_resize(&iter->k, size);
+
+ int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
+ k.k->p.offset, should_commit);
+ if (ret)
+ return bkey_s_c_err(ret);
+ } else if (unlikely(REFLINK_P_ERROR(p.v))) {
+ bch2_trans_iter_exit(trans, iter);
+
+ int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
+ if (ret)
+ return bkey_s_c_err(ret);
+ }
+
+ *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
+ return k;
+}
+
+/* reflink pointer trigger */
+
static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
struct bkey_s_c_reflink_p p, u64 *idx,
enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i *k;
- __le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
struct printbuf buf = PRINTBUF;
- int ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter,
- BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_with_updates);
- ret = PTR_ERR_OR_ZERO(k);
+ s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
if (ret)
- goto err;
+ return ret;
- refcount = bkey_refcount(bkey_i_to_s(k));
- if (!refcount) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
+ if (bkey_deleted(k.k)) {
+ if (!(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_missing_indirect_extent;
+ goto next;
}
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ log_fsck_err(trans, reflink_refcount_underflow,
+ "indirect extent refcount underflow while marking\n %s",
+ buf.buf);
+ goto next;
}
if (flags & BTREE_TRIGGER_insert) {
@@ -115,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
u64 pad;
pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
BUG_ON(pad > U32_MAX);
v->front_pad = cpu_to_le32(pad);
pad = max_t(s64, le32_to_cpu(v->back_pad),
- k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
BUG_ON(pad > U32_MAX);
v->back_pad = cpu_to_le32(pad);
}
- le64_add_cpu(refcount, add);
+ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, k, 0);
+ ret = bch2_trans_update(trans, &iter, new, 0);
if (ret)
goto err;
-
- *idx = k->k.p.offset;
+next:
+ *idx = k.k->p.offset;
err:
+fsck_err:
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ret;
@@ -147,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct reflink_gc *r;
int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
- u64 start = le64_to_cpu(p.v->idx);
- u64 end = le64_to_cpu(p.v->idx) + p.k->size;
- u64 next_idx = end + le32_to_cpu(p.v->back_pad);
+ u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
s64 ret = 0;
struct printbuf buf = PRINTBUF;
@@ -168,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
*idx = r->offset;
return 0;
not_found:
- BUG_ON(!(flags & BTREE_TRIGGER_check_repair));
-
- if (fsck_err(trans, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- *idx, next_idx)) {
- struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c);
- ret = PTR_ERR_OR_ZERO(update);
+ if (flags & BTREE_TRIGGER_check_repair) {
+ ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
if (ret)
goto err;
-
- if (next_idx <= start) {
- bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx);
- } else if (*idx >= end) {
- bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end);
- } else {
- bkey_error_init(update);
- update->k.p = p.k->p;
- update->k.size = p.k->size;
- set_bkey_val_u64s(&update->k, 0);
- }
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun);
}
*idx = next_idx;
err:
-fsck_err:
printbuf_exit(&buf);
return ret;
}
@@ -210,8 +399,8 @@ static int __trigger_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
int ret = 0;
- u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
- u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
+ u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
+ u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
if (flags & BTREE_TRIGGER_transactional) {
while (idx < end && !ret)
@@ -253,35 +442,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
}
-/* indirect extents */
-
-int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
-{
- return bch2_bkey_ptrs_validate(c, k, flags);
-}
-
-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
-
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-#if 0
-Currently disabled, needs to be debugged:
-
-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
-
- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
-}
-#endif
+/* indirect extent trigger */
static inline void
check_indirect_extent_deleting(struct bkey_s new,
@@ -307,25 +468,6 @@ int bch2_trigger_reflink_v(struct btree_trans *trans,
return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
}
-/* indirect inline data */
-
-int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
-{
- return 0;
-}
-
-void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "refcount %llu datalen %u: %*phN",
- le64_to_cpu(d.v->refcount), datalen,
- min(datalen, 32U), d.v->data);
-}
-
int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
@@ -336,9 +478,12 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
return 0;
}
+/* create */
+
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
- struct bkey_i *orig)
+ struct bkey_i *orig,
+ bool reflink_p_may_update_opts_field)
{
struct bch_fs *c = trans->c;
struct btree_iter reflink_iter = { NULL };
@@ -358,6 +503,14 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
if (ret)
goto err;
+ /*
+ * XXX: we're assuming that 56 bits will be enough for the life of the
+ * filesystem: we need to implement wraparound, with a cursor in the
+ * logged ops btree:
+ */
+ if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
+ return -ENOSPC;
+
r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
ret = PTR_ERR_OR_ZERO(r_v);
if (ret)
@@ -394,7 +547,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
memset(&r_p->v, 0, sizeof(r_p->v));
#endif
- r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+ SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
+
+ if (reflink_p_may_update_opts_field)
+ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
BTREE_UPDATE_internal_snapshot_node);
@@ -409,7 +565,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
struct bkey_s_c k;
int ret;
- for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
if (bkey_extent_is_unwritten(k))
continue;
@@ -426,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c,
subvol_inum dst_inum, u64 dst_offset,
subvol_inum src_inum, u64 src_offset,
u64 remap_sectors,
- u64 new_i_size, s64 *i_sectors_delta)
+ u64 new_i_size, s64 *i_sectors_delta,
+ bool may_change_src_io_path_opts)
{
struct btree_trans *trans;
struct btree_iter dst_iter, src_iter;
@@ -439,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos src_want;
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
+ bool reflink_p_may_update_opts_field =
+ bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
@@ -520,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c,
src_k = bkey_i_to_s_c(new_src.k);
ret = bch2_make_extent_indirect(trans, &src_iter,
- new_src.k);
+ new_src.k,
+ reflink_p_may_update_opts_field);
if (ret)
continue;
@@ -533,11 +693,15 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bkey_i_reflink_p *dst_p =
bkey_reflink_p_init(new_dst.k);
- u64 offset = le64_to_cpu(src_p.v->idx) +
+ u64 offset = REFLINK_P_IDX(src_p.v) +
(src_want.offset -
bkey_start_offset(src_k.k));
- dst_p->v.idx = cpu_to_le64(offset);
+ SET_REFLINK_P_IDX(&dst_p->v, offset);
+
+ if (reflink_p_may_update_opts_field &&
+ may_change_src_io_path_opts)
+ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
} else {
BUG();
}
@@ -547,7 +711,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
@@ -591,3 +755,97 @@ s64 bch2_remap_range(struct bch_fs *c,
return dst_done ?: ret ?: ret2;
}
+
+/* fsck */
+
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
+{
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (!refcount)
+ return 0;
+
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
+
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
+ trans, reflink_v_refcount_wrong,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+ ret = bch2_trans_update(trans, iter, new, 0);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_gc_reflink_done(struct bch_fs *c)
+{
+ size_t idx = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
+ c->reflink_gc_nr = 0;
+ return ret;
+}
+
+int bch2_gc_reflink_start(struct bch_fs *c)
+{
+ c->reflink_gc_nr = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
+ c->reflink_gc_nr++, GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ 0;
+ })));
+
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 51afe11d8ed6..1632780bdf18 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -2,9 +2,8 @@
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
-enum bch_validate_flags;
-
-int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
@@ -19,7 +18,8 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
.min_val_size = 16, \
})
-int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
@@ -34,7 +34,7 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
})
int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
int bch2_trigger_indirect_inline_data(struct btree_trans *,
@@ -73,7 +73,15 @@ static inline __le64 *bkey_refcount(struct bkey_s k)
}
}
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
+ s64 *, struct bkey_s_c_reflink_p,
+ bool, unsigned);
+
s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
- subvol_inum, u64, u64, u64, s64 *);
+ subvol_inum, u64, u64, u64, s64 *,
+ bool);
+
+int bch2_gc_reflink_done(struct bch_fs *);
+int bch2_gc_reflink_start(struct bch_fs *);
#endif /* _BCACHEFS_REFLINK_H */
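On the read path, bch2_lookup_indirect_extent() translates an offset that is relative to the reflink pointer into an offset that is relative to the indirect extent it finds: on entry *offset_into_extent is measured from REFLINK_P_IDX(p.v), and on return it is measured from the start of the reflink btree key that was looked up. A minimal arithmetic sketch of that translation, using made-up values rather than anything read from a real filesystem:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Made-up values, not read from a real filesystem: */
	uint64_t reflink_p_idx      = 1000;	/* REFLINK_P_IDX(p.v) */
	int64_t  offset_into_extent = 7;	/* caller's offset, relative to the pointer */
	uint64_t indirect_start     = 996;	/* bkey_start_offset() of the key found */

	/* Position looked up in the reflink btree: */
	uint64_t reflink_offset = reflink_p_idx + offset_into_extent;	/* 1007 */

	/* What the caller gets back: offset within the indirect extent: */
	offset_into_extent = reflink_offset - indirect_start;		/* 11 */

	printf("lookup at %llu, offset within indirect extent %lld\n",
	       (unsigned long long) reflink_offset,
	       (long long) offset_into_extent);
	return 0;
}

The same variable carries both meanings, which is why the function takes offset_into_extent by pointer and rewrites it before returning.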
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
index 6772eebb1fc6..92995e4f898e 100644
--- a/fs/bcachefs/reflink_format.h
+++ b/fs/bcachefs/reflink_format.h
@@ -4,7 +4,7 @@
struct bch_reflink_p {
struct bch_val v;
- __le64 idx;
+ __le64 idx_flags;
/*
* A reflink pointer might point to an indirect extent which is then
* later split (by copygc or rebalance). If we only pointed to part of
@@ -17,6 +17,11 @@ struct bch_reflink_p {
__le32 back_pad;
} __packed __aligned(8);
+LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56);
+LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57);
+LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS,
+ struct bch_reflink_p, idx_flags, 57, 58);
+
struct bch_reflink_v {
struct bch_val v;
__le64 refcount;
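The LE64_BITMASK() declarations above carve the on-disk idx_flags word into a 56-bit index plus two single-bit flags. A minimal standalone sketch of that packing, using a plain uint64_t and ignoring the little-endian conversion the real accessor macros perform (the sketch_* helper names are invented for the example):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_IDX_BITS	56

static uint64_t sketch_reflink_p_idx(uint64_t idx_flags)
{
	return idx_flags & ((1ULL << SKETCH_IDX_BITS) - 1);	/* bits 0..55 */
}

static bool sketch_reflink_p_error(uint64_t idx_flags)
{
	return (idx_flags >> 56) & 1;				/* bit 56 */
}

static bool sketch_reflink_p_may_update_options(uint64_t idx_flags)
{
	return (idx_flags >> 57) & 1;				/* bit 57 */
}

int main(void)
{
	uint64_t idx_flags = 12345 | (1ULL << 57);	/* index 12345, options flag set */

	printf("idx=%llu error=%d may_update_options=%d\n",
	       (unsigned long long) sketch_reflink_p_idx(idx_flags),
	       sketch_reflink_p_error(idx_flags),
	       sketch_reflink_p_may_update_options(idx_flags));
	return 0;
}

Bits 0-55 hold the index, bit 56 the error flag, bit 57 the may-update-options flag; the setters used elsewhere in the patch (SET_REFLINK_P_IDX() and friends) write the same ranges.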
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 005275281804..59c8770e4a0e 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -23,6 +23,10 @@
int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
int write)
{
+ struct bkey_validate_context from = {
+ .flags = write,
+ .from = BKEY_VALIDATE_superblock,
+ };
struct jset_entry *entry;
int ret;
@@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle
ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
- write);
+ from);
if (ret)
return ret;
}
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 62ea478215d0..fdcf598f08b1 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -2,86 +2,91 @@
#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76) \
- x(write_buffer_flush_slowpath, 77) \
- x(write_buffer_flush_sync, 78)
+enum counters_flags {
+ TYPE_COUNTER = BIT(0), /* event counters */
+ TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */
+};
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0, TYPE_SECTORS) \
+ x(io_write, 1, TYPE_SECTORS) \
+ x(io_move, 2, TYPE_SECTORS) \
+ x(bucket_invalidate, 3, TYPE_COUNTER) \
+ x(bucket_discard, 4, TYPE_COUNTER) \
+ x(bucket_alloc, 5, TYPE_COUNTER) \
+ x(bucket_alloc_fail, 6, TYPE_COUNTER) \
+ x(btree_cache_scan, 7, TYPE_COUNTER) \
+ x(btree_cache_reap, 8, TYPE_COUNTER) \
+ x(btree_cache_cannibalize, 9, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \
+ x(btree_node_write, 13, TYPE_COUNTER) \
+ x(btree_node_read, 14, TYPE_COUNTER) \
+ x(btree_node_compact, 15, TYPE_COUNTER) \
+ x(btree_node_merge, 16, TYPE_COUNTER) \
+ x(btree_node_split, 17, TYPE_COUNTER) \
+ x(btree_node_rewrite, 18, TYPE_COUNTER) \
+ x(btree_node_alloc, 19, TYPE_COUNTER) \
+ x(btree_node_free, 20, TYPE_COUNTER) \
+ x(btree_node_set_root, 21, TYPE_COUNTER) \
+ x(btree_path_relock_fail, 22, TYPE_COUNTER) \
+ x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \
+ x(btree_reserve_get_fail, 24, TYPE_COUNTER) \
+ x(journal_entry_full, 25, TYPE_COUNTER) \
+ x(journal_full, 26, TYPE_COUNTER) \
+ x(journal_reclaim_finish, 27, TYPE_COUNTER) \
+ x(journal_reclaim_start, 28, TYPE_COUNTER) \
+ x(journal_write, 29, TYPE_COUNTER) \
+ x(read_promote, 30, TYPE_COUNTER) \
+ x(read_bounce, 31, TYPE_COUNTER) \
+ x(read_split, 33, TYPE_COUNTER) \
+ x(read_retry, 32, TYPE_COUNTER) \
+ x(read_reuse_race, 34, TYPE_COUNTER) \
+ x(move_extent_read, 35, TYPE_SECTORS) \
+ x(move_extent_write, 36, TYPE_SECTORS) \
+ x(move_extent_finish, 37, TYPE_SECTORS) \
+ x(move_extent_fail, 38, TYPE_COUNTER) \
+ x(move_extent_start_fail, 39, TYPE_COUNTER) \
+ x(copygc, 40, TYPE_COUNTER) \
+ x(copygc_wait, 41, TYPE_COUNTER) \
+ x(gc_gens_end, 42, TYPE_COUNTER) \
+ x(gc_gens_start, 43, TYPE_COUNTER) \
+ x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \
+ x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \
+ x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \
+ x(trans_restart_fault_inject, 47, TYPE_COUNTER) \
+ x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \
+ x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \
+ x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \
+ x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \
+ x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \
+ x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \
+ x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \
+ x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \
+ x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \
+ x(trans_restart_relock, 57, TYPE_COUNTER) \
+ x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \
+ x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \
+ x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \
+ x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \
+ x(trans_restart_relock_path, 62, TYPE_COUNTER) \
+ x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \
+ x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \
+ x(trans_restart_traverse, 65, TYPE_COUNTER) \
+ x(trans_restart_upgrade, 66, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \
+ x(trans_restart_injected, 69, TYPE_COUNTER) \
+ x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \
+ x(trans_traverse_all, 71, TYPE_COUNTER) \
+ x(transaction_commit, 72, TYPE_COUNTER) \
+ x(write_super, 73, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \
+ x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
+ x(trans_restart_split_race, 76, TYPE_COUNTER) \
+ x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
+ x(write_buffer_flush_sync, 78, TYPE_COUNTER)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
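The counter list above is an X-macro: each x(name, id, flags) entry is expanded once to generate the enum (as in the #define x(t, n, ...) BCH_COUNTER_##t line) and can be expanded again to generate parallel tables keyed by the same enum. A minimal sketch of the pattern with a trimmed-down, illustrative counter list (the SKETCH_* names are invented for the example):

#include <stdio.h>

enum sketch_counter_flags {
	TYPE_COUNTER = 1 << 0,	/* event counter */
	TYPE_SECTORS = 1 << 1,	/* amount counter, in sectors */
};

/* Trimmed-down, illustrative counter list in the same x(name, id, flags) shape: */
#define SKETCH_COUNTERS()			\
	x(io_read,	0, TYPE_SECTORS)	\
	x(io_write,	1, TYPE_SECTORS)	\
	x(bucket_alloc,	5, TYPE_COUNTER)

/* First expansion: the enum of counters */
enum sketch_counters {
#define x(t, n, ...) SKETCH_COUNTER_##t,
	SKETCH_COUNTERS()
#undef x
	SKETCH_COUNTER_NR,
};

/* Second expansion: a parallel flags table indexed by the enum */
static const unsigned sketch_counter_flags[] = {
#define x(t, n, flags) [SKETCH_COUNTER_##t] = flags,
	SKETCH_COUNTERS()
#undef x
};

int main(void)
{
	printf("bucket_alloc counts %s\n",
	       sketch_counter_flags[SKETCH_COUNTER_bucket_alloc] & TYPE_SECTORS
	       ? "sectors" : "events");
	return 0;
}

Keeping the list in one place means adding a counter, its numeric id, and its flags is a single-line change that every expansion picks up automatically.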
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 8767c33c2b51..051214fdc735 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -81,7 +81,16 @@
BCH_FSCK_ERR_accounting_mismatch) \
x(inode_has_child_snapshots, \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
+ BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
+ x(backpointer_bucket_gen, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(disk_accounting_big_endian, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
@@ -117,7 +126,19 @@
BCH_FSCK_ERR_bkey_version_in_future) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch)
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(backpointer_bucket_gen, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \
+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(disk_accounting_big_endian, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end)
struct upgrade_downgrade_entry {
u64 recovery_passes;
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 9feb6739f77a..80b6d589808b 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -5,9 +5,8 @@
enum bch_fsck_flags {
FSCK_CAN_FIX = 1 << 0,
FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NEED_FSCK = 1 << 2,
- FSCK_NO_RATELIMIT = 1 << 3,
- FSCK_AUTOFIX = 1 << 4,
+ FSCK_NO_RATELIMIT = 1 << 2,
+ FSCK_AUTOFIX = 1 << 3,
};
#define BCH_SB_ERRS() \
@@ -59,7 +58,7 @@ enum bch_fsck_flags {
x(bset_empty, 45, 0) \
x(bset_bad_seq, 46, 0) \
x(bset_blacklisted_journal_seq, 47, 0) \
- x(first_bset_blacklisted_journal_seq, 48, 0) \
+ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \
x(btree_node_bad_btree, 49, 0) \
x(btree_node_bad_level, 50, 0) \
x(btree_node_bad_min_key, 51, 0) \
@@ -68,17 +67,17 @@ enum bch_fsck_flags {
x(btree_node_bkey_past_bset_end, 54, 0) \
x(btree_node_bkey_bad_format, 55, 0) \
x(btree_node_bad_bkey, 56, 0) \
- x(btree_node_bkey_out_of_order, 57, 0) \
- x(btree_root_bkey_invalid, 58, 0) \
- x(btree_root_read_error, 59, 0) \
+ x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \
+ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \
+ x(btree_root_read_error, 59, FSCK_AUTOFIX) \
x(btree_root_bad_min_key, 60, 0) \
x(btree_root_bad_max_key, 61, 0) \
- x(btree_node_read_error, 62, 0) \
- x(btree_node_topology_bad_min_key, 63, 0) \
- x(btree_node_topology_bad_max_key, 64, 0) \
- x(btree_node_topology_overwritten_by_prev_node, 65, 0) \
- x(btree_node_topology_overwritten_by_next_node, 66, 0) \
- x(btree_node_topology_interior_node_empty, 67, 0) \
+ x(btree_node_read_error, 62, FSCK_AUTOFIX) \
+ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
+ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
+ x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \
+ x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \
+ x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \
x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
@@ -123,11 +122,12 @@ enum bch_fsck_flags {
x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
+ x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \
x(bucket_sector_count_overflow, 112, 0) \
x(bucket_metadata_type_mismatch, 113, 0) \
- x(need_discard_key_wrong, 114, 0) \
- x(freespace_key_wrong, 115, 0) \
- x(freespace_hole_missing, 116, 0) \
+ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \
+ x(freespace_key_wrong, 115, FSCK_AUTOFIX) \
+ x(freespace_hole_missing, 116, FSCK_AUTOFIX) \
x(bucket_gens_val_size_bad, 117, 0) \
x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
@@ -139,9 +139,10 @@ enum bch_fsck_flags {
x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
x(backpointer_bucket_offset_wrong, 125, 0) \
x(backpointer_level_bad, 294, 0) \
- x(backpointer_to_missing_device, 126, 0) \
- x(backpointer_to_missing_alloc, 127, 0) \
- x(backpointer_to_missing_ptr, 128, 0) \
+ x(backpointer_dev_bad, 297, 0) \
+ x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \
+ x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \
+ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \
x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
x(lru_entry_bad, 131, FSCK_AUTOFIX) \
@@ -167,14 +168,15 @@ enum bch_fsck_flags {
x(ptr_to_incorrect_stripe, 151, 0) \
x(ptr_gen_newer_than_bucket_gen, 152, 0) \
x(ptr_too_stale, 153, 0) \
- x(stale_dirty_ptr, 154, 0) \
+ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
x(ptr_bucket_data_type_mismatch, 155, 0) \
x(ptr_cached_and_erasure_coded, 156, 0) \
x(ptr_crc_uncompressed_size_too_small, 157, 0) \
+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \
+ x(ptr_crc_uncompressed_size_mismatch, 300, 0) \
x(ptr_crc_csum_type_unknown, 158, 0) \
x(ptr_crc_compression_type_unknown, 159, 0) \
x(ptr_crc_redundant, 160, 0) \
- x(ptr_crc_uncompressed_size_too_big, 161, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
@@ -209,6 +211,7 @@ enum bch_fsck_flags {
x(bkey_in_missing_snapshot, 190, 0) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_alloc_cursor_inode_bad, 301, 0) \
x(inode_unpack_error, 193, 0) \
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
@@ -232,6 +235,7 @@ enum bch_fsck_flags {
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
x(inode_has_child_snapshots_wrong, 287, 0) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \
+ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
@@ -252,6 +256,7 @@ enum bch_fsck_flags {
x(dirent_in_missing_dir_inode, 227, 0) \
x(dirent_in_non_dir_inode, 228, 0) \
x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_overwritten_inode, 302, 0) \
x(dirent_to_missing_subvol, 230, 0) \
x(dirent_to_itself, 231, 0) \
x(quota_type_invalid, 232, 0) \
@@ -288,7 +293,7 @@ enum bch_fsck_flags {
x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
x(snapshot_node_missing, 264, 0) \
x(dup_backpointer_to_bad_csum_extent, 265, 0) \
- x(btree_bitmap_not_marked, 266, 0) \
+ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
x(sb_clean_entry_overrun, 267, 0) \
x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \
@@ -306,7 +311,9 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(MAX, 295, 0)
+ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
+ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
+ x(MAX, 303, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 617d07e53b20..537bf049618f 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long
if (type != SIX_LOCK_write)
six_release(&lock->dep_map, ip);
- else
- lock->seq++;
if (type == SIX_LOCK_intent &&
lock->intent_lock_recurse) {
@@ -625,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long
return;
}
+ if (type == SIX_LOCK_write &&
+ lock->write_lock_recurse) {
+ --lock->write_lock_recurse;
+ return;
+ }
+
+ if (type == SIX_LOCK_write)
+ lock->seq++;
+
do_six_unlock_type(lock, type);
}
EXPORT_SYMBOL_GPL(six_unlock_ip);
@@ -735,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
atomic_add(l[type].lock_val, &lock->state);
}
break;
+ case SIX_LOCK_write:
+ lock->write_lock_recurse++;
+ fallthrough;
case SIX_LOCK_intent:
EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
lock->intent_lock_recurse++;
break;
- case SIX_LOCK_write:
- BUG();
- break;
}
}
EXPORT_SYMBOL_GPL(six_lock_increment);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
index 68d46fd7f391..c142e06b7a3a 100644
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
@@ -137,6 +137,7 @@ struct six_lock {
atomic_t state;
u32 seq;
unsigned intent_lock_recurse;
+ unsigned write_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
raw_spinlock_t wait_lock;
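The six.c and six.h hunks above make SIX_LOCK_write recursive: re-taking the write lock through six_lock_increment() just bumps the new write_lock_recurse counter, and six_unlock_ip() only performs the real unlock (and advances seq) once that counter has drained back to zero. A simplified sketch of that counting discipline, with invented names and six's read/intent/write states collapsed into a single held flag:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: a lock its current holder may re-acquire. */
struct sketch_recursive_lock {
	bool		held;
	unsigned	recurse;	/* acquisitions beyond the first */
	unsigned	seq;		/* bumped once per real acquire/release cycle */
};

static void sketch_lock(struct sketch_recursive_lock *l)
{
	if (l->held) {
		l->recurse++;		/* like six_lock_increment(SIX_LOCK_write) */
		return;
	}
	l->held = true;
}

static void sketch_unlock(struct sketch_recursive_lock *l)
{
	assert(l->held);
	if (l->recurse) {
		--l->recurse;		/* six_unlock_ip()'s early return */
		return;
	}
	l->seq++;			/* seq only advances on the outermost unlock */
	l->held = false;
}

int main(void)
{
	struct sketch_recursive_lock l = { 0 };

	sketch_lock(&l);
	sketch_lock(&l);		/* nested: counted, not re-acquired */
	sketch_unlock(&l);		/* still held */
	sketch_unlock(&l);		/* real unlock, seq becomes 1 */

	printf("held=%d seq=%u\n", l.held, l.seq);
	return 0;
}

Advancing seq only on the outermost unlock matches the patch moving lock->seq++ below the new recursion check.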
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ae57638506c3..c54091a28909 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
@@ -32,7 +33,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
int ret = 0;
@@ -225,7 +226,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_snapshot s;
u32 i, id;
@@ -279,23 +280,6 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
return ret;
}
-static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
-{
- struct snapshot_t *t = snapshot_t_mut(c, id);
- u32 parent = id;
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-}
-
-static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
-{
- mutex_lock(&c->snapshot_table_lock);
- __set_is_ancestor_bitmap(c, id);
- mutex_unlock(&c->snapshot_table_lock);
-}
-
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@@ -317,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+ t->live = true;
t->parent = le32_to_cpu(s.v->parent);
t->children[0] = le32_to_cpu(s.v->children[0]);
t->children[1] = le32_to_cpu(s.v->children[1]);
@@ -335,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
t->skip[2] = 0;
}
- __set_is_ancestor_bitmap(c, id);
+ u32 parent = id;
+
+ while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+ parent - id - 1 < IS_ANCESTOR_BITMAP)
+ __set_bit(parent - id - 1, t->is_ancestor);
if (BCH_SNAPSHOT_DELETED(s.v)) {
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
@@ -365,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
BTREE_ITER_with_updates, snapshot, s);
}
-static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
-{
- struct bch_snapshot v;
- int ret;
-
- if (!id)
- return 0;
-
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot node %u not found", id);
- if (ret)
- return ret;
-
- return !BCH_SNAPSHOT_DELETED(&v);
-}
-
-/*
- * If @k is a snapshot with just one live child, it's part of a linear chain,
- * which we consider to be an equivalence class: and then after snapshot
- * deletion cleanup, there should only be a single key at a given position in
- * this equivalence class.
- *
- * This sets the equivalence class of @k to be the child's equivalence class, if
- * it's part of such a linear chain: this correctly sets equivalence classes on
- * startup if we run leaf to root (i.e. in natural key order).
- */
-static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- unsigned i, nr_live = 0, live_idx = 0;
- struct bkey_s_c_snapshot snap;
- u32 id = k.k->p.offset, child[2];
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- snap = bkey_s_c_to_snapshot(k);
-
- child[0] = le32_to_cpu(snap.v->children[0]);
- child[1] = le32_to_cpu(snap.v->children[1]);
-
- for (i = 0; i < 2; i++) {
- int ret = bch2_snapshot_live(trans, child[i]);
-
- if (ret < 0)
- return ret;
-
- if (ret)
- live_idx = i;
- nr_live += ret;
- }
-
- mutex_lock(&c->snapshot_table_lock);
-
- snapshot_t_mut(c, id)->equiv = nr_live == 1
- ? snapshot_t_mut(c, child[live_idx])->equiv
- : id;
-
- mutex_unlock(&c->snapshot_table_lock);
-
- return 0;
-}
-
/* fsck: */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
@@ -506,7 +431,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
break;
}
}
-
bch2_trans_iter_exit(trans, &iter);
if (!ret && !found) {
@@ -536,6 +460,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
struct bch_snapshot s;
struct bch_subvolume subvol;
struct printbuf buf = PRINTBUF;
+ struct btree_iter snapshot_iter = {};
u32 root_id;
int ret;
@@ -545,22 +470,35 @@ static int check_snapshot_tree(struct btree_trans *trans,
st = bkey_s_c_to_snapshot_tree(k);
root_id = le32_to_cpu(st.v->root_snapshot);
- ret = bch2_snapshot_lookup(trans, root_id, &s);
+ struct bkey_s_c_snapshot snapshot_k =
+ bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
+ POS(0, root_id), 0, snapshot);
+ ret = bkey_err(snapshot_k);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
+ if (!ret)
+ bkey_val_copy(&s, snapshot_k);
+
if (fsck_err_on(ret ||
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
trans, snapshot_tree_to_missing_snapshot,
"snapshot tree points to missing/incorrect snapshot:\n %s",
- (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ (bch2_bkey_val_to_text(&buf, c, st.s_c),
+ prt_newline(&buf),
+ ret
+ ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
+ : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
+ buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
goto err;
}
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
- false, 0, &subvol);
+ if (!st.v->master_subvol)
+ goto out;
+
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -603,8 +541,10 @@ static int check_snapshot_tree(struct btree_trans *trans,
u->v.master_subvol = cpu_to_le32(subvol_id);
st = snapshot_tree_i_to_s_c(u);
}
+out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &snapshot_iter);
printbuf_exit(&buf);
return ret;
}
@@ -799,7 +739,7 @@ static int check_snapshot(struct btree_trans *trans,
if (should_have_subvol) {
id = le32_to_cpu(s.subvol);
- ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ ret = bch2_subvolume_get(trans, id, false, &subvol);
if (bch2_err_matches(ret, ENOENT))
bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -902,7 +842,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- if (bch2_snapshot_equiv(c, id))
+ if (bch2_snapshot_exists(c, id))
return 0;
/* Do we need to reconstruct the snapshot_tree entry as well? */
@@ -951,8 +891,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
- bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}
/* Figure out which snapshot nodes belong in the same tree: */
@@ -1050,7 +989,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
snapshot_id_list_to_text(&buf, t);
darray_for_each(*t, id) {
- if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ if (fsck_err_on(!bch2_snapshot_exists(c, *id),
trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
@@ -1083,10 +1022,12 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
+ if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
trans, bkey_in_missing_snapshot,
"key in missing snapshot %s, delete?",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
+ prt_char(&buf, ' '),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
@@ -1100,13 +1041,11 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans,
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
struct btree_iter iter;
- struct bkey_i_snapshot *s;
- int ret = 0;
-
- s = bch2_bkey_get_mut_typed(trans, &iter,
+ struct bkey_i_snapshot *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_snapshots, POS(0, id),
0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
+ int ret = PTR_ERR_OR_ZERO(s);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
trans->c, "missing snapshot %u", id);
@@ -1294,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
goto err;
new_snapids[i] = iter.pos.offset;
-
- mutex_lock(&c->snapshot_table_lock);
- snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
- mutex_unlock(&c->snapshot_table_lock);
}
err:
bch2_trans_iter_exit(trans, &iter);
@@ -1403,129 +1338,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-static int delete_dead_snapshots_process_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *deleted,
- snapshot_id_list *equiv_seen,
- struct bpos *last_pos)
+struct snapshot_interior_delete {
+ u32 id;
+ u32 live_child;
+};
+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
+
+static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
- int ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ darray_for_each(*l, i)
+ if (i->id == id)
+ return i->live_child;
+ return 0;
+}
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
- if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
+static unsigned __live_child(struct snapshot_table *t, u32 id,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
+{
+ struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
return 0;
- if (!bkey_eq(k.k->p, *last_pos))
- equiv_seen->nr = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
+ if (s->children[i] &&
+ !snapshot_list_has_id(delete_leaves, s->children[i]) &&
+ !interior_delete_has_id(delete_interior, s->children[i]))
+ return s->children[i];
- if (snapshot_list_has_id(deleted, k.k->p.snapshot))
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
+ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
+ u32 live_child = s->children[i]
+ ? __live_child(t, s->children[i], delete_leaves, delete_interior)
+ : 0;
+ if (live_child)
+ return live_child;
+ }
- if (!bpos_eq(*last_pos, k.k->p) &&
- snapshot_list_has_id(equiv_seen, equiv))
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
+ return 0;
+}
- *last_pos = k.k->p;
+static unsigned live_child(struct bch_fs *c, u32 id,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
+{
+ rcu_read_lock();
+ u32 ret = __live_child(rcu_dereference(c->snapshots), id,
+ delete_leaves, delete_interior);
+ rcu_read_unlock();
+ return ret;
+}
- ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
- if (ret)
- return ret;
+static int delete_dead_snapshots_process_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
+{
+ if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
- /*
- * When we have a linear chain of snapshot nodes, we consider
- * those to form an equivalence class: we're going to collapse
- * them all down to a single node, and keep the leaf-most node -
- * which has the same id as the equivalence class id.
- *
- * If there are multiple keys in different snapshots at the same
- * position, we're only going to keep the one in the newest
- * snapshot (we delete the others above) - the rest have been
- * overwritten and are redundant, and for the key we're going to keep we
- * need to move it to the equivalance class ID if it's not there
- * already.
- */
- if (equiv != k.k->p.snapshot) {
+ u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
+ if (live_child) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
- new->k.p.snapshot = equiv;
-
- struct btree_iter new_iter;
- bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_cached|
- BTREE_ITER_intent);
+ new->k.p.snapshot = live_child;
- ret = bch2_btree_iter_traverse(&new_iter) ?:
- bch2_trans_update(trans, &new_iter, new,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &new_iter);
+ struct btree_iter dst_iter;
+ struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
+ iter->btree_id, new->k.p,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_intent);
+ ret = bkey_err(dst_k);
if (ret)
return ret;
+
+ ret = (bkey_deleted(dst_k.k)
+ ? bch2_trans_update(trans, &dst_iter, new,
+ BTREE_UPDATE_internal_snapshot_node)
+ : 0) ?:
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
}
return 0;
}
-static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it, and
+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it
+ * as deleted.
+ */
+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
+ snapshot_id_list *delete_leaves,
+ interior_delete_list *delete_interior)
{
- struct bkey_s_c_snapshot snap;
- u32 children[2];
- int ret;
-
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
- BCH_SNAPSHOT_SUBVOL(snap.v))
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+ unsigned live_children = 0;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
- children[0] = le32_to_cpu(snap.v->children[0]);
- children[1] = le32_to_cpu(snap.v->children[1]);
+ for (unsigned i = 0; i < 2; i++) {
+ u32 child = le32_to_cpu(s.v->children[i]);
- ret = bch2_snapshot_live(trans, children[0]) ?:
- bch2_snapshot_live(trans, children[1]);
- if (ret < 0)
- return ret;
- return !ret;
-}
+ live_children += child &&
+ !snapshot_list_has_id(delete_leaves, child);
+ }
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
-{
- int ret = bch2_snapshot_needs_delete(trans, k);
+ if (live_children == 0) {
+ return snapshot_list_add(c, delete_leaves, s.k->p.offset);
+ } else if (live_children == 1) {
+ struct snapshot_interior_delete d = {
+ .id = s.k->p.offset,
+ .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
+ };
+
+ if (!d.live_child) {
+ bch_err(c, "error finding live child of snapshot %u", d.id);
+ return -EINVAL;
+ }
- return ret <= 0
- ? ret
- : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+ return darray_push(delete_interior, d);
+ } else {
+ return 0;
+ }
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
- snapshot_id_list *skip)
+ interior_delete_list *skip)
{
rcu_read_lock();
- while (snapshot_list_has_id(skip, id))
+ while (interior_delete_has_id(skip, id))
id = __bch2_snapshot_parent(c, id);
while (n--) {
do {
id = __bch2_snapshot_parent(c, id);
- } while (snapshot_list_has_id(skip, id));
+ } while (interior_delete_has_id(skip, id));
}
rcu_read_unlock();
@@ -1534,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct btree_iter *iter, struct bkey_s_c k,
- snapshot_id_list *deleted)
+ interior_delete_list *deleted)
{
struct bch_fs *c = trans->c;
u32 nr_deleted_ancestors = 0;
@@ -1544,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- if (snapshot_list_has_id(deleted, k.k->p.offset))
+ if (interior_delete_has_id(deleted, k.k->p.offset))
return 0;
s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
@@ -1553,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return ret;
darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
if (!nr_deleted_ancestors)
return 0;
@@ -1571,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
u32 id = le32_to_cpu(s->v.skip[j]);
- if (snapshot_list_has_id(deleted, id)) {
+ if (interior_delete_has_id(deleted, id)) {
id = bch2_snapshot_nth_parent_skip(c,
parent,
depth > 1
@@ -1590,51 +1549,45 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
- struct btree_trans *trans;
- snapshot_id_list deleted = { 0 };
- snapshot_id_list deleted_interior = { 0 };
- int ret = 0;
-
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- trans = bch2_trans_get(c);
+ struct btree_trans *trans = bch2_trans_get(c);
+ snapshot_id_list delete_leaves = {};
+ interior_delete_list delete_interior = {};
+ int ret = 0;
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- NULL, NULL, 0,
- bch2_delete_redundant_snapshot(trans, k));
- bch_err_msg(c, ret, "deleting redundant snapshots");
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
+ check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err;
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_snapshot_set_equiv(trans, k));
- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
- if (ret)
+ if (!delete_leaves.nr && !delete_interior.nr)
goto err;
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ({
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "deleting leaves");
+ darray_for_each(delete_leaves, i)
+ prt_printf(&buf, " %u", *i);
- BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
- ? snapshot_list_add(c, &deleted, k.k->p.offset)
- : 0;
- }));
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err;
+ prt_printf(&buf, " interior");
+ darray_for_each(delete_interior, i)
+ prt_printf(&buf, " %u->%u", i->id, i->live_child);
+
+ ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
+ printbuf_exit(&buf);
+ if (ret)
+ goto err;
+ }
for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- struct bpos last_pos = POS_MIN;
- snapshot_id_list equiv_seen = { 0 };
struct disk_reservation res = { 0 };
if (!btree_type_has_snapshots(btree))
@@ -1644,33 +1597,26 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
- delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
- &equiv_seen, &last_pos));
+ delete_dead_snapshots_process_key(trans, &iter, k,
+ &delete_leaves,
+ &delete_interior));
bch2_disk_reservation_put(c, &res);
- darray_exit(&equiv_seen);
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
}
- bch2_trans_unlock(trans);
- down_write(&c->snapshot_create_lock);
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ({
- u32 snapshot = k.k->p.offset;
- u32 equiv = bch2_snapshot_equiv(c, snapshot);
-
- equiv != snapshot
- ? snapshot_list_add(c, &deleted_interior, snapshot)
- : 0;
- }));
-
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err_create_lock;
+ darray_for_each(delete_leaves, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
+ goto err;
+ }
/*
* Fixing children of deleted snapshots can't be done completely
@@ -1680,32 +1626,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
if (ret)
- goto err_create_lock;
-
- darray_for_each(deleted, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
- if (ret)
- goto err_create_lock;
- }
+ goto err;
- darray_for_each(deleted_interior, i) {
+ darray_for_each(delete_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch2_snapshot_node_delete(trans, i->id));
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting snapshot %u", i->id);
if (ret)
- goto err_create_lock;
+ goto err;
}
-err_create_lock:
- up_write(&c->snapshot_create_lock);
err:
- darray_exit(&deleted_interior);
- darray_exit(&deleted);
+ darray_exit(&delete_interior);
+ darray_exit(&delete_leaves);
bch2_trans_put(trans);
- bch_err_fn(c, ret);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
return ret;
}
@@ -1721,8 +1659,12 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
- !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
+ return;
+
+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
@@ -1735,18 +1677,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, id, pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- while (1) {
- k = bch2_btree_iter_prev(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (!k.k)
- break;
-
+ for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots,
+ k, ret) {
if (!bkey_eq(pos, k.k->p))
break;
@@ -1760,37 +1694,36 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return ret;
}
-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot snap;
- int ret = 0;
+ /* If there's one child, it's redundant and keys will be moved to the child */
+ return !!snap.v->children[0] + !!snap.v->children[1] == 1;
+}
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- snap = bkey_s_c_to_snapshot(k);
+ struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
if (BCH_SNAPSHOT_DELETED(snap.v) ||
- bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
- (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- return 0;
- }
+ interior_snapshot_needs_delete(snap))
+ set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
- return ret;
+ return 0;
}
int bch2_snapshots_read(struct bch_fs *c)
{
+ /*
+ * Initializing the is_ancestor bitmaps requires ancestors to already be
+ * initialized - so mark in reverse:
+ */
int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
+ POS_MAX, 0, k,
__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_snapshot_set_equiv(trans, k) ?:
- bch2_check_snapshot_needs_deletion(trans, k)) ?:
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+ bch2_check_snapshot_needs_deletion(trans, k)));
bch_err_fn(c, ret);
/*
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 29c94716293e..00373cf32e7b 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -2,11 +2,9 @@
#ifndef _BCACHEFS_SNAPSHOT_H
#define _BCACHEFS_SNAPSHOT_H
-enum bch_validate_flags;
-
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags);
+ struct bkey_validate_context);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
.key_validate = bch2_snapshot_tree_validate, \
@@ -19,7 +17,8 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
@@ -120,19 +119,19 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
return id;
}
-static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->equiv : 0;
+ return s ? s->live : 0;
}
-static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = __bch2_snapshot_equiv(c, id);
+ bool ret = __bch2_snapshot_exists(c, id);
rcu_read_unlock();
- return id;
+ return ret;
}
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
new file mode 100644
index 000000000000..f5977c5c6743
--- /dev/null
+++ b/fs/bcachefs/str_hash.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fsck.h"
+#include "str_hash.h"
+#include "subvolume.h"
+
+static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
+{
+ if (d.v->d_type == DT_SUBVOL) {
+ struct bch_subvolume subvol;
+ int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol),
+ false, &subvol);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+ return !ret;
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+ }
+}
+
+static int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
+{
+ struct qstr old_name = bch2_dirent_get_name(old);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_dirent_init(&new->k_i);
+ dirent_copy_target(new, old);
+ new->k.p = old.k->p;
+
+ for (unsigned i = 0; i < 1000; i++) {
+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+
+ if (u64s > U8_MAX)
+ return -EINVAL;
+
+ new->k.u64s = u64s;
+
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ (subvol_inum) { 0, old.k->p.inode },
+ old.k->p.snapshot, &new->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (!bch2_err_matches(ret, EEXIST))
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+}
+
+static int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
+{
+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
+ return 0;
+
+ switch (desc.btree_id) {
+ case BTREE_ID_dirents: {
+ int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 0;
+
+ ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 1;
+ return 2;
+ }
+ default:
+ return 0;
+ }
+}
+
+static int repair_inode_hash_info(struct btree_trans *trans,
+ struct bch_inode_unpacked *snapshot_root)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != snapshot_root->bi_inum)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ break;
+
+ if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root),
+ trans, inode_snapshot_mismatch,
+ "inode hash info in different snapshots don't match")) {
+ inode.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
+ ret = __bch2_fsck_write_inode(trans, &inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ break;
+ }
+ }
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * All versions of the same inode in different snapshots must have the same hash
+ * seed/type: verify that the hash info we're using matches the root
+ */
+static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
+ struct bch_hash_info *hash_info)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found;
+ }
+ bch_err(c, "%s(): inum %llu not found", __func__, inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+found:;
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto err;
+
+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
+ if (memcmp(hash_info, &hash2, sizeof(hash2))) {
+ ret = repair_inode_hash_info(trans, &inode);
+ if (!ret) {
+ bch_err(c, "inode hash info mismatch with root, but mismatch not found");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ }
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int __bch2_str_hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ u64 hash = desc->hash_bkey(hash_info, hash_k);
+ if (hash_k.k->p.offset < hash)
+ goto bad_hash;
+
+ for_each_btree_key_norestart(trans, iter, desc->btree_id,
+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+ BTREE_ITER_slots, k, ret) {
+ if (bkey_eq(k.k->p, hash_k.k->p))
+ break;
+
+ if (k.k->type == desc->key_type &&
+ !desc->cmp_bkey(k, hash_k))
+ goto duplicate_entries;
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto bad_hash;
+ }
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+bad_hash:
+ /*
+ * Before doing any repair, check hash_info itself:
+ */
+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
+ bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
+ (subvol_inum) { 0, hash_k.k->p.inode },
+ hash_k.k->p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+ if (k.k)
+ goto duplicate_entries;
+
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+fsck_err:
+ goto out;
+duplicate_entries:
+ ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
+ break;
+ case 2:
+ ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ goto out;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index ec2b1feea520..55a4ac7bf220 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -160,7 +160,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
BTREE_ITER_slots|flags, k, ret) {
@@ -210,7 +210,7 @@ bch2_hash_hole(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
@@ -265,7 +265,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
bool found = false;
int ret;
- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
@@ -393,4 +393,26 @@ int bch2_hash_delete(struct btree_trans *trans,
return ret;
}
+struct snapshots_seen;
+int __bch2_str_hash_check_key(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc *,
+ struct bch_hash_info *,
+ struct btree_iter *, struct bkey_s_c);
+
+static inline int bch2_str_hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ if (hash_k.k->type != desc->key_type)
+ return 0;
+
+ if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
+ return 0;
+
+ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
+}
+
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 80e5efaff524..e3d0475232e5 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -207,7 +207,7 @@ int bch2_check_subvol_children(struct bch_fs *c)
/* Subvolumes: */
int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
int ret = 0;
@@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
static __always_inline int
bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
bool inconsistent_if_not_found,
- int iter_flags,
struct bch_subvolume *s)
{
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
- iter_flags, subvolume, s);
+ BTREE_ITER_cached|
+ BTREE_ITER_with_updates, subvolume, s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
inconsistent_if_not_found,
trans->c, "missing subvolume %u", subvol);
@@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
bool inconsistent_if_not_found,
- int iter_flags,
struct bch_subvolume *s)
{
- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s);
}
int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
{
struct bch_subvolume s;
- int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s);
+ int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s);
if (ret)
return ret;
@@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
struct bch_snapshot snap;
return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol);
}
int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
@@ -396,8 +395,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
struct bch_subvolume s;
return lockrestart_do(trans,
- bch2_subvolume_get(trans, subvolid_to_delete, true,
- BTREE_ITER_cached, &s)) ?:
+ bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
@@ -411,26 +409,56 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
*/
static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
- struct btree_iter iter;
- struct bkey_s_c_subvolume subvol;
- u32 snapid;
- int ret = 0;
+ struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
- subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ struct bkey_s_c_subvolume subvol =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
BTREE_ID_subvolumes, POS(0, subvolid),
BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
- ret = bkey_err(subvol);
+ int ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
"missing subvolume %u", subvolid);
if (ret)
- return ret;
+ goto err;
- snapid = le32_to_cpu(subvol.v->snapshot);
+ u32 snapid = le32_to_cpu(subvol.v->snapshot);
+
+ struct bkey_s_c_snapshot snapshot =
+ bch2_bkey_get_iter_typed(trans, &snapshot_iter,
+ BTREE_ID_snapshots, POS(0, snapid),
+ 0, snapshot);
+ ret = bkey_err(snapshot);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing snapshot %u", snapid);
+ if (ret)
+ goto err;
+
+ u32 treeid = le32_to_cpu(snapshot.v->tree);
- ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ struct bkey_s_c_snapshot_tree snapshot_tree =
+ bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
+ BTREE_ID_snapshot_trees, POS(0, treeid),
+ 0, snapshot_tree);
+
+ if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
+ struct bkey_i_snapshot_tree *snapshot_tree_mut =
+ bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
+ &snapshot_tree.s_c,
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
+ if (ret)
+ goto err;
+
+ snapshot_tree_mut->v.master_subvol = 0;
+ }
+
+ ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
bch2_snapshot_node_set_deleted(trans, snapid);
- bch2_trans_iter_exit(trans, &iter);
+err:
+ bch2_trans_iter_exit(trans, &snapshot_tree_iter);
+ bch2_trans_iter_exit(trans, &snapshot_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
return ret;
}
@@ -675,7 +703,7 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
/* set bi_subvol on root inode */
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_fs_upgrade_for_subvolumes(trans));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index f897d106e142..910f6196700e 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -5,12 +5,11 @@
#include "darray.h"
#include "subvolume_types.h"
-enum bch_validate_flags;
-
int bch2_check_subvols(struct bch_fs *);
int bch2_check_subvol_children(struct bch_fs *);
-int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
@@ -25,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
- bool, int, struct bch_subvolume *);
+ bool, struct bch_subvolume *);
int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
u32 *, bool);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
@@ -34,7 +33,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end,
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
u32 subvolid, unsigned flags)
{
u32 snapshot;
@@ -43,10 +42,10 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos
return bkey_s_c_err(ret);
bch2_btree_iter_set_snapshot(iter, snapshot);
- return bch2_btree_iter_peek_upto_type(iter, end, flags);
+ return bch2_btree_iter_peek_max_type(iter, end, flags);
}
-#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
_end, _subvolid, _flags, _k, _do) \
({ \
struct bkey_s_c _k; \
@@ -54,7 +53,7 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
_end, _subvolid, (_flags)); \
if (!(_k).k) \
break; \
@@ -67,14 +66,14 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos
_ret3; \
})
-#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \
+#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \
_start, _end, _subvolid, _flags, _k, _do) \
({ \
struct btree_iter _iter; \
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
_end, _subvolid, _flags, _k, _do); \
})
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index f2ec4277c2a5..1549d6daf7af 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -9,13 +9,13 @@ typedef DARRAY(u32) snapshot_id_list;
#define IS_ANCESTOR_BITMAP 128
struct snapshot_t {
+ bool live;
u32 parent;
u32 skip[3];
u32 depth;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 tree;
- u32 equiv;
unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
};
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 7c71594f6a8b..8037ccbacf6a 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -23,6 +23,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/string_choices.h>
static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
};
@@ -41,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = {
#undef x
};
-void bch2_version_to_text(struct printbuf *out, unsigned v)
+void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v)
{
const char *str = "(unknown version)";
@@ -54,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v)
prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
}
-unsigned bch2_latest_compatible_version(unsigned v)
+enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v)
{
if (!BCH_VERSION_MAJOR(v))
return v;
@@ -68,6 +69,16 @@ unsigned bch2_latest_compatible_version(unsigned v)
return v;
}
+void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+{
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -368,6 +379,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_features;
}
+ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
+ BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
+ prt_printf(out, "Filesystem has incompatible version");
+ return -BCH_ERR_invalid_sb_features;
+ }
+
block_size = le16_to_cpu(sb->block_size);
if (block_size > PAGE_SECTORS) {
@@ -406,6 +423,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_time_precision;
}
+ /* old versions didn't know to downgrade this field */
+ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
+
+ if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
+ prt_printf(out, "Invalid version_incompat ");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
+ prt_str(out, " > incompat_allowed ");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
+ if (flags & BCH_VALIDATE_write)
+ return -BCH_ERR_invalid_sb_version;
+ else
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
+ }
+
if (!flags) {
/*
* Been seeing a bug where these are getting inexplicably
@@ -428,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
+#ifdef __KERNEL__
+ if (!BCH_SB_SHARD_INUMS_NBITS(sb))
+ SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
+#endif
+
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;
@@ -519,6 +556,9 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.uuid = src->uuid;
c->sb.user_uuid = src->user_uuid;
c->sb.version = le16_to_cpu(src->version);
+ c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src);
+ c->sb.version_incompat_allowed
+ = BCH_SB_VERSION_INCOMPAT_ALLOWED(src);
c->sb.version_min = le16_to_cpu(src->version_min);
c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
c->sb.nr_devices = src->nr_devices;
@@ -676,7 +716,8 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf
}
enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
- if (csum_type >= BCH_CSUM_NR) {
+ if (csum_type >= BCH_CSUM_NR ||
+ bch2_csum_type_is_encryption(csum_type)) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
@@ -878,7 +919,7 @@ static void write_super_endio(struct bio *bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"superblock %s error: %s",
- bio_data_dir(bio) ? "write" : "read",
+ str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
@@ -891,14 +932,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
struct bch_sb *sb = ca->disk_sb.sb;
struct bio *bio = ca->disk_sb.bio;
+ memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
+
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+ bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
- bio_sectors(bio));
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
percpu_ref_get(&ca->io_ref);
closure_bio_submit(bio, &c->sb_write);
@@ -1042,9 +1084,16 @@ int bch2_write_super(struct bch_fs *c)
": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- bch2_fs_fatal_error(c, "%s", buf.buf);
+
+ if (c->opts.errors != BCH_ON_ERROR_continue &&
+ c->opts.errors != BCH_ON_ERROR_fix_safe) {
+ ret = -BCH_ERR_erofs_sb_err;
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ } else {
+ bch_err(c, "%s", buf.buf);
+ }
+
printbuf_exit(&buf);
- ret = -BCH_ERR_erofs_sb_err;
}
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
@@ -1149,6 +1198,8 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
*/
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current);
if (c->sb.version > bcachefs_metadata_version_current)
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
if (c->sb.version_min > bcachefs_metadata_version_current)
@@ -1157,7 +1208,7 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
return ret;
}
-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
{
lockdep_assert_held(&c->sb_lock);
@@ -1167,6 +1218,10 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
c->disk_sb.sb->version = cpu_to_le16(new_version);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+
+ if (incompat)
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
@@ -1331,6 +1386,14 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
+ prt_printf(out, "Incompatible features allowed:\t");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Incompatible features in use:\t");
+ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
+ prt_newline(out);
+
prt_printf(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index fadd364e2802..f1ab4f943720 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -10,14 +10,29 @@
#include <asm/byteorder.h>
+#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096
+
static inline bool bch2_version_compatible(u16 version)
{
return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
version >= bcachefs_metadata_version_min;
}
-void bch2_version_to_text(struct printbuf *, unsigned);
-unsigned bch2_latest_compatible_version(unsigned);
+void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
+enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
+
+void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+
+static inline bool bch2_request_incompat_feature(struct bch_fs *c,
+ enum bcachefs_metadata_version version)
+{
+ if (unlikely(version > c->sb.version_incompat)) {
+ if (version > c->sb.version_incompat_allowed)
+ return false;
+ bch2_set_version_incompat(c, version);
+ }
+ return true;
+}
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
@@ -92,7 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
}
bool bch2_check_version_downgrade(struct bch_fs *);
-void bch2_sb_upgrade(struct bch_fs *, unsigned);
+void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index a6ed9a0bf1c7..d97ea7bd1171 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -290,7 +290,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_fs_journal_stop(&c->journal);
- bch_info(c, "%sshutdown complete, journal seq %llu",
+ bch_info(c, "%sclean shutdown complete, journal seq %llu",
test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
c->journal.seq_ondisk);
@@ -441,6 +441,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
int ret;
+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+
if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
return -BCH_ERR_erofs_unfixed_errors;
@@ -561,6 +563,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
+ bch2_fs_btree_gc_exit(c);
bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
@@ -584,7 +587,6 @@ static void __bch2_fs_free(struct bch_fs *c)
#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
- kfree(c->unused_inode_hints);
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
@@ -766,21 +768,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
refcount_set(&c->ro_ref, 1);
init_waitqueue_head(&c->ro_ref_wait);
+ spin_lock_init(&c->recovery_pass_lock);
sema_init(&c->online_fsck_mutex, 1);
- init_rwsem(&c->gc_lock);
- mutex_init(&c->gc_gens_lock);
- atomic_set(&c->journal_keys.ref, 1);
- c->journal_keys.initial_ref_held = true;
-
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
- bch2_fs_gc_init(c);
bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_btree_iter_init_early(c);
bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_journal_keys_init(c);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
@@ -809,9 +807,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->vfs_inodes_list);
mutex_init(&c->vfs_inodes_lock);
- c->copy_gc_enabled = 1;
- c->rebalance.enabled = 1;
-
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
@@ -873,8 +868,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
- c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
-
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
@@ -901,9 +894,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
- !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
- sizeof(u64), GFP_KERNEL))) {
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
ret = -BCH_ERR_ENOMEM_fs_other_alloc;
goto err;
}
@@ -917,6 +908,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_btree_gc_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
@@ -1033,9 +1025,12 @@ int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ c->recovery_task = current;
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
+ c->recovery_task = NULL;
+
if (ret)
goto err;
@@ -1120,12 +1115,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_bdevname(&buf, fs->bdev);
prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));;
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
prt_newline(&buf);
prt_bdevname(&buf, sb->bdev);
prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
prt_newline(&buf);
if (!opts->no_splitbrain_check)
@@ -1198,7 +1193,7 @@ static void bch2_dev_free(struct bch_dev *ca)
free_percpu(ca->io_done);
bch2_dev_buckets_free(ca);
- free_page((unsigned long) ca->sb_read_scratch);
+ kfree(ca->sb_read_scratch);
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
@@ -1309,8 +1304,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
- init_rwsem(&ca->bucket_lock);
-
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
@@ -1337,7 +1330,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
+ !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1366,7 +1359,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
struct bch_dev *ca = NULL;
- int ret = 0;
if (bch2_fs_init_fault("dev_alloc"))
goto err;
@@ -1378,10 +1370,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
bch2_dev_attach(c, ca, dev_idx);
- return ret;
+ return 0;
err:
- if (ca)
- bch2_dev_free(ca);
return -BCH_ERR_ENOMEM_dev_alloc;
}
@@ -1751,11 +1741,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- ret = bch2_dev_journal_alloc(ca, true);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err;
-
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1806,11 +1791,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err_late;
- ca->new_fs_bucket_idx = 0;
-
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
+ ret = bch2_dev_journal_alloc(ca, false);
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
+ goto err_late;
+
up_write(&c->state_lock);
return 0;
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index dada09331d2e..fa6d52216510 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -34,16 +34,6 @@ void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
-/*
- * Only for use in the recovery/fsck path:
- */
-static inline void bch2_fs_lazy_rw(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_rw, &c->flags) &&
- !test_bit(BCH_FS_was_rw, &c->flags))
- bch2_fs_read_write_early(c);
-}
-
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 03e59f86f360..a7eb1f511484 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -146,7 +146,7 @@ write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
-rw_attribute(gc_gens_pos);
+read_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
@@ -203,7 +203,6 @@ read_attribute(disk_groups);
read_attribute(has_data);
read_attribute(alloc_debug);
-read_attribute(accounting);
read_attribute(usage_base);
#define x(t, n, ...) read_attribute(t);
@@ -211,12 +210,11 @@ BCH_PERSISTENT_COUNTERS()
#undef x
rw_attribute(discard);
+read_attribute(state);
rw_attribute(label);
-rw_attribute(copy_gc_enabled);
read_attribute(copy_gc_wait);
-rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
@@ -237,11 +235,6 @@ write_attribute(perf_test);
BCH_TIME_STATS()
#undef x
-static struct attribute sysfs_state_rw = {
- .name = "state",
- .mode = 0444,
-};
-
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
@@ -302,7 +295,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
+ bch2_btree_id_to_text(out, c->gc_gens_btree);
+ prt_printf(out, ": ");
bch2_bpos_to_text(out, c->gc_gens_pos);
prt_printf(out, "\n");
}
@@ -339,9 +333,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_gc_gens_pos)
bch2_gc_gens_pos_to_text(out, c);
- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
-
- sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
if (attr == &sysfs_copy_gc_wait)
@@ -405,9 +396,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_alloc_debug)
bch2_fs_alloc_debug_to_text(out, c);
- if (attr == &sysfs_accounting)
- bch2_fs_accounting_to_text(out, c);
-
if (attr == &sysfs_usage_base)
bch2_fs_usage_base_to_text(out, c);
@@ -418,23 +406,6 @@ STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- if (attr == &sysfs_copy_gc_enabled) {
- ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
- ?: (ssize_t) size;
-
- if (c->copygc_thread)
- wake_up_process(c->copygc_thread);
- return ret;
- }
-
- if (attr == &sysfs_rebalance_enabled) {
- ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
- ?: (ssize_t) size;
-
- rebalance_wakeup(c);
- return ret;
- }
-
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
/* Debugging: */
@@ -534,15 +505,22 @@ SHOW(bch2_fs_counters)
printbuf_tabstop_push(out, 32);
- #define x(t, ...) \
+ #define x(t, n, f, ...) \
if (attr == &sysfs_##t) { \
counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ if (f & TYPE_SECTORS) { \
+ counter <<= 9; \
+ counter_since_mount <<= 9; \
+ } \
+ \
prt_printf(out, "since mount:\t"); \
+ (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
prt_human_readable_u64(out, counter_since_mount); \
prt_newline(out); \
\
prt_printf(out, "since filesystem creation:\t"); \
+ (f & TYPE_COUNTER) ? prt_u64(out, counter) : \
prt_human_readable_u64(out, counter); \
prt_newline(out); \
}
@@ -610,10 +588,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_gc_gens_pos,
- &sysfs_copy_gc_enabled,
&sysfs_copy_gc_wait,
- &sysfs_rebalance_enabled,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -622,7 +598,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_disk_groups,
&sysfs_alloc_debug,
- &sysfs_accounting,
&sysfs_usage_base,
NULL
};
@@ -682,6 +657,13 @@ STORE(bch2_fs_opts_dir)
(id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
+ if (v && id == Opt_rebalance_enabled)
+ rebalance_wakeup(c);
+
+ if (v && id == Opt_copygc_enabled &&
+ c->copygc_thread)
+ wake_up_process(c->copygc_thread);
+
ret = size;
err:
bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
@@ -790,7 +772,7 @@ SHOW(bch2_dev)
prt_char(out, '\n');
}
- if (attr == &sysfs_state_rw) {
+ if (attr == &sysfs_state) {
prt_string_option(out, bch2_member_states, ca->mi.state);
prt_char(out, '\n');
}
@@ -870,7 +852,7 @@ struct attribute *bch2_dev_files[] = {
/* settings: */
&sysfs_discard,
- &sysfs_state_rw,
+ &sysfs_state,
&sysfs_label,
&sysfs_has_data,
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index fb5c1543e52f..6c6469814637 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(k.k->p.offset != i++);
@@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(bkey_start_offset(k.k) != i);
@@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(k.k->p.offset != i);
@@ -259,7 +259,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
BTREE_ITER_slots, k, ({
if (i >= nr * 2)
@@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(bkey_start_offset(k.k) != i + 8);
@@ -320,7 +320,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
BTREE_ITER_slots, k, ({
if (i == nr)
@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
+ k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
@@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
static int seq_lookup(struct bch_fs *c, u64 nr)
{
return bch2_trans_run(c,
- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k,
0));
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 5597b9d6297f..9d40b7d4ea29 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* disk_accounting.c */
+
+TRACE_EVENT(accounting_mem_insert,
+ TP_PROTO(struct bch_fs *c, const char *acc),
+ TP_ARGS(c, acc),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned, new_nr )
+ __string(acc, acc )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->new_nr = c->accounting.k.nr;
+ __assign_str(acc);
+ ),
+
+ TP_printk("%d,%d entries %u added %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->new_nr,
+ __get_str(acc))
+);
+
/* fs.c: */
TRACE_EVENT(bch2_sync_fs,
TP_PROTO(struct super_block *sb, int wait),
@@ -848,8 +872,8 @@ TRACE_EVENT(move_data,
TRACE_EVENT(evacuate_bucket,
TP_PROTO(struct bch_fs *c, struct bpos *bucket,
unsigned sectors, unsigned bucket_size,
- u64 fragmentation, int ret),
- TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
+ int ret),
+ TP_ARGS(c, bucket, sectors, bucket_size, ret),
TP_STRUCT__entry(
__field(dev_t, dev )
@@ -857,7 +881,6 @@ TRACE_EVENT(evacuate_bucket,
__field(u64, bucket )
__field(u32, sectors )
__field(u32, bucket_size )
- __field(u64, fragmentation )
__field(int, ret )
),
@@ -867,15 +890,14 @@ TRACE_EVENT(evacuate_bucket,
__entry->bucket = bucket->offset;
__entry->sectors = sectors;
__entry->bucket_size = bucket_size;
- __entry->fragmentation = fragmentation;
__entry->ret = ret;
),
- TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
+ TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->member, __entry->bucket,
__entry->sectors, __entry->bucket_size,
- __entry->fragmentation, __entry->ret)
+ __entry->ret)
);
TRACE_EVENT(copygc,
@@ -1316,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1352,10 +1380,21 @@ TRACE_EVENT(path_downgrade,
__entry->pos_snapshot)
);
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+TRACE_EVENT(key_cache_fill,
+ TP_PROTO(struct btree_trans *trans, const char *key),
+ TP_ARGS(trans, key),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __string(key, key )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(key);
+ ),
+
+ TP_printk("%s %s", __entry->trans_fn, __get_str(key))
);
TRACE_EVENT(write_buffer_flush,
@@ -1414,6 +1453,24 @@ TRACE_EVENT(write_buffer_flush_slowpath,
TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
);
+TRACE_EVENT(write_buffer_maybe_flush,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
+ TP_ARGS(trans, caller_ip, key),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(key, key )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(key);
+ ),
+
+ TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
+);
+
DEFINE_EVENT(fs_str, rebalance_extent,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index fb02c1c36004..1a1720116071 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -55,6 +55,16 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
+static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
+{
+ void *p = unlikely(n >= INT_MAX)
+ ? vmalloc(n)
+ : kvmalloc(n, flags & ~__GFP_ZERO);
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, n);
+ return p;
+}
+
#define init_heap(heap, _size, gfp) \
({ \
(heap)->nr = 0; \
@@ -317,6 +327,19 @@ do { \
_ptr ? container_of(_ptr, type, member) : NULL; \
})
+static inline struct list_head *list_pop(struct list_head *head)
+{
+ if (list_empty(head))
+ return NULL;
+
+ struct list_head *ret = head->next;
+ list_del_init(ret);
+ return ret;
+}
+
+#define list_pop_entry(head, type, member) \
+ container_of_or_null(list_pop(head), type, member)
+
/* Does linear interpolation between powers of two */
static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
@@ -696,4 +719,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr)
return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
}
+static inline void memcpy_swab(void *_dst, void *_src, size_t len)
+{
+ u8 *dst = _dst + len;
+ u8 *src = _src;
+
+ while (len--)
+ *--dst = *src++;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
index 6a78553d9b0c..6620ecae26af 100644
--- a/fs/bcachefs/varint.c
+++ b/fs/bcachefs/varint.c
@@ -9,6 +9,7 @@
#include <valgrind/memcheck.h>
#endif
+#include "errcode.h"
#include "varint.h"
/**
@@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
u64 v;
if (unlikely(in + bytes > end))
- return -1;
+ return -BCH_ERR_varint_decode_error;
if (likely(bytes < 9)) {
__le64 v_le = 0;
@@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
unsigned bytes = ffz(*in) + 1;
if (unlikely(in + bytes > end))
- return -1;
+ return -BCH_ERR_varint_decode_error;
if (likely(bytes < 9)) {
v >>= bytes;
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 952aca400faf..aed7c6984173 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
};
int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags)
+ struct bkey_validate_context from)
{
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
@@ -309,7 +309,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
u64 offset = 0, inum = inode->ei_inode.bi_inum;
int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
POS(inum, offset),
POS(inum, U64_MAX),
inode->ei_inum.subvol, 0, k, ({
@@ -565,13 +565,6 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
err:
mutex_unlock(&inode->ei_update_lock);
-
- if (value &&
- (opt_id == Opt_background_target ||
- opt_id == Opt_background_compression ||
- (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
- bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
-
err_class_exit:
return bch2_err_class(ret);
}
@@ -609,7 +602,7 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
#endif /* NO_BCACHEFS_FS */
-const struct xattr_handler *bch2_xattr_handlers[] = {
+const struct xattr_handler * const bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index c188a5ad64ce..132fbbd15a66 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -6,7 +6,8 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
@@ -44,6 +45,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum,
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-extern const struct xattr_handler *bch2_xattr_handlers[];
+extern const struct xattr_handler * const bch2_xattr_handlers[];
#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 24727ec34e5a..6521e9a9d6ef 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -13,7 +13,7 @@
#include <linux/namei.h>
#include "internal.h"
-static const struct constant_table bool_names[] = {
+const struct constant_table bool_names[] = {
{ "0", false },
{ "1", true },
{ "false", false },
@@ -22,6 +22,7 @@ static const struct constant_table bool_names[] = {
{ "yes", true },
{ },
};
+EXPORT_SYMBOL(bool_names);
static const struct constant_table *
__lookup_constant(const struct constant_table *tbl, const char *name)
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 6cf713a7e6c6..0974cd33bcba 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -83,6 +83,8 @@ extern int fs_lookup_param(struct fs_context *fc,
extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found);
+extern const struct constant_table bool_names[];
+
#ifdef CONFIG_VALIDATE_FS_PARSER
extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
int low, int high, int special);
diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 43a7b9dcf15e..fe17b4828171 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -15,8 +15,8 @@
*/
#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \
struct _name { \
- int nr; \
- int size; \
+ size_t nr; \
+ size_t size; \
_type *data; \
_type preallocated[_nr]; \
}
--
2.45.2