Kent Overstreet 1027c53029
Update bcachefs sources to fe41b3880c63 bcachefs: Avoid waiting on compaction when allocating btree node bufs
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2025-12-05 12:37:01 -05:00


// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc/buckets.h"
#include "btree/bkey_buf.h"
#include "btree/bkey_methods.h"
#include "btree/cache.h"
#include "btree/iter.h"
#include "btree/locking.h"
#include "btree/read.h"
#include "btree/sort.h"
#include "btree/update.h"
#include "data/checksum.h"
#include "data/extents.h"
#include "debug/async_objs.h"
#include "init/error.h"
#include "init/fs.h"
#include "init/recovery.h"
#include "journal/seq_blacklist.h"
#include "sb/counters.h"
#include "sb/io.h"
#include "util/enumerated_ref.h"
#include <linux/moduleparam.h>
#include <linux/sched/mm.h>
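/*
 * Debug knobs (CONFIG_BCACHEFS_DEBUG only) for injecting corruption into btree
 * node reads: btree_read_corrupt_ratio controls how often a read buffer is
 * corrupted via bch2_maybe_corrupt_bio(), and btree_read_corrupt_device
 * restricts corruption to a single device index (any device when negative).
 * See btree_node_read_work().
 */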
static __maybe_unused unsigned bch2_btree_read_corrupt_ratio;
static __maybe_unused int bch2_btree_read_corrupt_device;
#ifdef CONFIG_BCACHEFS_DEBUG
module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
module_param_named(btree_read_corrupt_device, bch2_btree_read_corrupt_device, int, 0644);
MODULE_PARM_DESC(btree_read_corrupt_device, "");
#endif
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
prt_str(out, "min: ");
bch2_bpos_to_text(out, bn->min_key);
prt_newline(out);
prt_str(out, "max: ");
bch2_bpos_to_text(out, bn->max_key);
}
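/*
 * Read/write in flight tracking: the BTREE_NODE_read_in_flight and
 * BTREE_NODE_write_in_flight flag bits act as bit locks. When clearing them,
 * smp_mb__after_atomic() is required before wake_up_bit() so that waiters in
 * wait_on_bit_io() observe the cleared bit.
 */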
void bch2_btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
clear_btree_node_write_in_flight_inner(b);
clear_btree_node_write_in_flight(b);
smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
void bch2_btree_node_io_lock(struct btree *b)
{
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
void __bch2_btree_node_wait_on_read(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void __bch2_btree_node_wait_on_write(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_read(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_write(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
__printf(7, 0)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i, struct bkey_packed *k,
const char *fmt, va_list args)
{
if (ca)
prt_printf(out, "%s ", ca->name);
prt_printf(out, "node offset %u/%u",
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
if (k)
prt_printf(out, " bset byte offset %lu",
(unsigned long)(void *)k -
((unsigned long)(void *)i & ~511UL));
prt_str(out, ": ");
prt_vprintf(out, fmt, args);
prt_newline(out);
}
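/*
 * Central error reporting for btree node reads and writes:
 *
 * - During the scan_for_btree_nodes recovery pass we're only probing, so
 *   errors are returned without being logged or counted.
 * - On read, the error is counted in the superblock error counters, appended
 *   to @err_msg and recorded per device in @failed; whether we report the
 *   error as fixed or as a validate error depends on whether another replica
 *   is still available to retry from.
 * - On write, a corrupt node is fatal: log it and go emergency read-only.
 */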
__printf(11, 12)
static int __btree_err(enum bch_fsck_flags flags,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
struct bkey_packed *k,
int rw,
enum bch_sb_error_id err_type,
struct bch_io_failures *failed,
struct printbuf *err_msg,
const char *fmt, ...)
{
if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
return flags & FSCK_CAN_FIX
? bch_err_throw(c, fsck_fix)
: bch_err_throw(c, btree_node_validate_err);
bch2_sb_error_count(c, err_type);
if (rw == READ) {
va_list args;
va_start(args, fmt);
btree_err_msg(err_msg, c, ca, b, i, k, fmt, args);
va_end(args);
bch2_dev_io_failures_mut(failed, ca->dev_idx)->errcode =
bch_err_throw(c, btree_node_validate_err);
struct extent_ptr_decoded pick;
bool have_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
failed, &pick, -1) == 1;
return !have_retry &&
(flags & FSCK_CAN_FIX) &&
bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type) == -BCH_ERR_fsck_fix
? bch_err_throw(c, fsck_fix)
: bch_err_throw(c, btree_node_validate_err);
} else {
CLASS(bch_log_msg, msg)(c);
prt_str(&msg.m, "corrupt btree node before write at btree ");
bch2_btree_pos_to_text(&msg.m, c, b);
prt_newline(&msg.m);
va_list args;
va_start(args, fmt);
btree_err_msg(&msg.m, c, NULL, b, i, k, fmt, args);
va_end(args);
bch2_fs_emergency_read_only2(c, &msg.m);
return bch_err_throw(c, fsck_errors_not_fixed);
}
}
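/*
 * Convenience wrappers around __btree_err(): these expect the enclosing
 * function to provide locals named ret, write, failed and err_msg, plus an
 * fsck_err: label. If the error couldn't be fixed, ret is set and we jump to
 * fsck_err; otherwise btree_err() evaluates to true, so it can be used as a
 * condition (see btree_err_on()).
 */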
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
int _ret = __btree_err(type, c, ca, b, i, k, write, \
BCH_FSCK_ERR_##_err_type, \
failed, err_msg, \
msg, ##__VA_ARGS__); \
\
if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
ret = _ret; \
goto fsck_err; \
} \
\
true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
/*
* When btree topology repair changes the start or end of a node, that might
* mean we have to drop keys that are no longer inside the node:
*/
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k;
for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
break;
if (k != i->start) {
unsigned shift = (u64 *) k - (u64 *) i->start;
memmove_u64s_down(i->start, k,
(u64 *) vstruct_end(i) - (u64 *) k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
set_btree_bset_end(b, t);
}
for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
break;
if (k != vstruct_last(i)) {
i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
set_btree_bset_end(b, t);
}
}
/*
* Always rebuild search trees: eytzinger search tree nodes directly
* depend on the values of min/max key:
*/
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
b->nr = bch2_btree_node_count_keys(b);
struct bkey_s_c k;
struct bkey unpacked;
struct btree_node_iter iter;
for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
BUG_ON(bpos_gt(k.k->p, b->data->max_key));
}
}
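/*
 * Validate a single bset header against the superblock and the btree node key
 * that pointed us here: version compatibility, sector offset, and - for the
 * first bset only - sequence number, btree id, level, min/max keys and bkey
 * format. Also applies endianness/version compatibility transformations via
 * compat_btree_node() and compat_bformat().
 */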
int bch2_validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
unsigned offset, int write,
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
CLASS(printbuf, buf1)();
CLASS(printbuf, buf2)();
int ret = 0;
btree_err_on(!bch2_version_compatible(version),
0,
c, ca, b, i, NULL,
btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
BCH_VERSION_MINOR(version));
if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
btree_err_on(version < c->sb.version_min,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
if (bch2_version_compatible(version)) {
guard(mutex)(&c->sb_lock);
c->disk_sb.sb->version_min = cpu_to_le16(version);
bch2_write_super(c);
} else {
/* We have no idea what's going on: */
i->version = cpu_to_le16(c->sb.version);
}
}
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
FSCK_CAN_FIX,
c, ca, b, i, NULL,
btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
guard(mutex)(&c->sb_lock);
c->disk_sb.sb->version = cpu_to_le16(version);
bch2_write_super(c);
}
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
0,
c, ca, b, i, NULL,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
btree_err_on(offset && !i->u64s,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_empty,
"empty bset");
btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_wrong_sector_offset,
"bset at wrong sector offset");
if (!offset) {
struct btree_node *bn =
container_of(i, struct btree_node, keys);
/* These indicate that we read the wrong btree node: */
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
0,
c, ca, b, NULL, NULL,
bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
0,
c, ca, b, i, NULL,
btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
0,
c, ca, b, i, NULL,
btree_node_bad_level,
"incorrect level");
if (!write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
if (BTREE_PTR_RANGE_UPDATED(bp)) {
b->data->min_key = bp->min_key;
b->data->max_key = b->key.k.p;
}
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
0,
c, ca, b, NULL, NULL,
btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
(printbuf_reset(&buf2),
bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
}
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
0,
c, ca, b, i, NULL,
btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
if (write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
0,
c, ca, b, i, NULL,
btree_node_bad_format,
"invalid bkey format: %s\n%s", buf1.buf,
(printbuf_reset(&buf2),
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
printbuf_reset(&buf1);
compat_bformat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
fsck_err:
return ret;
}
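/*
 * Key validation is split in two: bset_key_validate() checks the key itself
 * (and, unless the node's range was updated by topology repair, that it lies
 * within the node), while value validation via btree_node_bkey_val_validate()
 * only runs here on the write path - on read, values are validated later in
 * bch2_btree_node_read_done(), after all bsets have been sorted together.
 */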
static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
struct bkey_s_c k,
enum bch_validate_flags flags)
{
return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
.level = b->c.level,
.btree = b->c.btree_id,
.flags = flags
});
}
static int bset_key_validate(struct bch_fs *c, struct btree *b,
struct bkey_s_c k,
bool updated_range,
enum bch_validate_flags flags)
{
struct bkey_validate_context from = (struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
.level = b->c.level,
.btree = b->c.btree_id,
.flags = flags,
};
return __bch2_bkey_validate(c, k, from) ?:
(!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
(flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
}
static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
return false;
if (k->format > KEY_FORMAT_CURRENT)
return false;
if (!bkeyp_u64s_valid(&b->format, k))
return false;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
return !__bch2_bkey_validate(c, u.s_c,
(struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
.level = b->c.level,
.btree = b->c.btree_id,
.flags = BCH_VALIDATE_silent
});
}
static inline int btree_node_read_bkey_cmp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed(b, l, r)
?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
}
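/*
 * Walk and validate every packed key in a bset. Keys that are structurally bad
 * or fail validation are dropped: we scan forward one u64 at a time
 * (bkey_packed_valid()) looking for the next plausible key, and if none is
 * found the remainder of the bset is truncated. Out-of-order keys are also
 * dropped, and any such repair marks the node as needing a rewrite.
 */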
int bch2_validate_bset_keys(struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i, int write,
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
CLASS(printbuf, buf)();
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
int ret = 0;
for (k = i->start;
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
unsigned next_good_key;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
FSCK_CAN_FIX,
c, ca, b, i, k,
btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
FSCK_CAN_FIX,
c, ca, b, i, k,
btree_node_bkey_bad_format,
"invalid bkey format %u", k->format))
goto drop_this_key;
if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
FSCK_CAN_FIX,
c, ca, b, i, k,
btree_node_bkey_bad_u64s,
"bad k->u64s %u (min %u max %zu)", k->u64s,
bkeyp_key_u64s(&b->format, k),
U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
goto drop_this_key;
if (!write)
bch2_bkey_compat(c, b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&b->format, k);
u = __bkey_disassemble(b, k, &tmp);
ret = bset_key_validate(c, b, u.s_c, updated_range, write);
if (ret == -BCH_ERR_fsck_delete_bkey)
goto drop_this_key;
if (ret)
goto fsck_err;
if (write)
bch2_bkey_compat(c, b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&b->format, k);
if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
struct bkey up = bkey_unpack_key(b, prev);
printbuf_reset(&buf);
prt_printf(&buf, "keys out of order: ");
bch2_bkey_to_text(&buf, &up);
prt_printf(&buf, " > ");
bch2_bkey_to_text(&buf, u.k);
if (btree_err(FSCK_CAN_FIX,
c, ca, b, i, k,
btree_node_bkey_out_of_order,
"%s", buf.buf))
goto drop_this_key;
}
prev = k;
k = bkey_p_next(k);
continue;
drop_this_key:
ret = 0;
next_good_key = k->u64s;
if (!next_good_key ||
(BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
version >= bcachefs_metadata_version_snapshot)) {
/*
* only do scanning if bch2_bkey_compat() has nothing to
* do
*/
if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
}
/*
* didn't find a good key, have to truncate the rest of
* the bset
*/
next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
}
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_error(b);
}
fsck_err:
return ret;
}
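/*
 * True if any replica of this node lives on a device that is missing or not
 * currently read-write; such nodes get flagged for rewrite so their data can
 * be rewritten onto healthy devices (see btree_node_read_work()).
 */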
static bool btree_node_degraded(struct bch_fs *c, struct btree *b)
{
guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
if (ptr->dev == BCH_SB_MEMBER_INVALID)
continue;
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw)
return true;
}
return false;
}
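/*
 * Main read completion path: iterate over each bset in the node, verifying
 * checksums, decrypting, and validating headers and keys; bsets whose journal
 * sequence is blacklisted are skipped. The remaining bsets are then merged
 * into a single sorted bset with bch2_key_sort_fix_overlapping(), key values
 * are validated, and the node's auxiliary search trees are rebuilt.
 */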
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct btree *b,
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
CLASS(printbuf, buf)();
int ret = 0, write = READ;
u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
/* We might get called multiple times on read retry: */
b->written = 0;
iter = mempool_alloc(&c->btree.fill_iter, GFP_NOFS);
sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
if (bch2_meta_read_fault("btree"))
btree_err(0,
c, ca, b, NULL, NULL,
btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
0,
c, ca, b, NULL, NULL,
btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
bool first = !b->written;
if (first) {
bne = NULL;
i = &b->data->keys;
} else {
bne = write_block(b);
i = &bne->keys;
if (i->seq != b->data->keys.seq)
break;
}
struct nonce nonce = btree_nonce(i, b->written << 9);
bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
btree_err_on(!good_csum_type,
bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
? 0
: FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
if (first) {
sectors = vstruct_sectors(b->data, c->block_bits);
if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node (offset %u len %u but written %zu)",
b->written, sectors, ptr_written ?: btree_sectors(c)))
i->u64s = 0;
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
btree_err_on(csum_bad,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_bad_csum,
"%s",
(printbuf_reset(&buf),
bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
"decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
}
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
bch2_bpos_to_text(&buf, b->data->min_key);
prt_str(&buf, "-");
bch2_bpos_to_text(&buf, b->data->max_key);
btree_err_on(b->data->keys.seq != bp->seq,
0,
c, ca, b, NULL, NULL,
btree_node_bad_seq,
"got wrong btree node: got\n%s",
(printbuf_reset(&buf),
printbuf_indent_add(&buf, 2),
bch2_btree_node_header_to_text(&buf, b->data),
buf.buf));
} else {
btree_err_on(!b->data->keys.seq,
0,
c, ca, b, NULL, NULL,
btree_node_bad_seq,
"bad btree header: seq 0\n%s",
(printbuf_reset(&buf),
bch2_btree_node_header_to_text(&buf, b->data),
buf.buf));
}
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
0,
c, ca, b, NULL, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
} else {
sectors = vstruct_sectors(bne, c->block_bits);
if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node (offset %u len %u but written %zu)",
b->written, sectors, ptr_written ?: btree_sectors(c)))
i->u64s = 0;
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
bool csum_bad = bch2_crc_cmp(bne->csum, csum);
if (ca && csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
btree_err_on(csum_bad,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_bad_csum,
"%s",
(printbuf_reset(&buf),
bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
"decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
}
}
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
ret = bch2_validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
if (ret)
goto fsck_err;
if (!b->written)
btree_node_set_format(b, b->data->format);
ret = bch2_validate_bset_keys(c, ca, b, i, READ, failed, err_msg);
if (ret)
goto fsck_err;
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
blacklisted = bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(i->journal_seq),
true);
btree_err_on(blacklisted && first,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
bset_blacklisted_journal_seq,
"first btree node btree/bset.has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
FSCK_CAN_FIX,
c, ca, b, i, NULL,
first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
b->written = min(b->written + sectors, btree_sectors(c));
if (blacklisted && !first)
continue;
sort_iter_add(iter,
vstruct_idx(i, 0),
vstruct_last(i));
max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
}
if (ptr_written) {
btree_err_on(b->written < ptr_written,
FSCK_CAN_FIX,
c, ca, b, NULL, NULL,
btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
} else {
for (bne = write_block(b);
bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq),
true),
FSCK_CAN_FIX,
c, ca, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset");
}
sorted = bch2_btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
btree_buf_bytes(b) -
sizeof(struct btree_node) -
b->nr.live_u64s * sizeof(u64));
b->data->keys.u64s = sorted->keys.u64s;
*sorted = *b->data;
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
bch2_btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
(static_branch_unlikely(&bch2_inject_invalid_keys) &&
!bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_error(b);
ret = 0;
continue;
}
if (ret)
goto fsck_err;
if (u.k->type == KEY_TYPE_btree_ptr_v2) {
struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
bp.v->mem_ptr = 0;
}
k = bkey_p_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
bch2_set_bset_needs_whiteout(btree_bset_first(b), true);
btree_node_reset_sib_u64s(b);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
if (!ptr_written) {
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_ptr_written_zero(b);
}
fsck_err:
mempool_free(iter, &c->btree.fill_iter);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return ret;
}
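/*
 * Work item run on read completion: if validation failed, retry from the next
 * replica until we either succeed or run out of devices. On permanent failure
 * the node is marked with a read error and the lost data is reported; on
 * success after earlier failures (or if repair dropped keys) an asynchronous
 * rewrite of the node is queued. Logging is ratelimited via buf.suppress.
 */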
static void btree_node_read_work(struct work_struct *work)
{
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
struct btree *b = rb->b;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
int ret = 0;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "btree node read error at btree ");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
while (1) {
if (rb->have_ioref)
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = false;
if (!bio->bi_status) {
memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
bio->bi_iter.bi_size = btree_buf_bytes(b);
if (bch2_btree_read_corrupt_device == rb->pick.ptr.dev ||
bch2_btree_read_corrupt_device < 0)
bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
} else {
ret = __bch2_err_throw(c, -blk_status_to_bch_err(bio->bi_status));
bch2_mark_io_failure(&failed, &rb->pick, ret);
}
if (!ret ||
bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1) <= 0)
break;
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = ca != NULL;
rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
submit_bio_wait(bio);
} else {
bio->bi_status = BLK_STS_REMOVED;
}
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rb->start_time, !bio->bi_status);
}
bch2_io_failures_to_text(&buf, c, &failed);
/*
* only print retry success if we read from a replica with no errors
*/
if (ret) {
/*
* Initialize buf.suppress before btree_lost_data(); that will
* clear it if it did any work (scheduling recovery passes,
* marking the superblock)
*/
buf.suppress = !__bch2_ratelimit(c, &c->btree.read_errors_hard);
set_btree_node_read_error(b);
bch2_btree_lost_data(c, &buf, b->c.btree_id);
prt_printf(&buf, "error %s\n", bch2_err_str(ret));
} else if (failed.nr) {
/* Separate ratelimit states for soft vs. hard errors */
buf.suppress = !__bch2_ratelimit(c, &c->btree.read_errors_soft);
if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
prt_printf(&buf, "retry success");
else
prt_printf(&buf, "repair success");
if ((failed.nr || btree_node_need_rewrite(b)) &&
c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
prt_printf(&buf, " (rewriting node)");
bch2_btree_node_rewrite_async(c, b);
}
prt_newline(&buf);
} else {
buf.suppress = true;
}
if (!buf.suppress)
bch2_print_str(c, ret ? KERN_ERR : KERN_NOTICE, buf.buf);
/*
* Do this late; unlike other btree_node_need_rewrite() cases if a node
* is merely degraded we should rewrite it before we update it, but we
* don't need to kick off an async rewrite now:
*/
if (btree_node_degraded(c, b)) {
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_degraded(b);
}
async_object_list_del(c, btree_read_bio, rb->list_idx);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
static void btree_node_read_endio(struct bio *bio)
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
struct bch_dev *ca = rb->have_ioref
? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rb->start_time, !bio->bi_status);
queue_work(c->btree.read_complete_wq, &rb->work);
}
void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
{
bch2_bio_to_text(out, &rbio->bio);
}
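/*
 * Start a btree node read: pick a replica to read from, then submit the bio
 * either synchronously or asynchronously; completion is handled by
 * btree_node_read_work(). If no device is available, the node is immediately
 * marked with a read error, and if topology has already been checked we go
 * emergency read-only.
 */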
void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bool sync)
{
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
trace_btree_node(c, b, btree_node_read);
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick, -1);
if (ret <= 0) {
CLASS(bch_log_msg_ratelimited, msg)(c);
prt_str(&msg.m, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&msg.m, c, b);
prt_newline(&msg.m);
bch2_btree_lost_data(c, &msg.m, b->c.btree_id);
if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology))
bch2_fs_emergency_read_only2(c, &msg.m);
set_btree_node_read_error(b);
clear_btree_node_read_in_flight(b);
smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
return;
}
ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree.bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->b = b;
rb->start_time = local_clock();
rb->have_ioref = ca != NULL;
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_buf_bytes(b));
async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
bio_set_dev(bio, ca->disk_sb.bdev);
if (sync) {
submit_bio_wait(bio);
bch2_latency_acct(ca, rb->start_time, READ);
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
}
} else {
bio->bi_status = BLK_STS_REMOVED;
if (sync)
btree_node_read_work(&rb->work);
else
queue_work(c->btree.read_complete_wq, &rb->work);
}
}
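/*
 * Read a btree root during recovery: allocate an in-memory node (cannibalizing
 * the btree node cache if necessary), read it synchronously, and install it as
 * the root of this btree.
 */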
static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
const struct bkey_i *k, unsigned level)
{
struct bch_fs *c = trans->c;
struct btree *b;
int ret;
CLASS(closure_stack, cl)();
do {
ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, level != 0);
bch2_btree_cache_cannibalize_unlock(trans);
BUG_ON(IS_ERR(b));
bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(&c->btree.cache, b, level, id));
set_btree_node_read_in_flight(b);
/* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
bch2_trans_unlock(trans);
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
scoped_guard(mutex, &c->btree.cache.lock)
bch2_btree_node_hash_remove(&c->btree.cache, b);
ret = bch_err_throw(c, btree_node_read_error);
goto err;
}
bch2_btree_set_root_for_read(c, b);
err:
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
return ret;
}
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
const struct bkey_i *k, unsigned level)
{
CLASS(btree_trans, trans)(c);
return __bch2_btree_root_read(trans, id, k, level);
}
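/*
 * Scrub support: read one on-disk copy of a btree node from a specific device
 * into a bounce buffer and verify its checksums, without touching the cached
 * in-memory copy. If the copy is bad, the node is rewritten (see
 * btree_node_scrub_work()).
 */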
struct btree_node_scrub {
struct bch_fs *c;
struct bch_dev *ca;
void *buf;
bool used_mempool;
unsigned written;
enum btree_id btree;
unsigned level;
struct bkey_buf key;
__le64 seq;
struct work_struct work;
struct bio bio;
struct bio_vec inline_vecs[];
};
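/*
 * Checksum-only verification of a raw btree node buffer: checks the magic
 * number and the checksum of each bset, but does not validate keys.
 */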
static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
struct printbuf *err)
{
unsigned written = 0;
if (le64_to_cpu(data->magic) != bset_magic(c)) {
prt_printf(err, "bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(data->magic));
return false;
}
while (written < (ptr_written ?: btree_sectors(c))) {
struct btree_node_entry *bne;
struct bset *i;
bool first = !written;
if (first) {
bne = NULL;
i = &data->keys;
} else {
bne = (void *) data + (written << 9);
i = &bne->keys;
if (!ptr_written && i->seq != data->keys.seq)
break;
}
struct nonce nonce = btree_nonce(i, written << 9);
bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
if (first) {
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
if (bch2_crc_cmp(data->csum, csum)) {
bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
return false;
}
}
written += vstruct_sectors(data, c->block_bits);
} else {
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
if (bch2_crc_cmp(bne->csum, csum)) {
bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
return false;
}
}
written += vstruct_sectors(bne, c->block_bits);
}
}
return true;
}
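/*
 * If the scrubbed copy failed its checks, rewrite the node via
 * bch2_btree_node_rewrite_key() so the bad replica gets replaced; errors other
 * than ENOENT and EROFS are logged (ratelimited).
 */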
static void btree_node_scrub_work(struct work_struct *work)
{
struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
struct bch_fs *c = scrub->c;
CLASS(printbuf, err)();
__bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
bkey_i_to_s_c(scrub->key.k));
prt_newline(&err);
if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
int ret = bch2_trans_do(c,
bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
scrub->key.k, 0));
if (!bch2_err_matches(ret, ENOENT) &&
!bch2_err_matches(ret, EROFS))
bch_err_fn_ratelimited(c, ret);
}
bch2_bkey_buf_exit(&scrub->key);
bch2_btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
kfree(scrub);
enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
}
static void btree_node_scrub_endio(struct bio *bio)
{
struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
queue_work(scrub->c->btree.read_complete_wq, &scrub->work);
}
int bch2_btree_node_scrub(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c k, unsigned dev)
{
if (k.k->type != KEY_TYPE_btree_ptr_v2)
return 0;
struct bch_fs *c = trans->c;
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
return bch_err_throw(c, erofs_no_writes);
struct extent_ptr_decoded pick;
int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
if (ret <= 0)
goto err;
struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
BCH_DEV_READ_REF_btree_node_scrub);
if (!ca) {
ret = bch_err_throw(c, device_offline);
goto err;
}
bool used_mempool = false;
void *buf = bch2_btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
struct btree_node_scrub *scrub =
kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
if (!scrub) {
ret = -ENOMEM;
goto err_free;
}
scrub->c = c;
scrub->ca = ca;
scrub->buf = buf;
scrub->used_mempool = used_mempool;
scrub->written = btree_ptr_sectors_written(k);
scrub->btree = btree;
scrub->level = level;
bch2_bkey_buf_init(&scrub->key);
bch2_bkey_buf_reassemble(&scrub->key, k);
scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
INIT_WORK(&scrub->work, btree_node_scrub_work);
bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->inline_vecs, vecs, REQ_OP_READ);
bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
scrub->bio.bi_end_io = btree_node_scrub_endio;
submit_bio(&scrub->bio);
return 0;
err_free:
bch2_btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
err:
enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
return ret;
}