mirror of https://github.com/koverstreet/bcachefs-tools.git
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"

#include "alloc/buckets.h"

#include "btree/bkey_buf.h"
#include "btree/bkey_methods.h"
#include "btree/cache.h"
#include "btree/iter.h"
#include "btree/locking.h"
#include "btree/read.h"
#include "btree/sort.h"
#include "btree/update.h"

#include "data/checksum.h"
#include "data/extents.h"

#include "debug/async_objs.h"

#include "init/error.h"
#include "init/fs.h"
#include "init/recovery.h"

#include "journal/seq_blacklist.h"

#include "sb/counters.h"

#include "sb/io.h"

#include "util/enumerated_ref.h"

#include <linux/moduleparam.h>
#include <linux/sched/mm.h>
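
/*
 * Debug knobs (CONFIG_BCACHEFS_DEBUG only): inject corruption into btree node
 * reads, either on every device (btree_read_corrupt_device < 0) or only on the
 * given device index, at the given ratio.
 */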
static __maybe_unused unsigned bch2_btree_read_corrupt_ratio;
static __maybe_unused int bch2_btree_read_corrupt_device;

#ifdef CONFIG_BCACHEFS_DEBUG
module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(btree_read_corrupt_ratio, "");

module_param_named(btree_read_corrupt_device, bch2_btree_read_corrupt_device, int, 0644);
MODULE_PARM_DESC(btree_read_corrupt_device, "");
#endif

static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
	bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
	prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
	prt_str(out, "min: ");
	bch2_bpos_to_text(out, bn->min_key);
	prt_newline(out);
	prt_str(out, "max: ");
	bch2_bpos_to_text(out, bn->max_key);
}

void bch2_btree_node_io_unlock(struct btree *b)
{
	EBUG_ON(!btree_node_write_in_flight(b));

	clear_btree_node_write_in_flight_inner(b);
	clear_btree_node_write_in_flight(b);
	smp_mb__after_atomic();
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

void bch2_btree_node_io_lock(struct btree *b)
{
	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
			    TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

__printf(7, 0)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca,
			  struct btree *b, struct bset *i, struct bkey_packed *k,
			  const char *fmt, va_list args)
{
	if (ca)
		prt_printf(out, "%s ", ca->name);

	prt_printf(out, "node offset %u/%u",
		   b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
	if (i)
		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
	if (k)
		prt_printf(out, " bset byte offset %lu",
			   (unsigned long)(void *)k -
			   ((unsigned long)(void *)i & ~511UL));
	prt_str(out, ": ");

	prt_vprintf(out, fmt, args);
	prt_newline(out);
}
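
/*
 * Common error path for btree node validation: bump the superblock error
 * counter, record a per-device failure on the read path, and decide whether
 * the error is fixable (fsck_fix) or fatal for this node. A corrupt node on
 * the write path forces emergency read-only.
 */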
__printf(11, 12)
static int __btree_err(enum bch_fsck_flags flags,
		       struct bch_fs *c,
		       struct bch_dev *ca,
		       struct btree *b,
		       struct bset *i,
		       struct bkey_packed *k,
		       int rw,
		       enum bch_sb_error_id err_type,
		       struct bch_io_failures *failed,
		       struct printbuf *err_msg,
		       const char *fmt, ...)
{
	if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
		return flags & FSCK_CAN_FIX
			? bch_err_throw(c, fsck_fix)
			: bch_err_throw(c, btree_node_validate_err);

	bch2_sb_error_count(c, err_type);

	if (rw == READ) {
		va_list args;
		va_start(args, fmt);
		btree_err_msg(err_msg, c, ca, b, i, k, fmt, args);
		va_end(args);

		bch2_dev_io_failures_mut(failed, ca->dev_idx)->errcode =
			bch_err_throw(c, btree_node_validate_err);

		struct extent_ptr_decoded pick;
		bool have_retry = bch2_bkey_pick_read_device(c,
					bkey_i_to_s_c(&b->key),
					failed, &pick, -1) == 1;

		return !have_retry &&
			(flags & FSCK_CAN_FIX) &&
			bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type) == -BCH_ERR_fsck_fix
			? bch_err_throw(c, fsck_fix)
			: bch_err_throw(c, btree_node_validate_err);
	} else {
		CLASS(bch_log_msg, msg)(c);

		prt_str(&msg.m, "corrupt btree node before write at btree ");
		bch2_btree_pos_to_text(&msg.m, c, b);
		prt_newline(&msg.m);

		va_list args;
		va_start(args, fmt);
		btree_err_msg(&msg.m, c, NULL, b, i, k, fmt, args);
		va_end(args);

		bch2_fs_emergency_read_only2(c, &msg.m);

		return bch_err_throw(c, fsck_errors_not_fixed);
	}
}
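
/*
 * Wrappers around __btree_err(): evaluate to true when the error was fixable
 * and the caller may repair and continue; otherwise they set ret and jump to
 * the caller's fsck_err label. They rely on 'write', 'failed', 'err_msg',
 * 'ret' and a fsck_err label being in scope.
 */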
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...)		\
({									\
	int _ret = __btree_err(type, c, ca, b, i, k, write,		\
			       BCH_FSCK_ERR_##_err_type,		\
			       failed, err_msg,				\
			       msg, ##__VA_ARGS__);			\
									\
	if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) {		\
		ret = _ret;						\
		goto fsck_err;						\
	}								\
									\
	true;								\
})

#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)

/*
 * When btree topology repair changes the start or end of a node, that might
 * mean we have to drop keys that are no longer inside the node:
 */
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k;

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
				break;

		if (k != i->start) {
			unsigned shift = (u64 *) k - (u64 *) i->start;

			memmove_u64s_down(i->start, k,
					  (u64 *) vstruct_end(i) - (u64 *) k);
			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
			set_btree_bset_end(b, t);
		}

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
				break;

		if (k != vstruct_last(i)) {
			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
			set_btree_bset_end(b, t);
		}
	}

	/*
	 * Always rebuild search trees: eytzinger search tree nodes directly
	 * depend on the values of min/max key:
	 */
	bch2_bset_set_no_aux_tree(b, b->set);
	bch2_btree_build_aux_trees(b);
	b->nr = bch2_btree_node_count_keys(b);

	struct bkey_s_c k;
	struct bkey unpacked;
	struct btree_node_iter iter;
	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
	}
}
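
/*
 * Validate a single bset header: version checks against the superblock,
 * sector offset, and - for the first bset - that the node header (seq, btree
 * id, level, min/max key, bkey format) matches the pointer we followed to get
 * here.
 */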
int bch2_validate_bset(struct bch_fs *c, struct bch_dev *ca,
		       struct btree *b, struct bset *i,
		       unsigned offset, int write,
		       struct bch_io_failures *failed,
		       struct printbuf *err_msg)
{
	unsigned version = le16_to_cpu(i->version);
	CLASS(printbuf, buf1)();
	CLASS(printbuf, buf2)();
	int ret = 0;

	btree_err_on(!bch2_version_compatible(version),
		     0,
		     c, ca, b, i, NULL,
		     btree_node_unsupported_version,
		     "unsupported bset version %u.%u",
		     BCH_VERSION_MAJOR(version),
		     BCH_VERSION_MINOR(version));

	if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
	    btree_err_on(version < c->sb.version_min,
			 FSCK_CAN_FIX,
			 c, ca, b, i, NULL,
			 btree_node_bset_older_than_sb_min,
			 "bset version %u older than superblock version_min %u",
			 version, c->sb.version_min)) {
		if (bch2_version_compatible(version)) {
			guard(mutex)(&c->sb_lock);
			c->disk_sb.sb->version_min = cpu_to_le16(version);
			bch2_write_super(c);
		} else {
			/* We have no idea what's going on: */
			i->version = cpu_to_le16(c->sb.version);
		}
	}

	if (btree_err_on(BCH_VERSION_MAJOR(version) >
			 BCH_VERSION_MAJOR(c->sb.version),
			 FSCK_CAN_FIX,
			 c, ca, b, i, NULL,
			 btree_node_bset_newer_than_sb,
			 "bset version %u newer than superblock version %u",
			 version, c->sb.version)) {
		guard(mutex)(&c->sb_lock);
		c->disk_sb.sb->version = cpu_to_le16(version);
		bch2_write_super(c);
	}

	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
		     0,
		     c, ca, b, i, NULL,
		     btree_node_unsupported_version,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

	btree_err_on(offset && !i->u64s,
		     FSCK_CAN_FIX,
		     c, ca, b, i, NULL,
		     bset_empty,
		     "empty bset");

	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
		     FSCK_CAN_FIX,
		     c, ca, b, i, NULL,
		     bset_wrong_sector_offset,
		     "bset at wrong sector offset");

	if (!offset) {
		struct btree_node *bn =
			container_of(i, struct btree_node, keys);
		/* These indicate that we read the wrong btree node: */

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			/* XXX endianness */
			btree_err_on(bp->seq != bn->keys.seq,
				     0,
				     c, ca, b, NULL, NULL,
				     bset_bad_seq,
				     "incorrect sequence number (wrong btree node)");
		}

		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
			     0,
			     c, ca, b, i, NULL,
			     btree_node_bad_btree,
			     "incorrect btree id");

		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
			     0,
			     c, ca, b, i, NULL,
			     btree_node_bad_level,
			     "incorrect level");

		if (!write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			if (BTREE_PTR_RANGE_UPDATED(bp)) {
				b->data->min_key = bp->min_key;
				b->data->max_key = b->key.k.p;
			}

			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
				     0,
				     c, ca, b, NULL, NULL,
				     btree_node_bad_min_key,
				     "incorrect min_key: got %s should be %s",
				     (printbuf_reset(&buf1),
				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
				     (printbuf_reset(&buf2),
				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
		}

		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
			     0,
			     c, ca, b, i, NULL,
			     btree_node_bad_max_key,
			     "incorrect max key %s",
			     (printbuf_reset(&buf1),
			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));

		if (write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
			     0,
			     c, ca, b, i, NULL,
			     btree_node_bad_format,
			     "invalid bkey format: %s\n%s", buf1.buf,
			     (printbuf_reset(&buf2),
			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
		printbuf_reset(&buf1);

		compat_bformat(b->c.level, b->c.btree_id, version,
			       BSET_BIG_ENDIAN(i), write,
			       &bn->format);
	}
fsck_err:
	return ret;
}

static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
					struct bkey_s_c k,
					enum bch_validate_flags flags)
{
	return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
		.from	= BKEY_VALIDATE_btree_node,
		.level	= b->c.level,
		.btree	= b->c.btree_id,
		.flags	= flags
	});
}

static int bset_key_validate(struct bch_fs *c, struct btree *b,
			     struct bkey_s_c k,
			     bool updated_range,
			     enum bch_validate_flags flags)
{
	struct bkey_validate_context from = (struct bkey_validate_context) {
		.from	= BKEY_VALIDATE_btree_node,
		.level	= b->c.level,
		.btree	= b->c.btree_id,
		.flags	= flags,
	};
	return __bch2_bkey_validate(c, k, from) ?:
		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
		(flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
}

static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
			      struct bset *i, struct bkey_packed *k)
{
	if (bkey_p_next(k) > vstruct_last(i))
		return false;

	if (k->format > KEY_FORMAT_CURRENT)
		return false;

	if (!bkeyp_u64s_valid(&b->format, k))
		return false;

	struct bkey tmp;
	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
	return !__bch2_bkey_validate(c, u.s_c,
				     (struct bkey_validate_context) {
					.from	= BKEY_VALIDATE_btree_node,
					.level	= b->c.level,
					.btree	= b->c.btree_id,
					.flags	= BCH_VALIDATE_silent
				     });
}

static inline int btree_node_read_bkey_cmp(const struct btree *b,
					   const struct bkey_packed *l,
					   const struct bkey_packed *r)
{
	return bch2_bkey_cmp_packed(b, l, r)
		?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
}
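
/*
 * Walk a bset and validate each key; invalid keys are dropped and the node is
 * flagged for rewrite. If a bad key's size field can't be trusted, scan
 * forward one u64 at a time for the next key that validates, truncating the
 * bset if none is found.
 */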
int bch2_validate_bset_keys(struct bch_fs *c,
			    struct bch_dev *ca,
			    struct btree *b,
			    struct bset *i, int write,
			    struct bch_io_failures *failed,
			    struct printbuf *err_msg)
{
	unsigned version = le16_to_cpu(i->version);
	struct bkey_packed *k, *prev = NULL;
	CLASS(printbuf, buf)();
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	int ret = 0;

	for (k = i->start;
	     k != vstruct_last(i);) {
		struct bkey_s u;
		struct bkey tmp;
		unsigned next_good_key;

		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
				 FSCK_CAN_FIX,
				 c, ca, b, i, k,
				 btree_node_bkey_past_bset_end,
				 "key extends past end of bset")) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
				 FSCK_CAN_FIX,
				 c, ca, b, i, k,
				 btree_node_bkey_bad_format,
				 "invalid bkey format %u", k->format))
			goto drop_this_key;

		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
				 FSCK_CAN_FIX,
				 c, ca, b, i, k,
				 btree_node_bkey_bad_u64s,
				 "bad k->u64s %u (min %u max %zu)", k->u64s,
				 bkeyp_key_u64s(&b->format, k),
				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
			goto drop_this_key;

		if (!write)
			bch2_bkey_compat(c, b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		u = __bkey_disassemble(b, k, &tmp);

		ret = bset_key_validate(c, b, u.s_c, updated_range, write);
		if (ret == -BCH_ERR_fsck_delete_bkey)
			goto drop_this_key;
		if (ret)
			goto fsck_err;

		if (write)
			bch2_bkey_compat(c, b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
			struct bkey up = bkey_unpack_key(b, prev);

			printbuf_reset(&buf);
			prt_printf(&buf, "keys out of order: ");
			bch2_bkey_to_text(&buf, &up);
			prt_printf(&buf, " > ");
			bch2_bkey_to_text(&buf, u.k);

			if (btree_err(FSCK_CAN_FIX,
				      c, ca, b, i, k,
				      btree_node_bkey_out_of_order,
				      "%s", buf.buf))
				goto drop_this_key;
		}

		prev = k;
		k = bkey_p_next(k);
		continue;
drop_this_key:
		ret = 0;
		next_good_key = k->u64s;

		if (!next_good_key ||
		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
		     version >= bcachefs_metadata_version_snapshot)) {
			/*
			 * only do scanning if bch2_bkey_compat() has nothing to
			 * do
			 */

			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
				for (next_good_key = 1;
				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
				     next_good_key++)
					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
						goto got_good_key;
			}

			/*
			 * didn't find a good key, have to truncate the rest of
			 * the bset
			 */
			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
		}
got_good_key:
		le16_add_cpu(&i->u64s, -next_good_key);
		memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
		set_btree_node_need_rewrite(b);
		set_btree_node_need_rewrite_error(b);
	}
fsck_err:
	return ret;
}
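
/* Returns true if any replica of this node is on a missing or non-rw device. */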
static bool btree_node_degraded(struct bch_fs *c, struct btree *b)
{
	guard(rcu)();
	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
		if (ptr->dev == BCH_SB_MEMBER_INVALID)
			continue;

		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw)
			return true;
	}
	return false;
}
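
/*
 * Parse and validate every bset in a freshly read btree node: verify
 * checksums, decrypt if necessary, skip bsets whose journal sequence number
 * is blacklisted, then sort the surviving keys into a single bset and rebuild
 * the auxiliary search trees.
 */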
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			      struct btree *b,
			      struct bch_io_failures *failed,
			      struct printbuf *err_msg)
{
	struct btree_node_entry *bne;
	struct sort_iter *iter;
	struct btree_node *sorted;
	struct bkey_packed *k;
	struct bset *i;
	bool used_mempool, blacklisted;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
	u64 max_journal_seq = 0;
	CLASS(printbuf, buf)();
	int ret = 0, write = READ;
	u64 start_time = local_clock();

	b->version_ondisk = U16_MAX;
	/* We might get called multiple times on read retry: */
	b->written = 0;

	iter = mempool_alloc(&c->btree.fill_iter, GFP_NOFS);
	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);

	if (bch2_meta_read_fault("btree"))
		btree_err(0,
			  c, ca, b, NULL, NULL,
			  btree_node_fault_injected,
			  "dynamic fault");

	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
		     0,
		     c, ca, b, NULL, NULL,
		     btree_node_bad_magic,
		     "bad magic: want %llx, got %llx",
		     bset_magic(c), le64_to_cpu(b->data->magic));

	while (b->written < (ptr_written ?: btree_sectors(c))) {
		unsigned sectors;
		bool first = !b->written;

		if (first) {
			bne = NULL;
			i = &b->data->keys;
		} else {
			bne = write_block(b);
			i = &bne->keys;

			if (i->seq != b->data->keys.seq)
				break;
		}

		struct nonce nonce = btree_nonce(i, b->written << 9);
		bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));

		btree_err_on(!good_csum_type,
			     bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
			     ? 0
			     : FSCK_CAN_FIX,
			     c, ca, b, i, NULL,
			     bset_unknown_csum,
			     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

		if (first) {
			sectors = vstruct_sectors(b->data, c->block_bits);
			if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
					 FSCK_CAN_FIX,
					 c, ca, b, i, NULL,
					 bset_past_end_of_btree_node,
					 "bset past end of btree node (offset %u len %u but written %zu)",
					 b->written, sectors, ptr_written ?: btree_sectors(c)))
				i->u64s = 0;
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
				bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
				if (csum_bad)
					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

				btree_err_on(csum_bad,
					     FSCK_CAN_FIX,
					     c, ca, b, i, NULL,
					     bset_bad_csum,
					     "%s",
					     (printbuf_reset(&buf),
					      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
					      buf.buf));

				ret = bset_encrypt(c, i, b->written << 9);
				if (bch2_fs_fatal_err_on(ret, c,
							 "decrypting btree node: %s", bch2_err_str(ret)))
					goto fsck_err;
			}

			if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
				struct bch_btree_ptr_v2 *bp =
					&bkey_i_to_btree_ptr_v2(&b->key)->v;

				bch2_bpos_to_text(&buf, b->data->min_key);
				prt_str(&buf, "-");
				bch2_bpos_to_text(&buf, b->data->max_key);

				btree_err_on(b->data->keys.seq != bp->seq,
					     0,
					     c, ca, b, NULL, NULL,
					     btree_node_bad_seq,
					     "got wrong btree node: got\n%s",
					     (printbuf_reset(&buf),
					      printbuf_indent_add(&buf, 2),
					      bch2_btree_node_header_to_text(&buf, b->data),
					      buf.buf));
			} else {
				btree_err_on(!b->data->keys.seq,
					     0,
					     c, ca, b, NULL, NULL,
					     btree_node_bad_seq,
					     "bad btree header: seq 0\n%s",
					     (printbuf_reset(&buf),
					      bch2_btree_node_header_to_text(&buf, b->data),
					      buf.buf));
			}

			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
				     0,
				     c, ca, b, NULL, NULL,
				     btree_node_unsupported_version,
				     "btree node does not have NEW_EXTENT_OVERWRITE set");
		} else {
			sectors = vstruct_sectors(bne, c->block_bits);
			if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
					 FSCK_CAN_FIX,
					 c, ca, b, i, NULL,
					 bset_past_end_of_btree_node,
					 "bset past end of btree node (offset %u len %u but written %zu)",
					 b->written, sectors, ptr_written ?: btree_sectors(c)))
				i->u64s = 0;
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
				bool csum_bad = bch2_crc_cmp(bne->csum, csum);
				if (ca && csum_bad)
					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

				btree_err_on(csum_bad,
					     FSCK_CAN_FIX,
					     c, ca, b, i, NULL,
					     bset_bad_csum,
					     "%s",
					     (printbuf_reset(&buf),
					      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
					      buf.buf));

				ret = bset_encrypt(c, i, b->written << 9);
				if (bch2_fs_fatal_err_on(ret, c,
							 "decrypting btree node: %s", bch2_err_str(ret)))
					goto fsck_err;
			}
		}

		b->version_ondisk = min(b->version_ondisk,
					le16_to_cpu(i->version));

		ret = bch2_validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
		if (ret)
			goto fsck_err;

		if (!b->written)
			btree_node_set_format(b, b->data->format);

		ret = bch2_validate_bset_keys(c, ca, b, i, READ, failed, err_msg);
		if (ret)
			goto fsck_err;

		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

		blacklisted = bch2_journal_seq_is_blacklisted(c,
					le64_to_cpu(i->journal_seq),
					true);

		btree_err_on(blacklisted && first,
			     FSCK_CAN_FIX,
			     c, ca, b, i, NULL,
			     bset_blacklisted_journal_seq,
			     "first btree node bset has blacklisted journal seq (%llu)",
			     le64_to_cpu(i->journal_seq));

		btree_err_on(blacklisted && ptr_written,
			     FSCK_CAN_FIX,
			     c, ca, b, i, NULL,
			     first_bset_blacklisted_journal_seq,
			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
			     le64_to_cpu(i->journal_seq),
			     b->written, b->written + sectors, ptr_written);

		b->written = min(b->written + sectors, btree_sectors(c));

		if (blacklisted && !first)
			continue;

		sort_iter_add(iter,
			      vstruct_idx(i, 0),
			      vstruct_last(i));

		max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
	}

	if (ptr_written) {
		btree_err_on(b->written < ptr_written,
			     FSCK_CAN_FIX,
			     c, ca, b, NULL, NULL,
			     btree_node_data_missing,
			     "btree node data missing: expected %u sectors, found %u",
			     ptr_written, b->written);
	} else {
		for (bne = write_block(b);
		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
		     bne = (void *) bne + block_bytes(c))
			btree_err_on(bne->keys.seq == b->data->keys.seq &&
				     !bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq),
							true),
				     FSCK_CAN_FIX,
				     c, ca, b, NULL, NULL,
				     btree_node_bset_after_end,
				     "found bset signature after last bset");
	}

	sorted = bch2_btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
	sorted->keys.u64s = 0;

	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
	memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
	       btree_buf_bytes(b) -
	       sizeof(struct btree_node) -
	       b->nr.live_u64s * sizeof(u64));

	b->data->keys.u64s = sorted->keys.u64s;
	*sorted = *b->data;
	swap(sorted, b->data);
	set_btree_bset(b, b->set, &b->data->keys);
	b->nsets = 1;
	b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);

	BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));

	bch2_btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);

	i = &b->data->keys;
	for (k = i->start; k != vstruct_last(i);) {
		struct bkey tmp;
		struct bkey_s u = __bkey_disassemble(b, k, &tmp);

		ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
		if (ret == -BCH_ERR_fsck_delete_bkey ||
		    (static_branch_unlikely(&bch2_inject_invalid_keys) &&
		     !bversion_cmp(u.k->bversion, MAX_VERSION))) {
			btree_keys_account_key_drop(&b->nr, 0, k);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_p_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			set_btree_bset_end(b, b->set);
			set_btree_node_need_rewrite(b);
			set_btree_node_need_rewrite_error(b);
			ret = 0;
			continue;
		}
		if (ret)
			goto fsck_err;

		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);

			bp.v->mem_ptr = 0;
		}

		k = bkey_p_next(k);
	}

	bch2_bset_build_aux_tree(b, b->set, false);

	bch2_set_bset_needs_whiteout(btree_bset_first(b), true);

	btree_node_reset_sib_u64s(b);

	if (updated_range)
		bch2_btree_node_drop_keys_outside_node(b);

	if (!ptr_written) {
		set_btree_node_need_rewrite(b);
		set_btree_node_need_rewrite_ptr_written_zero(b);
	}
fsck_err:
	mempool_free(iter, &c->btree.fill_iter);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
	return ret;
}
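
/*
 * Read completion work: run validation, and on failure retry from other
 * replicas until the read succeeds or no replicas are left; a successful
 * retry or repair may kick off an asynchronous rewrite of the node.
 */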
static void btree_node_read_work(struct work_struct *work)
{
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c = rb->c;
	struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
	struct btree *b = rb->b;
	struct bio *bio = &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	int ret = 0;

	CLASS(printbuf, buf)();
	bch2_log_msg_start(c, &buf);

	prt_printf(&buf, "btree node read error at btree ");
	bch2_btree_pos_to_text(&buf, c, b);
	prt_newline(&buf);

	while (1) {
		if (rb->have_ioref)
			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
		rb->have_ioref = false;

		if (!bio->bi_status) {
			memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
			bio->bi_iter.bi_size = btree_buf_bytes(b);

			if (bch2_btree_read_corrupt_device == rb->pick.ptr.dev ||
			    bch2_btree_read_corrupt_device < 0)
				bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);

			ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
		} else {
			ret = __bch2_err_throw(c, -blk_status_to_bch_err(bio->bi_status));
			bch2_mark_io_failure(&failed, &rb->pick, ret);
		}

		if (!ret ||
		    bch2_bkey_pick_read_device(c,
					       bkey_i_to_s_c(&b->key),
					       &failed, &rb->pick, -1) <= 0)
			break;

		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
		rb->have_ioref = ca != NULL;
		rb->start_time = local_clock();
		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector = rb->pick.ptr.offset;
		bio->bi_iter.bi_size = btree_buf_bytes(b);

		if (rb->have_ioref) {
			bio_set_dev(bio, ca->disk_sb.bdev);
			submit_bio_wait(bio);
		} else {
			bio->bi_status = BLK_STS_REMOVED;
		}

		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
					   rb->start_time, !bio->bi_status);
	}

	bch2_io_failures_to_text(&buf, c, &failed);

	/*
	 * only print retry success if we read from a replica with no errors
	 */
	if (ret) {
		/*
		 * Initialize buf.suppress before btree_lost_data(); that will
		 * clear it if it did any work (scheduling recovery passes,
		 * marking the superblock).
		 */
		buf.suppress = !__bch2_ratelimit(c, &c->btree.read_errors_hard);

		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, &buf, b->c.btree_id);
		prt_printf(&buf, "error %s\n", bch2_err_str(ret));
	} else if (failed.nr) {
		/* Separate ratelimit states for soft vs. hard errors */
		buf.suppress = !__bch2_ratelimit(c, &c->btree.read_errors_soft);

		if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
			prt_printf(&buf, "retry success");
		else
			prt_printf(&buf, "repair success");

		if ((failed.nr || btree_node_need_rewrite(b)) &&
		    c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
			prt_printf(&buf, " (rewriting node)");
			bch2_btree_node_rewrite_async(c, b);
		}

		prt_newline(&buf);
	} else {
		buf.suppress = true;
	}

	if (!buf.suppress)
		bch2_print_str(c, ret ? KERN_ERR : KERN_NOTICE, buf.buf);

	/*
	 * Do this late; unlike other btree_node_need_rewrite() cases if a node
	 * is merely degraded we should rewrite it before we update it, but we
	 * don't need to kick off an async rewrite now:
	 */
	if (btree_node_degraded(c, b)) {
		set_btree_node_need_rewrite(b);
		set_btree_node_need_rewrite_degraded(b);
	}

	async_object_list_del(c, btree_read_bio, rb->list_idx);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
			       rb->start_time);
	bio_put(&rb->bio);
	clear_btree_node_read_in_flight(b);
	smp_mb__after_atomic();
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

static void btree_node_read_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c = rb->c;
	struct bch_dev *ca = rb->have_ioref
		? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rb->start_time, !bio->bi_status);

	queue_work(c->btree.read_complete_wq, &rb->work);
}

void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
{
	bch2_bio_to_text(out, &rbio->bio);
}
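
/*
 * Issue the read for a btree node: pick a replica, set up the btree_read_bio
 * and submit it, either synchronously (completion work run inline) or
 * asynchronously via the endio handler.
 */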
void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
			  bool sync)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct btree_read_bio *rb;
	struct bch_dev *ca;
	struct bio *bio;
	int ret;

	trace_btree_node(c, b, btree_node_read);

	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
					 NULL, &pick, -1);

	if (ret <= 0) {
		CLASS(bch_log_msg_ratelimited, msg)(c);

		prt_str(&msg.m, "btree node read error: no device to read from\n at ");
		bch2_btree_pos_to_text(&msg.m, c, b);
		prt_newline(&msg.m);
		bch2_btree_lost_data(c, &msg.m, b->c.btree_id);

		if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology))
			bch2_fs_emergency_read_only2(c, &msg.m);

		set_btree_node_read_error(b);
		clear_btree_node_read_in_flight(b);
		smp_mb__after_atomic();
		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
		return;
	}

	ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);

	bio = bio_alloc_bioset(NULL,
			       buf_pages(b->data, btree_buf_bytes(b)),
			       REQ_OP_READ|REQ_SYNC|REQ_META,
			       GFP_NOFS,
			       &c->btree.bio);
	rb = container_of(bio, struct btree_read_bio, bio);
	rb->c = c;
	rb->b = b;
	rb->start_time = local_clock();
	rb->have_ioref = ca != NULL;
	rb->pick = pick;
	INIT_WORK(&rb->work, btree_node_read_work);
	bio->bi_iter.bi_sector = pick.ptr.offset;
	bio->bi_end_io = btree_node_read_endio;
	bch2_bio_map(bio, b->data, btree_buf_bytes(b));

	async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);

	if (rb->have_ioref) {
		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
			     bio_sectors(bio));
		bio_set_dev(bio, ca->disk_sb.bdev);

		if (sync) {
			submit_bio_wait(bio);
			bch2_latency_acct(ca, rb->start_time, READ);
			btree_node_read_work(&rb->work);
		} else {
			submit_bio(bio);
		}
	} else {
		bio->bi_status = BLK_STS_REMOVED;

		if (sync)
			btree_node_read_work(&rb->work);
		else
			queue_work(c->btree.read_complete_wq, &rb->work);
	}
}
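
/*
 * Read a btree root during recovery: allocate an in-memory node (cannibalizing
 * the btree cache if necessary), read it synchronously, and install it as the
 * root of this btree.
 */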
static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
				  const struct bkey_i *k, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	CLASS(closure_stack, cl)();

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, level != 0);
	bch2_btree_cache_cannibalize_unlock(trans);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree.cache, b, level, id));

	set_btree_node_read_in_flight(b);

	/* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
	bch2_trans_unlock(trans);
	bch2_btree_node_read(trans, b, true);

	if (btree_node_read_error(b)) {
		scoped_guard(mutex, &c->btree.cache.lock)
			bch2_btree_node_hash_remove(&c->btree.cache, b);

		ret = bch_err_throw(c, btree_node_read_error);
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return ret;
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			 const struct bkey_i *k, unsigned level)
{
	CLASS(btree_trans, trans)(c);
	return __bch2_btree_root_read(trans, id, k, level);
}

struct btree_node_scrub {
	struct bch_fs		*c;
	struct bch_dev		*ca;
	void			*buf;
	bool			used_mempool;
	unsigned		written;

	enum btree_id		btree;
	unsigned		level;
	struct bkey_buf		key;
	__le64			seq;

	struct work_struct	work;
	struct bio		bio;
	struct bio_vec		inline_vecs[];
};
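
/* Verify the magic and per-bset checksums of a scrubbed node buffer; no repair here. */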
static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
				   struct printbuf *err)
{
	unsigned written = 0;

	if (le64_to_cpu(data->magic) != bset_magic(c)) {
		prt_printf(err, "bad magic: want %llx, got %llx",
			   bset_magic(c), le64_to_cpu(data->magic));
		return false;
	}

	while (written < (ptr_written ?: btree_sectors(c))) {
		struct btree_node_entry *bne;
		struct bset *i;
		bool first = !written;

		if (first) {
			bne = NULL;
			i = &data->keys;
		} else {
			bne = (void *) data + (written << 9);
			i = &bne->keys;

			if (!ptr_written && i->seq != data->keys.seq)
				break;
		}

		struct nonce nonce = btree_nonce(i, written << 9);
		bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));

		if (first) {
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
				if (bch2_crc_cmp(data->csum, csum)) {
					bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
					return false;
				}
			}

			written += vstruct_sectors(data, c->block_bits);
		} else {
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
				if (bch2_crc_cmp(bne->csum, csum)) {
					bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
					return false;
				}
			}

			written += vstruct_sectors(bne, c->block_bits);
		}
	}

	return true;
}

static void btree_node_scrub_work(struct work_struct *work)
{
	struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
	struct bch_fs *c = scrub->c;
	CLASS(printbuf, err)();

	__bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
				 bkey_i_to_s_c(scrub->key.k));
	prt_newline(&err);

	if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
		int ret = bch2_trans_do(c,
			bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
						    scrub->key.k, 0));
		if (!bch2_err_matches(ret, ENOENT) &&
		    !bch2_err_matches(ret, EROFS))
			bch_err_fn_ratelimited(c, ret);
	}

	bch2_bkey_buf_exit(&scrub->key);
	bch2_btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
	enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
	kfree(scrub);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
}

static void btree_node_scrub_endio(struct bio *bio)
{
	struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);

	queue_work(scrub->c->btree.read_complete_wq, &scrub->work);
}
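
/*
 * Scrub one replica of a btree node: read it into a bounce buffer and verify
 * its checksums without touching the cached copy; on failure the node is
 * rewritten.
 */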
int bch2_btree_node_scrub(struct btree_trans *trans,
			  enum btree_id btree, unsigned level,
			  struct bkey_s_c k, unsigned dev)
{
	if (k.k->type != KEY_TYPE_btree_ptr_v2)
		return 0;

	struct bch_fs *c = trans->c;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
		return bch_err_throw(c, erofs_no_writes);

	struct extent_ptr_decoded pick;
	int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
	if (ret <= 0)
		goto err;

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
						BCH_DEV_READ_REF_btree_node_scrub);
	if (!ca) {
		ret = bch_err_throw(c, device_offline);
		goto err;
	}

	bool used_mempool = false;
	void *buf = bch2_btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);

	unsigned vecs = buf_pages(buf, c->opts.btree_node_size);

	struct btree_node_scrub *scrub =
		kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
	if (!scrub) {
		ret = -ENOMEM;
		goto err_free;
	}

	scrub->c = c;
	scrub->ca = ca;
	scrub->buf = buf;
	scrub->used_mempool = used_mempool;
	scrub->written = btree_ptr_sectors_written(k);

	scrub->btree = btree;
	scrub->level = level;
	bch2_bkey_buf_init(&scrub->key);
	bch2_bkey_buf_reassemble(&scrub->key, k);
	scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;

	INIT_WORK(&scrub->work, btree_node_scrub_work);

	bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->inline_vecs, vecs, REQ_OP_READ);
	bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
	scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
	scrub->bio.bi_end_io = btree_node_scrub_endio;
	submit_bio(&scrub->bio);
	return 0;
err_free:
	bch2_btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
err:
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
	return ret;
}