Update bcachefs sources to cbb2e45634dd bcachefs: fix simulateously upgrading & downgrading

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-01-05 14:18:29 -05:00
parent ba4c17c12b
commit b90031efaa
10 changed files with 167 additions and 91 deletions

View File

@ -1 +1 @@
d267e10a43b2e9ab37da6c9c991ca021142f6324
cbb2e45634dd3b6ae38e45856ab6215c687a8806

View File

@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
_k = _n) {
_n = bkey_p_next(_k);
if (!_k->u64s) {
printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
_k->_data - i->_data);
break;
}
k = bkey_disassemble(b, _k, &uk);
printbuf_reset(&buf);

View File

@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
prt_printf(out, "\n node offset %u", b->written);
prt_printf(out, "\n node offset %u/%u",
b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
prt_str(out, ": ");
@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
static bool __bkey_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
return false;
if (k->format > KEY_FORMAT_CURRENT)
return false;
struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
printbuf_exit(&buf);
return ret;
}
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
bool have_retry, bool *saw_error)
@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
unsigned next_good_key;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_format,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
"invalid bkey format %u", k->format))
goto drop_this_key;
/* XXX: validate k->u64s */
if (!write)
@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
c, NULL, b, i,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
goto drop_this_key;
}
if (write)
@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
prt_printf(&buf, " > ");
bch2_bkey_to_text(&buf, u.k);
bch2_dump_bset(c, b, i, 0);
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_out_of_order,
"%s", buf.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
"%s", buf.buf))
goto drop_this_key;
}
prev = k;
k = bkey_p_next(k);
continue;
drop_this_key:
next_good_key = k->u64s;
if (!next_good_key ||
(BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
version >= bcachefs_metadata_version_snapshot)) {
/*
* only do scanning if bch2_bkey_compat() has nothing to
* do
*/
if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
}
/*
* didn't find a good key, have to truncate the rest of
* the bset
*/
next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
}
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
}
fsck_err:
printbuf_exit(&buf);
@ -1007,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonce = btree_nonce(i, b->written << 9);
csum_bad = bch2_crc_cmp(b->data->csum,
csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
csum_bad = bch2_crc_cmp(b->data->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@ -1016,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
"invalid checksum");
"%s",
(printbuf_reset(&buf),
bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@ -1045,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
csum_bad = bch2_crc_cmp(bne->csum,
csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
csum_bad = bch2_crc_cmp(bne->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@ -1054,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
"invalid checksum");
"%s",
(printbuf_reset(&buf),
bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,

View File

@ -2818,34 +2818,11 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
#include "sb-members.h"
static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
if (trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "btree node read time:\n");
bch2_time_stats_to_text(&buf, &trans->c->times[BCH_TIME_btree_node_read]);
prt_str(&buf, "btree node read_done time:\n");
bch2_time_stats_to_text(&buf, &trans->c->times[BCH_TIME_btree_node_read_done]);
for_each_member_device(trans->c, ca) {
prt_printf(&buf, "device %u read time:\n", ca->dev_idx);
bch2_time_stats_to_text(&buf, &ca->io_latency[READ]);
}
struct btree_transaction_stats *s = btree_trans_stats(trans);
prt_str(&buf, "transaction duration:\n");
bch2_time_stats_to_text(&buf, &s->duration);
WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
"btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
(jiffies - trans->srcu_lock_time) / HZ);
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
}
void bch2_trans_srcu_unlock(struct btree_trans *trans)

View File

@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
})
static inline void bch2_csum_to_text(struct printbuf *out,
enum bch_csum_type type,
struct bch_csum csum)
{
const u8 *p = (u8 *) &csum;
unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
for (unsigned i = 0; i < bytes; i++)
prt_hex_byte(out, p[i]);
}
static inline void bch2_csum_err_msg(struct printbuf *out,
enum bch_csum_type type,
struct bch_csum expected,
struct bch_csum got)
{
prt_printf(out, "checksum error: got ");
bch2_csum_to_text(out, type, got);
prt_str(out, " should be ");
bch2_csum_to_text(out, type, expected);
prt_printf(out, " type %s", bch2_csum_types[type]);
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__

View File

@ -194,16 +194,6 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret;
/*
* check if unlinked, disable/defer until relink
*/
/*
* also: add a mode where a file is a tmpfile until fully,
* asynchronously written
*/
ret = file_write_and_wait_range(file, start, end);
if (ret)
goto out;

View File

@ -642,12 +642,17 @@ csum_err:
goto out;
}
struct printbuf buf = PRINTBUF;
buf.atomic++;
prt_str(&buf, "data ");
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
bch_err_inum_offset_ratelimited(ca,
rbio->read_pos.inode,
rbio->read_pos.offset << 9,
"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
"data %s", buf.buf);
printbuf_exit(&buf);
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;

View File

@ -27,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset)
}};
}
static bool jset_csum_good(struct bch_fs *c, struct jset *j)
static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
!bch2_crc_cmp(j->csum,
csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
*csum = (struct bch_csum) {};
return false;
}
*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
return !bch2_crc_cmp(j->csum, *csum);
}
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
@ -934,6 +938,7 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@ -966,7 +971,7 @@ reread:
* found on a different device, and missing or
* no journal entries will be handled later
*/
return 0;
goto out;
}
j = buf->data;
@ -983,12 +988,12 @@ reread:
ret = journal_read_buf_realloc(buf,
vstruct_bytes(j));
if (ret)
return ret;
goto err;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
return 0;
goto out;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@ -997,7 +1002,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
return ret;
goto err;
}
/*
@ -1007,20 +1012,28 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
return 0;
goto out;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
csum_good = jset_csum_good(c, j);
enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
"journal checksum error"))
"%s",
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
err.buf)))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret);
"error decrypting journal entry: %s",
bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@ -1039,7 +1052,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
return ret;
goto err;
}
next_block:
pr_debug("next");
@ -1048,7 +1061,11 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
return 0;
out:
ret = 0;
err:
printbuf_exit(&err);
return ret;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)

View File

@ -696,8 +696,11 @@ static int bch2_run_recovery_passes(struct bch_fs *c)
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
unsigned pass = c->curr_recovery_pass;
ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
(ret && c->curr_recovery_pass < pass))
continue;
if (ret)
break;

View File

@ -612,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
struct bch_csum csum;
size_t bytes;
int ret;
reread:
@ -628,7 +627,9 @@ reread:
if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
!uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
prt_printf(err, "Not a bcachefs superblock");
prt_str(err, "Not a bcachefs superblock (got magic ");
pr_uuid(err, sb->sb->magic.b);
prt_str(err, ")");
return -BCH_ERR_invalid_sb_magic;
}
@ -651,17 +652,16 @@ reread:
goto reread;
}
if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
if (csum_type >= BCH_CSUM_NR) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
null_nonce(), sb->sb);
struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
prt_printf(err, "bad checksum");
bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
return -BCH_ERR_invalid_sb_csum;
}
@ -1088,13 +1088,22 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
/*
* Downgrade, if superblock is at a higher version than currently
* supported:
*
* c->sb will be checked before we write the superblock, so update it as
* well:
*/
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
if (c->sb.version > bcachefs_metadata_version_current)
c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
}
if (c->sb.version > bcachefs_metadata_version_current) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
if (c->sb.version_min > bcachefs_metadata_version_current)
c->sb.version = bcachefs_metadata_version_current;
}
if (c->sb.version_min > bcachefs_metadata_version_current) {
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
c->sb.version_min = bcachefs_metadata_version_current;
}
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
@ -1261,6 +1270,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
pr_uuid(out, sb->uuid.b);
prt_newline(out);
prt_printf(out, "Magic number:");
prt_tab(out);
pr_uuid(out, sb->magic.b);
prt_newline(out);
prt_str(out, "Device index:");
prt_tab(out);
prt_printf(out, "%u", sb->dev_idx);