Update bcachefs sources to 4d28432bcc5f bcachefs: Validate bch_sb.offset field

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-03-16 16:08:41 -04:00
parent f42ee45c6e
commit c0836924b1
42 changed files with 691 additions and 510 deletions

View File

@ -1 +1 @@
46af7258b951a79a66511172ab8772ad2dfaa4e3
4d28432bcc5f91caf053f64a1cde1a6286adf4a6

View File

@ -7,6 +7,7 @@
#define _CRYPTO_SHA_H
#include <linux/types.h>
#include <sodium/crypto_hash_sha256.h>
#define SHA1_DIGEST_SIZE 20
#define SHA1_BLOCK_SIZE 64
@ -112,4 +113,9 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *hash);
static inline void sha256(const u8 *data, unsigned int len, u8 *out)
{
crypto_hash_sha256(out, data, len);
}
#endif

View File

@ -82,4 +82,71 @@ static inline s64 div_s64(s64 dividend, s32 divisor)
return div_s64_rem(dividend, divisor, &remainder);
}
#ifndef mul_u32_u32
/*
* Many a GCC version messes this up and generates a 64x64 mult :-(
*/
static inline u64 mul_u32_u32(u32 a, u32 b)
{
return (u64)a * b;
}
#endif
#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
#ifndef mul_u64_u64_shr
static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
{
return (u64)(((unsigned __int128)a * mul) >> shift);
}
#endif /* mul_u64_u64_shr */
#else
#ifndef mul_u64_u64_shr
static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
{
union {
u64 ll;
struct {
#ifdef __BIG_ENDIAN
u32 high, low;
#else
u32 low, high;
#endif
} l;
} rl, rm, rn, rh, a0, b0;
u64 c;
a0.ll = a;
b0.ll = b;
rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
/*
* Each of these lines computes a 64-bit intermediate result into "c",
* starting at bits 32-95. The low 32-bits go into the result of the
* multiplication, the high 32-bits are carried into the next step.
*/
rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low;
rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low;
rh.l.high = (c >> 32) + rh.l.high;
/*
* The 128-bit result of the multiplication is in rl.ll and rh.ll,
* shift it right and throw away the high part of the result.
*/
if (shift == 0)
return rl.ll;
if (shift < 64)
return (rl.ll >> shift) | (rh.ll << (64 - shift));
return rh.ll >> (shift & 63);
}
#endif /* mul_u64_u64_shr */
#endif
#endif /* _LINUX_MATH64_H */
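
The fallback above assembles the 128-bit product from four 32x32->64 partial products and then shifts it down. A minimal sketch of how the two paths could be cross-checked, assuming a compiler that provides __int128 for the reference (hypothetical test helpers, not part of the patch):

#include <linux/math64.h>

/* Reference implementation using the compiler's 128-bit type. */
static u64 mul_u64_u64_shr_ref(u64 a, u64 b, unsigned shift)
{
	return (u64)(((unsigned __int128)a * b) >> shift);
}

/* Returns true if the portable fallback agrees with the reference for one pair. */
static bool mul_u64_u64_shr_check(u64 a, u64 b)
{
	for (unsigned shift = 0; shift < 128; shift++)
		if (mul_u64_u64_shr(a, b, shift) != mul_u64_u64_shr_ref(a, b, shift))
			return false;
	return true;
}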

View File

@ -9,7 +9,9 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/math64.h>
#ifdef SYS_getrandom
static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
@ -67,4 +69,19 @@ static inline u32 get_random_u32_below(u32 ceil)
}
}
static inline u64 get_random_u64_below(u64 ceil)
{
if (ceil <= 1)
return 0;
if (ceil <= U32_MAX)
return get_random_u32_below(ceil);
for (;;) {
u64 rand = get_random_u64();
u64 mult = ceil * rand;
if (likely(mult >= -ceil % ceil))
return mul_u64_u64_shr(ceil, rand, 64);
}
}
#endif /* _LINUX_RANDOM_H */
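
get_random_u64_below() above is the multiply-and-shift bounded-random construction (Lemire's method): the high 64 bits of the 128-bit product ceil * rand are uniform over [0, ceil) once draws whose low 64 bits fall below 2^64 mod ceil are rejected. Since -ceil in u64 arithmetic equals 2^64 - ceil, the expression -ceil % ceil is exactly that threshold, computed without a 128-bit divide. A minimal 8-bit analogue of the same structure, purely illustrative (get_random_u8() is assumed to exist and return a uniform byte):

static u8 random_u8_below(u8 ceil)
{
	if (ceil <= 1)
		return 0;

	u8 bound = 256 % ceil;			/* equals (u8)-ceil % ceil */

	for (;;) {
		u8 rand = get_random_u8();	/* uniform over 0..255 */
		u16 prod = (u16)ceil * rand;	/* full 16-bit product */

		if ((u8)prod >= bound)		/* reject the biased low residues */
			return prod >> 8;	/* high byte is uniform in [0, ceil) */
	}
}

With ceil = 3, for example, bound = 1 and the only rejected draw is rand = 0, leaving exactly 85 inputs mapping to each of the outputs 0, 1 and 2.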

View File

@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
c, alloc_v2_unpack_error,
c, alloc_v3_unpack_error,
"unpack error");
fsck_err:
return ret;

View File

@ -979,7 +979,6 @@ struct bch_fs {
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
struct crypto_shash *sha256;
struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;

View File

@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{

View File

@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
btree_node_write_in_flight(b));
btree_node_data_free(bc, b);
cond_resched();
}
BUG_ON(!bch2_journal_error(&c->journal) &&

View File

@ -2080,11 +2080,6 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
unsigned commit_flags =
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw;
u64 start_time = wbio->start_time;
int ret = 0;
@ -2093,24 +2088,38 @@ static void btree_node_write_work(struct work_struct *work)
wbio->wbio.used_mempool,
wbio->data);
if (wbio->wbio.failed.nr) {
ret = bch2_trans_do(c,
bch2_btree_node_rewrite_key_get_iter(trans, b,
commit_flags));
} else if (!wbio->wbio.first_btree_write) {
bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
if (wbio->wbio.first_btree_write) {
if (wbio->wbio.failed.nr) {
}
} else {
ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
commit_flags, true));
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
}
if (ret) {
set_btree_node_noevict(b);
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
"writing btree node: %s", bch2_err_str(ret));
}
out:
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
"writing btree node: %s", bch2_err_str(ret));
goto out;
}
static void btree_node_write_endio(struct bio *bio)

View File

@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->key.k.p;
trans->paths_sorted = false;
bch2_btree_path_verify_level(trans, path, l - path->l);
return k;
}
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,

View File

@ -126,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
int bch2_btree_write_buffer_insert_err(struct btree_trans *,
enum btree_id, struct bkey_i *);
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
{
if (unlikely(!btree_type_uses_write_buffer(btree))) {
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
dump_stack();
return ret;
}
/*
* Most updates skip the btree write buffer until journal replay is
* finished because synchronization with journal replay relies on having

View File

@ -264,6 +264,22 @@ out:
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
enum btree_id btree, struct bkey_i *k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
bch2_btree_id_to_text(&buf, btree);
prt_str(&buf, "\n");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
return -EROFS;
}
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@ -312,7 +328,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
BUG_ON(!btree_type_uses_write_buffer(k->btree));
if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
goto err;
}
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);

View File

@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)
static inline int gen_after(u8 a, u8 b)
{
int r = gen_cmp(a, b);
return r > 0 ? r : 0;
return max(0, gen_cmp(a, b));
}
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)

View File

@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
return 0;
}
#if 0
/*
* This seems to be duplicating code in cmd_remove_passphrase() in
* bcachefs-tools, but we might want to switch userspace to use this - and
* perhaps add an ioctl for calling this at runtime, so we can take the
* passphrase off of a mounted filesystem (which has come up).
*/
int bch2_disable_encryption(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
@ -725,6 +733,10 @@ out:
return ret;
}
/*
* For enabling encryption on an existing filesystem: not hooked up yet, but it
* should be
*/
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
{
struct bch_encrypted_key key;
@ -781,6 +793,7 @@ err:
memzero_explicit(&key, sizeof(key));
return ret;
}
#endif
void bch2_fs_encryption_exit(struct bch_fs *c)
{
@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
crypto_free_shash(c->poly1305);
if (c->chacha20)
crypto_free_sync_skcipher(c->chacha20);
if (c->sha256)
crypto_free_shash(c->sha256);
}
int bch2_fs_encryption_init(struct bch_fs *c)
@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
struct bch_key key;
int ret = 0;
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
ret = PTR_ERR_OR_ZERO(c->sha256);
if (ret) {
c->sha256 = NULL;
bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;

View File

@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
#if 0
int bch2_disable_encryption(struct bch_fs *);
int bch2_enable_encryption(struct bch_fs *, bool);
#endif
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);

View File

@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static bool can_allocate_without_blocking(struct bch_fs *c,
struct data_update *m)
{
if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
return false;
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
? m->op.target
: 0;
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
rcu_read_lock();
unsigned nr_replicas = 0, i;
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
struct bch_dev *ca = bch2_dev_rcu(c, i);
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
if (!dev_buckets_free(ca, usage, m->op.watermark))
continue;
nr_replicas += ca->mi.durability;
if (nr_replicas >= m->op.nr_replicas)
break;
}
rcu_read_unlock();
return nr_replicas >= m->op.nr_replicas;
}
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_io_opts *io_opts)
{
@ -700,22 +666,49 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
}
rbio_init(&m->rbio.bio, c, *io_opts, NULL);
m->rbio.data_update = true;
m->rbio.bio.bi_iter.bi_size = buf_bytes;
m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
return 0;
}
static bool can_write_extent(struct bch_fs *c,
struct bch_devs_list *devs_have,
unsigned target)
static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
return -BCH_ERR_data_update_done_would_block;
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
? m->op.target
: 0;
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
darray_for_each(*devs_have, i)
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
return !bch2_is_zero(&devs, sizeof(devs));
rcu_read_lock();
unsigned nr_replicas = 0, i;
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
struct bch_dev *ca = bch2_dev_rcu(c, i);
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
if (!dev_buckets_free(ca, usage, m->op.watermark))
continue;
nr_replicas += ca->mi.durability;
if (nr_replicas >= m->op.nr_replicas)
break;
}
rcu_read_unlock();
if (!nr_replicas)
return -BCH_ERR_data_update_done_no_rw_devs;
if (nr_replicas < m->op.nr_replicas)
return -BCH_ERR_insufficient_devices;
return 0;
}
int bch2_data_update_init(struct btree_trans *trans,
@ -799,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans,
ptr_bit <<= 1;
}
if (!can_write_extent(c, &m->op.devs_have,
m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
/*
* Check if we have rw devices not in devs_have: this can happen
* if we're trying to move data on a ro or failed device
*
* If we can't move it, we need to clear the rebalance_work bit,
* if applicable
*
* Also, copygc should skip ro/failed devices:
*/
return -BCH_ERR_data_update_done_no_rw_devs;
}
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
@ -852,11 +831,22 @@ int bch2_data_update_init(struct btree_trans *trans,
goto out_bkey_buf_exit;
}
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
!can_allocate_without_blocking(c, m)) {
ret = -BCH_ERR_data_update_done_would_block;
/*
* Check if the allocation will succeed, to avoid getting an error later
* in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
* read:
*
* This guards against
* - BCH_WRITE_alloc_nowait allocations failing (promotes)
* - Destination target full
* - Device(s) in destination target offline
* - Insufficient durability available in destination target
* (i.e. trying to move a durability=2 replica to a target with a
* single durability=2 device)
*/
ret = can_write_extent(c, m);
if (ret)
goto out_bkey_buf_exit;
}
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,

View File

@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
{
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
m->nr_redundant = s->nr_redundant;
m->disk_label = s->disk_label;
m->blocks_nonempty = 0;
for (unsigned i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
}
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@ -1320,6 +1307,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (s->err) {
if (!bch2_err_matches(s->err, EROFS))
bch_err(c, "error creating stripe: error writing data buckets");
ret = s->err;
goto err;
}
@ -1328,6 +1316,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
ret = -BCH_ERR_ec_block_read;
goto err;
}
@ -1353,6 +1342,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
ret = -BCH_ERR_ec_block_write;
goto err;
}

View File

@ -231,6 +231,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_csum) \
x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
x(BCH_ERR_invalid_sb, invalid_sb_offset) \
x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
@ -273,21 +274,25 @@
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
x(EIO, extent_poisened) \
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
x(EIO, insufficient_journal_devices) \
x(EIO, device_offline) \
x(EIO, EIO_fault_injected) \
x(EIO, ec_block_read) \
x(EIO, ec_block_write) \
x(EIO, data_read) \
x(BCH_ERR_data_read, no_device_to_read_from) \
x(BCH_ERR_data_read, data_read_io_err) \
x(BCH_ERR_data_read, data_read_csum_err) \
x(BCH_ERR_data_read, data_read_retry) \
x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \
x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \
x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \
x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \
x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \
x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
x(BCH_ERR_data_read, data_read_decompress_err) \
x(BCH_ERR_data_read, data_read_decrypt_err) \
x(BCH_ERR_data_read, data_read_ptr_stale_race) \

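The reshuffled entries above hang the read errors off parent classes (data_read -> data_read_retry -> data_read_retry_avoid) rather than directly off EIO, so callers such as the read-retry path can match a whole family with one bch2_err_matches() test. A minimal sketch of how class-based matching can be modelled, with a made-up table - illustrative only, not the bcachefs implementation:

/* Each error code records its parent class; matching walks up the chain. */
enum my_err {
	MY_ERR_data_read = 1,
	MY_ERR_data_read_retry,
	MY_ERR_data_read_retry_avoid,
	MY_ERR_data_read_retry_csum_err,
	MY_ERR_NR,
};

static const enum my_err my_err_parent[MY_ERR_NR] = {
	[MY_ERR_data_read_retry]		= MY_ERR_data_read,
	[MY_ERR_data_read_retry_avoid]		= MY_ERR_data_read_retry,
	[MY_ERR_data_read_retry_csum_err]	= MY_ERR_data_read_retry_avoid,
};

static bool my_err_matches(enum my_err err, enum my_err class)
{
	while (err) {
		if (err == class)
			return true;
		err = my_err_parent[err];
	}
	return false;
}

/*
 * my_err_matches(MY_ERR_data_read_retry_csum_err, MY_ERR_data_read_retry) and
 * my_err_matches(MY_ERR_data_read_retry_csum_err, MY_ERR_data_read) are both
 * true, mirroring how bch2_err_matches(ret, BCH_ERR_data_read_retry) is used
 * in the read path below.
 */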
View File

@ -28,6 +28,8 @@
#include "trace.h"
#include "util.h"
#include <linux/random.h>
static const char * const bch2_extent_flags_strs[] = {
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
BCH_EXTENT_FLAGS()
@ -94,38 +96,30 @@ static inline int dev_failed(struct bch_dev *ca)
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
u64 p1_latency,
struct bch_dev *ca1,
const struct extent_ptr_decoded p2,
u64 p2_latency)
{
if (likely(!p1.do_ec_reconstruct &&
!p2.do_ec_reconstruct)) {
struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
if (unlikely(failed_delta))
return failed_delta < 0;
if (failed_delta)
return failed_delta < 0;
u64 l1 = dev_latency(ca1);
u64 l2 = dev_latency(ca2);
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
l1 *= l1;
l2 *= l2;
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
if (bch2_force_reconstruct_read)
if (unlikely(bch2_force_reconstruct_read))
return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
if (unlikely(crc_retry_delta))
return crc_retry_delta < 0;
/* Pick at random, biased in favor of the faster device: */
return get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
/*
@ -138,86 +132,105 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct extent_ptr_decoded *pick,
int dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
unsigned csum_retry = 0;
bool have_csum_retries = false;
int ret = 0;
bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return -BCH_ERR_extent_poisened;
again:
rcu_read_lock();
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 pick_latency;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
have_dirty_ptrs |= !p.ptr.cached;
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
ret = 0;
break;
rcu_read_unlock();
return 0;
}
/* Are we being asked to read from a specific device? */
if (dev >= 0 && p.ptr.dev != dev)
continue;
/*
* If there are any dirty pointers it's an error if we can't
* read:
*/
if (!ret && !p.ptr.cached)
ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
if (unlikely(failed) &&
(f = bch2_dev_io_failures(failed, p.ptr.dev))) {
have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
struct bch_dev_io_failures *f =
unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (unlikely(f)) {
p.crc_retry_nr = f->failed_csum_nr;
p.has_ec &= ~f->failed_ec;
if (p.has_ec &&
!f->failed_ec &&
(f->failed_io || f->failed_csum_nr))
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
have_io_errors |= f->failed_io;
have_io_errors |= f->failed_ec;
}
have_csum_errors |= !!f->failed_csum_nr;
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
f->failed_csum_nr > csum_retry)
f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue;
}
have_missing_devs |= ca && !bch2_dev_is_online(ca);
if (!ca || !bch2_dev_is_online(ca)) {
if (p.has_ec)
p.do_ec_reconstruct = true;
else
if (!p.has_ec)
continue;
p.do_ec_reconstruct = true;
}
if (p.has_ec && bch2_force_reconstruct_read)
if (bch2_force_reconstruct_read && p.has_ec)
p.do_ec_reconstruct = true;
if (ret > 0 && !ptr_better(c, p, *pick))
continue;
u64 p_latency = dev_latency(ca);
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
p_latency *= p_latency;
*pick = p;
ret = 1;
if (!have_pick ||
ptr_better(c,
p, p_latency, ca,
*pick, pick_latency)) {
*pick = p;
pick_latency = p_latency;
have_pick = true;
}
}
rcu_read_unlock();
if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
have_csum_retries &&
csum_retry < BCH_MAX_CSUM_RETRIES)) {
csum_retry++;
goto again;
}
if (have_pick)
return 1;
if (!have_dirty_ptrs)
return 0;
if (have_missing_devs)
return -BCH_ERR_no_device_to_read_from;
if (have_csum_errors)
return -BCH_ERR_data_read_csum_err;
if (have_io_errors)
return -BCH_ERR_data_read_io_err;
return ret;
WARN_ONCE(1, "unhandled error case in %s\n", __func__);
return -EINVAL;
}
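
ptr_better() above keeps the randomized device choice but squares the measured latencies first: squaring biases the weighted coin further toward the faster device while still routing some reads to the slower one so its latency estimate stays current. A small worked example of that weighting, with made-up latencies (not part of the patch):

static bool prefer_dev1_example(void)
{
	u64 l1 = 1000 * 1000;	/* device 1: ~1ms average, squared */
	u64 l2 = 2000 * 2000;	/* device 2: ~2ms average, squared */

	/*
	 * Uniform draw over [0, l1 + l2); the comparison is true with
	 * probability ~ l2 / (l1 + l2) = 4/5, so the faster device is
	 * preferred ~80% of the time (vs ~67% without squaring), while the
	 * slower device still sees ~20% of reads and keeps its latency
	 * numbers up to date.
	 */
	return get_random_u64_below(l1 + l2) > l1;
}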
/* KEY_TYPE_btree_ptr: */

View File

@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
(_ptr).has_ec = false; \
(_ptr).do_ec_reconstruct = false; \
(_ptr).has_ec = false; \
(_ptr).do_ec_reconstruct = false; \
(_ptr).crc_retry_nr = 0; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \

View File

@ -21,19 +21,18 @@ struct bch_extent_crc_unpacked {
struct extent_ptr_decoded {
bool has_ec;
unsigned do_ec_reconstruct;
bool do_ec_reconstruct;
u8 crc_retry_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
};
#define BCH_MAX_CSUM_RETRIES 3
struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
unsigned failed_csum_nr:4,
unsigned failed_csum_nr:6,
failed_io:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];

View File

@ -117,6 +117,9 @@ static int readpage_bio_extend(struct btree_trans *trans,
unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
/* ensure proper alignment */
order = min(order, __ffs(folio_offset|BIT(31)));
folio = xa_load(&iter->mapping->i_pages, folio_offset);
if (folio && !xa_is_value(folio))
break;

View File

@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
static int bch2_remount(struct super_block *sb, int *flags,
struct bch_opts opts)
{
struct bch_fs *c = sb->s_fs_info;
int ret = 0;
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
if (opts.read_only) {
bch2_fs_read_only(c);
sb->s_flags |= SB_RDONLY;
} else {
ret = bch2_fs_read_write(c);
if (ret) {
bch_err(c, "error going rw: %i", ret);
up_write(&c->state_lock);
ret = -EINVAL;
goto err;
}
sb->s_flags &= ~SB_RDONLY;
}
c->opts.read_only = opts.read_only;
up_write(&c->state_lock);
}
if (opt_defined(opts, errors))
c->opts.errors = opts.errors;
err:
return bch2_err_class(ret);
}
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct bch2_opts_parse *opts = fc->fs_private;
struct bch_fs *c = sb->s_fs_info;
int ret = 0;
return bch2_remount(sb, &fc->sb_flags, opts->opts);
opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
if (opts->opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
if (opts->opts.read_only) {
bch2_fs_read_only(c);
sb->s_flags |= SB_RDONLY;
} else {
ret = bch2_fs_read_write(c);
if (ret) {
bch_err(c, "error going rw: %i", ret);
up_write(&c->state_lock);
ret = -EINVAL;
goto err;
}
sb->s_flags &= ~SB_RDONLY;
}
c->opts.read_only = opts->opts.read_only;
up_write(&c->state_lock);
}
if (opt_defined(opts->opts, errors))
c->opts.errors = opts->opts.errors;
err:
return bch2_err_class(ret);
}
static const struct fs_context_operations bch2_context_ops = {

View File

@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid, gid, mode, rdev, parent);
}
static inline u32 bkey_generation(struct bkey_s_c k)
{
switch (k.k->type) {
case KEY_TYPE_inode:
case KEY_TYPE_inode_v2:
BUG();
case KEY_TYPE_inode_generation:
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
default:
return 0;
}
}
static struct bkey_i_inode_alloc_cursor *
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
@ -1198,6 +1185,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->_name##_from_inode = true; \
} else { \
opts->_name = c->opts._name; \
opts->_name##_from_inode = false; \
}
BCH_INODE_OPTS()
#undef x

View File

@ -25,8 +25,15 @@
#include "subvolume.h"
#include "trace.h"
#include <linux/random.h>
#include <linux/sched/mm.h>
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static bool bch2_target_congested(struct bch_fs *c, u16 target)
@ -59,7 +66,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
}
rcu_read_unlock();
return bch2_rand_range(nr * CONGESTED_MAX) < total;
return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
@ -97,14 +104,21 @@ static inline bool have_io_error(struct bch_io_failures *failed)
return failed && failed->nr;
}
static bool ptr_being_rewritten(struct bch_read_bio *orig,
unsigned dev,
unsigned flags)
static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
if (!(flags & BCH_READ_data_update))
EBUG_ON(rbio->split);
return rbio->data_update
? container_of(rbio, struct data_update, rbio)
: NULL;
}
static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
struct data_update *u = rbio_data_update(orig);
if (!u)
return false;
struct data_update *u = container_of(orig, struct data_update, rbio);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@ -193,7 +207,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
struct bpos pos,
struct extent_ptr_decoded *pick,
unsigned sectors,
unsigned flags,
struct bch_read_bio *orig,
struct bch_io_failures *failed)
{
@ -214,7 +227,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) &&
!ptr_being_rewritten(orig, ptr->dev, flags))
!ptr_being_rewritten(orig, ptr->dev))
update_opts.rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
@ -308,7 +321,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, sectors, flags, orig, failed);
k, pos, pick, sectors, orig, failed);
if (!promote)
return NULL;
@ -336,7 +349,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o
if (ret)
return ret;
if (rbio->flags & BCH_READ_data_update)
if (rbio->data_update)
prt_str(out, "(internal move) ");
return 0;
@ -416,83 +429,6 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct btree_iter *iter)
{
if (rbio->flags & BCH_READ_data_update) {
struct data_update *u = container_of(rbio, struct data_update, rbio);
return bch2_bkey_get_iter(trans, iter,
u->btree_id, bkey_start_pos(&u->k.k->k), 0);
} else {
struct bpos pos = rbio->read_pos;
int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
if (ret)
return bkey_s_c_err(ret);
return bch2_bkey_get_iter(trans, iter,
BTREE_ID_extents, pos, 0);
}
}
static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bch_io_failures *failed)
{
struct btree_iter iter = {};
struct bkey_s_c k;
int ret = lockrestart_do(trans,
bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
if (!ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr)
if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
bch2_mark_io_failure(failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_csum_err);
}
bch2_trans_iter_exit(trans, &iter);
}
static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k, struct bch_io_failures *failed)
{
u64 flags = bch2_bkey_extent_flags(k);
if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return 0;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
/*
* Make sure we actually attempt to read and got checksum failures from
* every replica
*/
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
continue;
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
return PTR_ERR_OR_ZERO(new) ?:
bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
}
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
@ -530,9 +466,6 @@ err:
goto retry;
if (ret) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
}
@ -560,7 +493,8 @@ static void bch2_rbio_retry(struct work_struct *work)
bvec_iter_sectors(rbio->bvec_iter));
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
if (!rbio->split) {
rbio->bio.bi_status = 0;
@ -577,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_last_fragment;
flags |= BCH_READ_must_clone;
int ret = flags & BCH_READ_data_update
int ret = rbio->data_update
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
: __bch2_read(trans, rbio, iter, inum, &failed, flags);
@ -591,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work)
bch2_inum_offset_err_msg_trans(trans, &buf,
(subvol_inum) { subvol, read_pos.inode },
read_pos.offset << 9));
if (rbio->flags & BCH_READ_data_update)
if (rbio->data_update)
prt_str(&buf, "(internal move) ");
prt_str(&buf, "successful retry");
@ -647,7 +581,7 @@ static void bch2_read_io_err(struct work_struct *work)
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@ -734,7 +668,7 @@ static void bch2_read_csum_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@ -778,42 +712,6 @@ static void bch2_read_decrypt_err(struct work_struct *work)
printbuf_exit(&buf);
}
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
static void corrupt_bio(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
bio_for_each_segment(bv, bio, iter) {
unsigned u64s = bv.bv_len / sizeof(u64);
if (offset < u64s) {
u64 *segment = bvec_kmap_local(&bv);
segment[offset] = get_random_u64();
kunmap_local(segment);
return;
}
offset -= u64s;
}
}
static inline void maybe_corrupt_bio(struct bio *bio)
{
if (bch2_read_corrupt_ratio &&
!get_random_u32_below(bch2_read_corrupt_ratio))
corrupt_bio(bio);
}
#else
static inline void maybe_corrupt_bio(struct bio *bio)
{
}
#endif
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
@ -821,9 +719,10 @@ static void __bch2_read_endio(struct work_struct *work)
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_read_bio *parent = bch2_rbio_parent(rbio);
struct bio *src = &rbio->bio;
struct bio *dst = &parent->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
@ -841,7 +740,7 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}
maybe_corrupt_bio(src);
bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
@ -853,7 +752,7 @@ static void __bch2_read_endio(struct work_struct *work)
*/
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
BLK_STS_IOERR);
goto out;
}
@ -873,7 +772,7 @@ static void __bch2_read_endio(struct work_struct *work)
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);
if (likely(!(rbio->flags & BCH_READ_data_update))) {
if (likely(!parent->data_update)) {
/* Adjust crc to point to subset of data we want: */
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
@ -1043,6 +942,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_read_bio *rbio = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
struct data_update *u = rbio_data_update(orig);
int ret = 0;
if (bkey_extent_is_inline_data(k.k)) {
@ -1106,16 +1006,7 @@ retry_pick:
goto retry_pick;
}
/*
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans);
else
bch2_trans_unlock_long(trans);
if (!(flags & BCH_READ_data_update)) {
if (likely(!u)) {
if (!(flags & BCH_READ_last_fragment) ||
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_must_clone;
@ -1138,12 +1029,10 @@ retry_pick:
bounce = true;
}
} else {
read_full = true;
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
percpu_ref_put(&ca->io_ref);
@ -1152,6 +1041,7 @@ retry_pick:
}
iter.bi_size = pick.crc.compressed_size << 9;
read_full = true;
}
if (orig->opts.promote_target || have_io_error(failed))
@ -1242,10 +1132,14 @@ retry_pick:
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
/* XXX: also nvme read recovery level */
if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
rbio->bio.bi_opf |= REQ_FUA;
if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio);
if (!(flags & BCH_READ_data_update))
if (!u)
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
else
this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
@ -1255,7 +1149,7 @@ retry_pick:
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update))
if (ca && pick.ptr.cached && !u)
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@ -1264,6 +1158,15 @@ retry_pick:
trace_and_count(c, io_read_split, &orig->bio);
}
/*
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans);
else
bch2_trans_unlock_long(trans);
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
@ -1275,7 +1178,7 @@ retry_pick:
printbuf_exit(&buf);
bch2_rbio_error(rbio,
-BCH_ERR_data_read_device_offline,
-BCH_ERR_data_read_retry_device_offline,
BLK_STS_IOERR);
goto out;
}
@ -1302,7 +1205,7 @@ retry_pick:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
BLK_STS_IOERR);
goto out;
}
@ -1314,6 +1217,8 @@ out:
if (likely(!(flags & BCH_READ_in_retry))) {
return 0;
} else {
bch2_trans_unlock(trans);
int ret;
rbio->context = RBIO_CONTEXT_UNBOUND;
@ -1324,7 +1229,7 @@ out:
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_csum_err);
ret == -BCH_ERR_data_read_retry_csum_err);
return ret;
}
@ -1341,11 +1246,11 @@ hole:
this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
bvec_iter_sectors(iter));
/*
* won't normally happen in the BCH_READ_data_update
* (bch2_move_extent()) path, but if we retry and the extent we wanted
* to read no longer exists we have to signal that:
* won't normally happen in the data update (bch2_move_extent()) path,
* but if we retry and the extent we wanted to read no longer exists we
* have to signal that:
*/
if (flags & BCH_READ_data_update)
if (u)
orig->ret = -BCH_ERR_data_read_key_overwritten;
zero_fill_bio_iter(&orig->bio, iter);
@ -1366,7 +1271,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bkey_s_c k;
int ret;
BUG_ON(flags & BCH_READ_data_update);
EBUG_ON(rbio->data_update);
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@ -1393,23 +1298,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
if (ret)
goto err;
if (unlikely(flags & BCH_READ_in_retry)) {
struct data_update *u = flags & BCH_READ_data_update
? container_of(rbio, struct data_update, rbio)
: NULL;
if (u &&
!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}
if (!bkey_deleted(&sk.k->k) &&
!bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
failed->nr = 0;
}
s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
@ -1447,16 +1335,18 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
flags |= BCH_READ_must_bounce;
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}
if (unlikely(ret)) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);
bch2_trans_iter_exit(trans, &iter);
if (ret) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
@ -1472,7 +1362,6 @@ err:
bch2_rbio_done(rbio);
}
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
return ret;
}

View File

@ -36,7 +36,8 @@ struct bch_read_bio {
u16 flags;
union {
struct {
u16 promote:1,
u16 data_update:1,
promote:1,
bounce:1,
split:1,
have_ioref:1,
@ -109,7 +110,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
x(retry_if_stale) \
x(may_promote) \
x(user_mapped) \
x(data_update) \
x(last_fragment) \
x(must_bounce) \
x(must_clone) \
@ -163,12 +163,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
{
struct bch_read_bio *rbio = to_rbio(bio);
rbio->c = orig->c;
rbio->_state = 0;
rbio->ret = 0;
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
rbio->c = orig->c;
rbio->_state = 0;
rbio->flags = 0;
rbio->ret = 0;
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
return rbio;
}
@ -182,7 +183,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
rbio->start_time = local_clock();
rbio->c = c;
rbio->_state = 0;
rbio->ret = 0;
rbio->flags = 0;
rbio->ret = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
return rbio;

View File

@ -34,6 +34,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_write_corrupt_ratio;
module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(write_corrupt_ratio, "");
#endif
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bounce = true;
}
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
if (!bounce && write_corrupt_ratio) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
#endif
saved_iter = dst->bi_iter;
do {
@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
init_append_extent(op, wp, version, crc);
#ifdef CONFIG_BCACHEFS_DEBUG
if (write_corrupt_ratio) {
swap(dst->bi_iter.bi_size, dst_len);
bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
swap(dst->bi_iter.bi_size, dst_len);
}
#endif
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
@ -1394,6 +1417,7 @@ retry:
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl);
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true);
@ -1718,20 +1742,26 @@ static const char * const bch2_write_flags[] = {
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
prt_str(out, "pos: ");
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
prt_str(out, "started: ");
prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out);
prt_str(out, "flags: ");
prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2);
}

View File

@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(w, struct journal_buf, io);

View File

@ -101,13 +101,25 @@ static void move_free(struct moving_io *io)
static void move_write_done(struct bch_write_op *op)
{
struct moving_io *io = container_of(op, struct moving_io, write.op);
struct bch_fs *c = op->c;
struct moving_context *ctxt = io->write.ctxt;
if (io->write.op.error)
ctxt->write_error = true;
if (op->error) {
if (trace_io_move_write_fail_enabled()) {
struct printbuf buf = PRINTBUF;
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_dec(&io->write.ctxt->write_ios);
bch2_write_op_to_text(&buf, op);
prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
trace_io_move_write_fail(c, buf.buf);
printbuf_exit(&buf);
}
this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
ctxt->write_error = true;
}
atomic_sub(io->write_sectors, &ctxt->write_sectors);
atomic_dec(&ctxt->write_ios);
move_free(io);
closure_put(&ctxt->cl);
}
@ -359,7 +371,6 @@ int bch2_move_extent(struct moving_context *ctxt,
bkey_start_pos(k.k),
iter->btree_id, k, 0,
NULL,
BCH_READ_data_update|
BCH_READ_last_fragment,
data_opts.scrub ? data_opts.read_dev : -1);
return 0;
@ -580,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
s64 offset_into_extent = 0;
bch2_trans_iter_exit(trans, &reflink_iter);
k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
@ -599,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
* pointer - need to fixup iter->k
*/
extent_iter = &reflink_iter;
offset_into_extent = 0;
}
if (!bkey_extent_is_direct_data(k.k))
@ -712,7 +724,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
struct bkey_s_c k;
unsigned sectors_moved = 0;
struct bkey_buf last_flushed;
int ret = 0;
@ -834,7 +845,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (ctxt->stats)
atomic64_add(sectors, &ctxt->stats->sectors_seen);
sectors_moved += sectors;
next:
bch2_btree_iter_advance(&bp_iter);
}
@ -1253,17 +1263,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
prt_printf(out, "bytes seen: ");
prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
prt_printf(out, "bytes seen:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
prt_printf(out, "bytes moved: ");
prt_printf(out, "bytes moved:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
prt_printf(out, "bytes raced: ");
prt_printf(out, "bytes raced:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@ -1272,7 +1282,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
struct moving_io *io;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
@ -1292,6 +1303,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
printbuf_indent_add(out, 2);
mutex_lock(&ctxt->lock);
struct moving_io *io;
list_for_each_entry(io, &ctxt->ios, io_list)
bch2_data_update_inflight_to_text(out, &io->write);
mutex_unlock(&ctxt->lock);

View File

@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out);
rcu_read_lock();
struct task_struct *t = rcu_dereference(c->copygc_thread);
if (t)
get_task_struct(t);
rcu_read_unlock();
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
put_task_struct(t);
}
}
static int bch2_copygc_thread(void *arg)

View File

@ -186,6 +186,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(checksum_err_retry_nr, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(0, 32), \
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \

View File

@ -26,9 +26,8 @@
/* bch_extent_rebalance: */
static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
return NULL;
}
static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
struct bch_io_opts *opts,
struct bkey_s_c k,
@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
if (!opts)
return 0;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
printbuf_tabstop_push(out, 32);
struct bch_fs_rebalance *r = &c->rebalance;
/* print pending work */
struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
u64 v;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
prt_printf(out, "pending work:\t");
prt_human_readable_u64(out, v);
prt_printf(out, "\n\n");
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
prt_printf(out, "io wait duration:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
prt_str(out, "io wait remaining: ");
prt_printf(out, "io wait remaining:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
prt_str(out, "duration waited: ");
prt_printf(out, "duration waited:\t");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
break;
}
prt_newline(out);
rcu_read_lock();
struct task_struct *t = rcu_dereference(c->rebalance.thread);
if (t)
get_task_struct(t);
rcu_read_unlock();
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
put_task_struct(t);
}
printbuf_indent_sub(out, 2);
}

View File

@ -899,7 +899,7 @@ use_clean:
* journal sequence numbers:
*/
if (!c->sb.clean)
journal_seq += 8;
journal_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",

View File

@ -22,6 +22,7 @@ enum counters_flags {
x(io_move_write, 36, TYPE_SECTORS) \
x(io_move_finish, 37, TYPE_SECTORS) \
x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \

View File

@ -12,7 +12,6 @@
#include "super.h"
#include <linux/crc32c.h>
#include <crypto/hash.h>
#include <crypto/sha2.h>
static inline enum bch_str_hash_type
@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
};
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
desc->tfm = c->sha256;
crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
sizeof(bi->bi_hash_seed), digest);
sha256((const u8 *)&bi->bi_hash_seed,
sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
}

View File

@ -365,10 +365,9 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
enum bch_validate_flags flags, struct printbuf *out)
int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
enum bch_validate_flags flags, struct printbuf *out)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
int ret;
@ -377,15 +376,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (ret)
return ret;
if (sb->features[1] ||
(le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
prt_printf(out, "Filesystem has incompatible features");
u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
unsigned incompat_bit = 0;
if (incompat)
incompat_bit = __ffs64(incompat);
else if (sb->features[1])
incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
if (incompat_bit) {
prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
incompat_bit,
bch2_sb_features[BCH_FEATURE_NR - 1],
BCH_FEATURE_NR - 1);
return -BCH_ERR_invalid_sb_features;
}
if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
prt_printf(out, "Filesystem has incompatible version");
prt_str(out, "Filesystem has incompatible version ");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_str(out, ", current version ");
bch2_version_to_text(out, bcachefs_metadata_version_current);
return -BCH_ERR_invalid_sb_features;
}
@ -399,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_uuid;
}
if (!(flags & BCH_VALIDATE_write) &&
le64_to_cpu(sb->offset) != read_offset) {
prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
le64_to_cpu(sb->offset), read_offset);
return -BCH_ERR_invalid_sb_offset;
}
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
prt_printf(out, "Bad number of member devices %u (max %u)",
@ -457,6 +475,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
!BCH_SB_CSUM_ERR_RETRY_NR(sb))
SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}
#ifdef __KERNEL__
@ -874,7 +896,7 @@ got_super:
sb->have_layout = true;
ret = bch2_sb_validate(sb, 0, &err);
ret = bch2_sb_validate(sb->sb, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@ -1031,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) {
printbuf_reset(&err);
ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;

View File

@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);

View File

@ -75,9 +75,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: crc64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
@ -1838,7 +1835,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_late;
up_write(&c->state_lock);
return 0;
out:
printbuf_exit(&label);
printbuf_exit(&errbuf);
bch_err_fn(c, ret);
return ret;
err_unlock:
mutex_unlock(&c->sb_lock);
@ -1847,10 +1848,7 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
printbuf_exit(&label);
printbuf_exit(&errbuf);
bch_err_fn(c, ret);
return ret;
goto out;
err_late:
up_write(&c->state_lock);
ca = NULL;

View File

@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
write_attribute(read_fua_test);
read_attribute(uuid);
read_attribute(minor);
@ -395,6 +396,71 @@ SHOW(bch2_fs)
return 0;
}
static int read_fua_test(struct bch_fs *c)
{
int ret = 0;
unsigned bs = 4096;
struct bio *bio;
void *buf;
struct bch_dev *ca = bch2_dev_get_ioref(c, 0, READ);
if (!ca)
return -EINVAL;
bio = bio_kmalloc(1, GFP_KERNEL);
if (!bio) {
ret = -ENOMEM;
goto err;
}
buf = kmalloc(bs, GFP_KERNEL);
if (!buf)
goto err;
u64 start = ktime_get_ns();
for (unsigned i = 0; i < 1000; i++) {
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
bch2_bio_map(bio, buf, bs);
ret = submit_bio_wait(bio);
if (ret)
goto err;
}
u64 ns_nofua = ktime_get_ns() - start;
start = ktime_get_ns();
for (unsigned i = 0; i < 1000; i++) {
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
bch2_bio_map(bio, buf, bs);
ret = submit_bio_wait(bio);
if (ret)
goto err;
}
u64 ns_fua = ktime_get_ns() - start;
u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
start = ktime_get_ns();
for (unsigned i = 0; i < 1000; i++) {
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
bio->bi_iter.bi_sector = (get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
bch2_bio_map(bio, buf, bs);
ret = submit_bio_wait(bio);
if (ret)
goto err;
}
u64 ns_rand = ktime_get_ns() - start;
pr_info("ns nofua %llu", ns_nofua);
pr_info("ns fua %llu", ns_fua);
pr_info("ns random %llu", ns_rand);
err:
kfree(buf);
kfree(bio);
percpu_ref_put(&ca->io_ref);
bch_err_fn(c, ret);
return ret;
}
STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -451,6 +517,9 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_freelist_wakeup)
closure_wake_up(&c->freelist_wait);
if (attr == &sysfs_read_fua_test)
read_fua_test(c);
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@ -580,6 +649,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_btree_updates,
&sysfs_read_fua_test,
&sysfs_gc_gens_pos,

View File

@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail,
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_write_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)

View File

@ -653,21 +653,6 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
return 0;
}
size_t bch2_rand_range(size_t max)
{
size_t rand;
if (!max)
return 0;
do {
rand = get_random_long();
rand &= roundup_pow_of_two(max) - 1;
} while (rand >= max);
return rand;
}
void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
{
struct bio_vec bv;
@ -698,6 +683,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_corrupt_bio(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
bio_for_each_segment(bv, bio, iter) {
unsigned u64s = bv.bv_len / sizeof(u64);
if (offset < u64s) {
u64 *segment = bvec_kmap_local(&bv);
segment[offset] = get_random_u64();
kunmap_local(segment);
return;
}
offset -= u64s;
}
}
#endif
#if 0
void eytzinger1_test(void)
{

View File

@ -401,11 +401,21 @@ do { \
_ret; \
})
size_t bch2_rand_range(size_t);
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_corrupt_bio(struct bio *);
static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
{
if (ratio && !get_random_u32_below(ratio))
bch2_corrupt_bio(bio);
}
#else
#define bch2_maybe_corrupt_bio(...) do {} while (0)
#endif
static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s)
{