Update bcachefs sources to 4d28432bcc5f bcachefs: Validate bch_sb.offset field

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet <kent.overstreet@linux.dev>
Date: 2025-03-16 16:08:41 -04:00
Parent: f42ee45c6e
Commit: c0836924b1
42 changed files with 691 additions and 510 deletions


@@ -1 +1 @@
-46af7258b951a79a66511172ab8772ad2dfaa4e3
+4d28432bcc5f91caf053f64a1cde1a6286adf4a6


@@ -7,6 +7,7 @@
#define _CRYPTO_SHA_H
#include <linux/types.h>
#include <sodium/crypto_hash_sha256.h>
#define SHA1_DIGEST_SIZE 20
#define SHA1_BLOCK_SIZE 64
@@ -112,4 +113,9 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
			       unsigned int len, u8 *hash);
static inline void sha256(const u8 *data, unsigned int len, u8 *out)
{
crypto_hash_sha256(out, data, len);
}
#endif
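The userspace shim now provides a kernel-style sha256() helper backed by libsodium, which is what lets the filesystem code drop its crypto_shash sha256 transform (see the checksum.c hunk further down). A minimal standalone usage sketch, assuming only libsodium is installed; everything outside the helper itself is illustrative:

	/* Hypothetical usage sketch (not from the commit): the shim maps the
	 * kernel-style sha256() helper onto libsodium's crypto_hash_sha256(),
	 * so callers can hash a buffer without a crypto_shash transform. */
	#include <stdio.h>
	#include <string.h>
	#include <sodium/crypto_hash_sha256.h>

	typedef unsigned char u8;

	static inline void sha256(const u8 *data, unsigned int len, u8 *out)
	{
		crypto_hash_sha256(out, data, len);
	}

	int main(void)
	{
		const char *msg = "bcachefs";
		u8 digest[crypto_hash_sha256_BYTES];	/* 32 bytes */

		sha256((const u8 *)msg, strlen(msg), digest);

		for (unsigned i = 0; i < sizeof(digest); i++)
			printf("%02x", digest[i]);
		printf("\n");
		return 0;
	}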


@@ -82,4 +82,71 @@ static inline s64 div_s64(s64 dividend, s32 divisor)
	return div_s64_rem(dividend, divisor, &remainder);
}
#ifndef mul_u32_u32
/*
* Many a GCC version messes this up and generates a 64x64 mult :-(
*/
static inline u64 mul_u32_u32(u32 a, u32 b)
{
return (u64)a * b;
}
#endif
#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
#ifndef mul_u64_u64_shr
static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
{
return (u64)(((unsigned __int128)a * mul) >> shift);
}
#endif /* mul_u64_u64_shr */
#else
#ifndef mul_u64_u64_shr
static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
{
union {
u64 ll;
struct {
#ifdef __BIG_ENDIAN
u32 high, low;
#else
u32 low, high;
#endif
} l;
} rl, rm, rn, rh, a0, b0;
u64 c;
a0.ll = a;
b0.ll = b;
rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
/*
* Each of these lines computes a 64-bit intermediate result into "c",
* starting at bits 32-95. The low 32-bits go into the result of the
* multiplication, the high 32-bits are carried into the next step.
*/
rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low;
rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low;
rh.l.high = (c >> 32) + rh.l.high;
/*
* The 128-bit result of the multiplication is in rl.ll and rh.ll,
* shift it right and throw away the high part of the result.
*/
if (shift == 0)
return rl.ll;
if (shift < 64)
return (rl.ll >> shift) | (rh.ll << (64 - shift));
return rh.ll >> (shift & 63);
}
#endif /* mul_u64_u64_shr */
#endif
#endif /* _LINUX_MATH64_H */
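The fallback mul_u64_u64_shr() builds the 128-bit product out of four 32x32->64 partial products, as described in the comment above. A small self-check of that decomposition against the compiler's native 128-bit multiply, assuming a 64-bit gcc/clang with __int128 (the test harness and values are illustrative, not part of the commit):

	/* Hypothetical self-check: verify the 32x32 partial-product
	 * decomposition used by the mul_u64_u64_shr() fallback against the
	 * compiler's native 128-bit multiply. */
	#include <assert.h>
	#include <stdint.h>

	static unsigned __int128 mul_via_partials(uint64_t a, uint64_t b)
	{
		uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
		uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;

		/* (a_hi*2^32 + a_lo) * (b_hi*2^32 + b_lo) expanded into four
		 * 32x32->64 products, exactly what the fallback accumulates: */
		return ((unsigned __int128)(a_hi * b_hi) << 64) +
		       ((unsigned __int128)(a_hi * b_lo) << 32) +
		       ((unsigned __int128)(a_lo * b_hi) << 32) +
		       (unsigned __int128)(a_lo * b_lo);
	}

	int main(void)
	{
		uint64_t a = 0xdeadbeefcafebabeULL, b = 0x0123456789abcdefULL;

		assert(mul_via_partials(a, b) == (unsigned __int128)a * b);
		assert((uint64_t)(mul_via_partials(a, b) >> 7) ==
		       (uint64_t)(((unsigned __int128)a * b) >> 7));
		return 0;
	}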


@@ -9,7 +9,9 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/math64.h>
#ifdef SYS_getrandom
static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
@@ -67,4 +69,19 @@ static inline u32 get_random_u32_below(u32 ceil)
	}
}
static inline u64 get_random_u64_below(u64 ceil)
{
if (ceil <= 1)
return 0;
if (ceil <= U32_MAX)
return get_random_u32_below(ceil);
for (;;) {
u64 rand = get_random_u64();
u64 mult = ceil * rand;
if (likely(mult >= -ceil % ceil))
return mul_u64_u64_shr(ceil, rand, 64);
}
}
#endif /* _LINUX_RANDOM_H */
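get_random_u64_below() uses the multiply-and-reject technique: the high 64 bits of the 128-bit product ceil * rand are the candidate result, and the low 64 bits ("mult") are compared against 2^64 mod ceil (written as -ceil % ceil) so the few candidates that would bias the distribution are rejected and redrawn. A standalone demonstration scaled down to 8 bits so the whole space can be enumerated (this is a model of the idea, not bcachefs code):

	/* Hypothetical demonstration of the rejection rule, scaled to 8 bits:
	 * a candidate is rejected when the low half of the widened product
	 * ceil*r is below 2^N mod ceil; the survivors hit every output value
	 * exactly the same number of times. */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		const unsigned N = 8;			/* stand-in for 64 bits */
		uint8_t ceil = 6;
		unsigned counts[6] = {0};
		uint8_t threshold = (uint8_t)(-ceil) % ceil;	/* == 2^8 mod ceil */

		for (unsigned r = 0; r < (1u << N); r++) {
			uint16_t prod = (uint16_t)(ceil * r);
			uint8_t lo = (uint8_t)prod;	/* "mult" in the diff */
			uint8_t hi = prod >> N;		/* mul_u64_u64_shr(..., 64) analogue */

			if (lo >= threshold)		/* accept */
				counts[hi]++;
		}

		for (unsigned i = 1; i < ceil; i++)
			assert(counts[i] == counts[0]);	/* perfectly uniform */
		return 0;
	}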


@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
	int ret = 0;
	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
-			 c, alloc_v2_unpack_error,
+			 c, alloc_v3_unpack_error,
			 "unpack error");
fsck_err:
	return ret;


@@ -979,7 +979,6 @@ struct bch_fs {
	mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
	size_t zstd_workspace_size;
	struct crypto_shash *sha256;
	struct crypto_sync_skcipher *chacha20;
	struct crypto_shash *poly1305;


@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
					struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
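Each LE64_BITMASK() entry packs a superblock option into a bit range of a little-endian flags word; the new BCH_SB_CSUM_ERR_RETRY_NR occupies bits 14-20 of flags[6], giving a 6-bit retry count. An illustrative sketch of the get/set arithmetic such an accessor performs (the real macro also handles the on-disk little-endian conversion; the helper names below are made up):

	/* Illustrative sketch, not the bcachefs macro: reading and writing a
	 * 6-bit option packed into bits 14..20 of a 64-bit flags word. */
	#include <assert.h>
	#include <stdint.h>

	#define RETRY_NR_OFFSET	14
	#define RETRY_NR_END	20	/* exclusive, so the field is 6 bits wide */

	static uint64_t get_csum_err_retry_nr(uint64_t flags6)
	{
		uint64_t mask = (1ULL << (RETRY_NR_END - RETRY_NR_OFFSET)) - 1;
		return (flags6 >> RETRY_NR_OFFSET) & mask;
	}

	static uint64_t set_csum_err_retry_nr(uint64_t flags6, uint64_t v)
	{
		uint64_t mask = (1ULL << (RETRY_NR_END - RETRY_NR_OFFSET)) - 1;

		flags6 &= ~(mask << RETRY_NR_OFFSET);
		flags6 |= (v & mask) << RETRY_NR_OFFSET;
		return flags6;
	}

	int main(void)
	{
		uint64_t flags6 = 0;

		flags6 = set_csum_err_retry_nr(flags6, 3);
		assert(get_csum_err_retry_nr(flags6) == 3);
		return 0;
	}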


@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
			btree_node_write_in_flight(b));
		btree_node_data_free(bc, b);
		cond_resched();
	}
	BUG_ON(!bch2_journal_error(&c->journal) &&


@@ -2080,11 +2080,6 @@ static void btree_node_write_work(struct work_struct *work)
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c = wbio->wbio.c;
	struct btree *b = wbio->wbio.bio.bi_private;
unsigned commit_flags =
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw;
	u64 start_time = wbio->start_time;
	int ret = 0;
@@ -2093,24 +2088,38 @@ static void btree_node_write_work(struct work_struct *work)
		wbio->wbio.used_mempool,
		wbio->data);
-	if (wbio->wbio.failed.nr) {
-		ret = bch2_trans_do(c,
-			bch2_btree_node_rewrite_key_get_iter(trans, b,
-				commit_flags));
-	} else if (!wbio->wbio.first_btree_write) {
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+		ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
if (wbio->wbio.first_btree_write) {
if (wbio->wbio.failed.nr) {
}
} else {
		ret = bch2_trans_do(c,
			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-						      commit_flags, true));
+					BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
	}
out:
if (ret) {
set_btree_node_noevict(b);
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
"writing btree node: %s", bch2_err_str(ret));
}
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
"writing btree node: %s", bch2_err_str(ret));
goto out;
}
static void btree_node_write_endio(struct bio *bio) static void btree_node_write_endio(struct bio *bio)


@@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->key.k.p;
trans->paths_sorted = false;
bch2_btree_path_verify_level(trans, path, l - path->l);
return k;
}
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,


@@ -126,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
int bch2_btree_write_buffer_insert_err(struct btree_trans *,
enum btree_id, struct bkey_i *);
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
					    enum btree_id btree,
					    struct bkey_i *k)
{
if (unlikely(!btree_type_uses_write_buffer(btree))) {
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
dump_stack();
return ret;
}
	/*
	 * Most updates skip the btree write buffer until journal replay is
	 * finished because synchronization with journal replay relies on having


@@ -264,6 +264,22 @@ out:
	BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
enum btree_id btree, struct bkey_i *k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
bch2_btree_id_to_text(&buf, btree);
prt_str(&buf, "\n");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
return -EROFS;
}
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
@@ -312,7 +328,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
	darray_for_each(wb->sorted, i) {
		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-		BUG_ON(!btree_type_uses_write_buffer(k->btree));
+		if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
goto err;
}
		for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
			prefetch(&wb->flushing.keys.data[n->idx]);


@@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)
static inline int gen_after(u8 a, u8 b)
{
-	int r = gen_cmp(a, b);
-
-	return r > 0 ? r : 0;
+	return max(0, gen_cmp(a, b));
}
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
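gen_after() reports how many generations pointer a is ahead of b, clamped at zero; the max(0, ...) form is equivalent to the removed temporary. A small illustration of the intent, assuming gen_cmp() is the usual signed 8-bit difference so that bucket-generation wraparound is handled (that definition is an assumption, it is not shown in this hunk):

	/* Illustration only: assumes gen_cmp() compares generations modulo 256
	 * via a signed 8-bit difference. */
	#include <assert.h>
	#include <stdint.h>

	typedef uint8_t u8;

	static int gen_cmp(u8 a, u8 b)
	{
		return (int8_t)(a - b);
	}

	static int gen_after(u8 a, u8 b)
	{
		int r = gen_cmp(a, b);
		return r > 0 ? r : 0;	/* same as max(0, gen_cmp(a, b)) */
	}

	int main(void)
	{
		assert(gen_after(5, 3) == 2);	/* a is two generations newer */
		assert(gen_after(3, 5) == 0);	/* a is older: clamped to 0 */
		assert(gen_after(1, 255) == 2);	/* wraparound: 255 -> 0 -> 1 */
		return 0;
	}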


@@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
	return 0;
}
#if 0
/*
* This seems to be duplicating code in cmd_remove_passphrase() in
* bcachefs-tools, but we might want to switch userspace to use this - and
* perhaps add an ioctl for calling this at runtime, so we can take the
* passphrase off of a mounted filesystem (which has come up).
*/
int bch2_disable_encryption(struct bch_fs *c)
{
	struct bch_sb_field_crypt *crypt;
@@ -725,6 +733,10 @@ out:
	return ret;
}
/*
* For enabling encryption on an existing filesystem: not hooked up yet, but it
* should be
*/
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
{
	struct bch_encrypted_key key;
@@ -781,6 +793,7 @@ err:
	memzero_explicit(&key, sizeof(key));
	return ret;
}
#endif
void bch2_fs_encryption_exit(struct bch_fs *c)
{
@@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
	crypto_free_shash(c->poly1305);
	if (c->chacha20)
		crypto_free_sync_skcipher(c->chacha20);
if (c->sha256)
crypto_free_shash(c->sha256);
}
int bch2_fs_encryption_init(struct bch_fs *c)
@@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
	struct bch_key key;
	int ret = 0;
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
ret = PTR_ERR_OR_ZERO(c->sha256);
if (ret) {
c->sha256 = NULL;
bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
	if (!crypt)
		goto out;


@@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
			struct bch_key *);
#if 0
int bch2_disable_encryption(struct bch_fs *);
int bch2_enable_encryption(struct bch_fs *, bool);
#endif
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);


@@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static bool can_allocate_without_blocking(struct bch_fs *c,
struct data_update *m)
{
if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
return false;
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
? m->op.target
: 0;
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
rcu_read_lock();
unsigned nr_replicas = 0, i;
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
struct bch_dev *ca = bch2_dev_rcu(c, i);
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
if (!dev_buckets_free(ca, usage, m->op.watermark))
continue;
nr_replicas += ca->mi.durability;
if (nr_replicas >= m->op.nr_replicas)
break;
}
rcu_read_unlock();
return nr_replicas >= m->op.nr_replicas;
}
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
			       struct bch_io_opts *io_opts)
{
@@ -700,22 +666,49 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
	}
	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
	m->rbio.data_update = true;
	m->rbio.bio.bi_iter.bi_size = buf_bytes;
	m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
	m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
	return 0;
}
-static bool can_write_extent(struct bch_fs *c,
-			     struct bch_devs_list *devs_have,
-			     unsigned target)
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
return -BCH_ERR_data_update_done_would_block;
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
? m->op.target
: 0;
	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-	darray_for_each(*devs_have, i)
+	darray_for_each(m->op.devs_have, i)
		__clear_bit(*i, devs.d);
-	return !bch2_is_zero(&devs, sizeof(devs));
+	rcu_read_lock();
unsigned nr_replicas = 0, i;
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
struct bch_dev *ca = bch2_dev_rcu(c, i);
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
if (!dev_buckets_free(ca, usage, m->op.watermark))
continue;
nr_replicas += ca->mi.durability;
if (nr_replicas >= m->op.nr_replicas)
break;
}
rcu_read_unlock();
if (!nr_replicas)
return -BCH_ERR_data_update_done_no_rw_devs;
if (nr_replicas < m->op.nr_replicas)
return -BCH_ERR_insufficient_devices;
return 0;
}
int bch2_data_update_init(struct btree_trans *trans,
@@ -799,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans,
			ptr_bit <<= 1;
	}
if (!can_write_extent(c, &m->op.devs_have,
m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
/*
* Check if we have rw devices not in devs_have: this can happen
* if we're trying to move data on a ro or failed device
*
* If we can't move it, we need to clear the rebalance_work bit,
* if applicable
*
* Also, copygc should skip ro/failed devices:
*/
return -BCH_ERR_data_update_done_no_rw_devs;
}
	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
	/*
@@ -852,11 +831,22 @@
		goto out_bkey_buf_exit;
	}
-	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
-	    !can_allocate_without_blocking(c, m)) {
-		ret = -BCH_ERR_data_update_done_would_block;
+	/*
+	 * Check if the allocation will succeed, to avoid getting an error later
+	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
* read:
*
* This guards against
* - BCH_WRITE_alloc_nowait allocations failing (promotes)
* - Destination target full
* - Device(s) in destination target offline
* - Insufficient durability available in destination target
* (i.e. trying to move a durability=2 replica to a target with a
* single durability=2 device)
*/
ret = can_write_extent(c, m);
if (ret)
		goto out_bkey_buf_exit;
}
	if (reserve_sectors) {
		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,


@@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
	return 0;
}
static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
{
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
m->nr_redundant = s->nr_redundant;
m->disk_label = s->disk_label;
m->blocks_nonempty = 0;
for (unsigned i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
}
int bch2_trigger_stripe(struct btree_trans *trans,
			enum btree_id btree, unsigned level,
			struct bkey_s_c old, struct bkey_s _new,
@@ -1320,6 +1307,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets");
		ret = s->err;
		goto err;
	}
@@ -1328,6 +1316,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
	if (ec_do_recov(c, &s->existing_stripe)) {
		bch_err(c, "error creating stripe: error reading existing stripe");
		ret = -BCH_ERR_ec_block_read;
		goto err;
	}
@@ -1353,6 +1342,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
	if (ec_nr_failed(&s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets");
		ret = -BCH_ERR_ec_block_write;
		goto err;
	}


@@ -231,6 +231,7 @@
	x(BCH_ERR_invalid_sb, invalid_sb_csum) \
	x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
	x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
x(BCH_ERR_invalid_sb, invalid_sb_offset) \
	x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
	x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
	x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
@@ -273,21 +274,25 @@
	x(EIO, stripe_reconstruct) \
	x(EIO, key_type_error) \
	x(EIO, extent_poisened) \
x(EIO, no_device_to_read_from) \
	x(EIO, missing_indirect_extent) \
	x(EIO, invalidate_stripe_to_dev) \
	x(EIO, no_encryption_key) \
	x(EIO, insufficient_journal_devices) \
	x(EIO, device_offline) \
	x(EIO, EIO_fault_injected) \
x(EIO, ec_block_read) \
x(EIO, ec_block_write) \
	x(EIO, data_read) \
x(BCH_ERR_data_read, no_device_to_read_from) \
x(BCH_ERR_data_read, data_read_io_err) \
x(BCH_ERR_data_read, data_read_csum_err) \
	x(BCH_ERR_data_read, data_read_retry) \
	x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
-	x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \
-	x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \
-	x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \
-	x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \
-	x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
+	x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
	x(BCH_ERR_data_read, data_read_decompress_err) \
	x(BCH_ERR_data_read, data_read_decrypt_err) \
	x(BCH_ERR_data_read, data_read_ptr_stale_race) \
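Each x(class, name) entry declares a private error code together with a parent class, so a specific code can be tested against any of its ancestors; the renamed data_read_retry_* codes all roll up into BCH_ERR_data_read_retry and then BCH_ERR_data_read. A standalone model of that parent-chain matching (this mirrors what bch2_err_matches() does, but the table and names below are stand-ins, not bcachefs code):

	/* Standalone model of parent-class error matching: each code records
	 * its parent and "matches" walks up the chain. */
	#include <assert.h>

	enum err { ERR_NONE, DATA_READ, DATA_READ_RETRY, DATA_READ_RETRY_AVOID,
		   DATA_READ_RETRY_CSUM_ERR, ERR_NR };

	static const enum err parent[ERR_NR] = {
		[DATA_READ]			= ERR_NONE,
		[DATA_READ_RETRY]		= DATA_READ,
		[DATA_READ_RETRY_AVOID]		= DATA_READ_RETRY,
		[DATA_READ_RETRY_CSUM_ERR]	= DATA_READ_RETRY_AVOID,
	};

	static int err_matches(enum err e, enum err class)
	{
		for (; e; e = parent[e])
			if (e == class)
				return 1;
		return 0;
	}

	int main(void)
	{
		assert(err_matches(DATA_READ_RETRY_CSUM_ERR, DATA_READ_RETRY_AVOID));
		assert(err_matches(DATA_READ_RETRY_CSUM_ERR, DATA_READ));
		assert(!err_matches(DATA_READ_RETRY, DATA_READ_RETRY_AVOID));
		return 0;
	}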


@@ -28,6 +28,8 @@
#include "trace.h"
#include "util.h"
#include <linux/random.h>
static const char * const bch2_extent_flags_strs[] = {
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
	BCH_EXTENT_FLAGS()
@@ -94,38 +96,30 @@ static inline int dev_failed(struct bch_dev *ca)
 */
static inline bool ptr_better(struct bch_fs *c,
			      const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2) u64 p1_latency,
struct bch_dev *ca1,
const struct extent_ptr_decoded p2,
u64 p2_latency)
{
if (likely(!p1.do_ec_reconstruct && struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
!p2.do_ec_reconstruct)) {
struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
if (unlikely(failed_delta))
return failed_delta < 0;
if (failed_delta) if (unlikely(bch2_force_reconstruct_read))
return failed_delta < 0;
u64 l1 = dev_latency(ca1);
u64 l2 = dev_latency(ca2);
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
l1 *= l1;
l2 *= l2;
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
if (bch2_force_reconstruct_read)
return p1.do_ec_reconstruct > p2.do_ec_reconstruct; return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
return p1.do_ec_reconstruct < p2.do_ec_reconstruct; if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
if (unlikely(crc_retry_delta))
return crc_retry_delta < 0;
/* Pick at random, biased in favor of the faster device: */
return get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
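When two pointers are otherwise equally good, the new code squares each device's latency and then flips a weighted coin, so the faster device is strongly preferred while the slower one still sees occasional reads and keeps its latency estimate current. A standalone sketch of that weighting (not bcachefs code; the RNG stand-in and squaring placement here are illustrative, and the real code uses a strict compare which is equivalent up to rounding at 64-bit scale):

	/* Standalone sketch of the latency-weighted pick between two pointers. */
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	static uint64_t random_below(uint64_t ceil)	/* stand-in for get_random_u64_below() */
	{
		return (uint64_t)rand() % ceil;		/* good enough for a demo */
	}

	static int pick_p1(uint64_t l1, uint64_t l2)
	{
		/* Square the latencies to bias harder toward the faster device */
		l1 *= l1;
		l2 *= l2;
		/* p1 wins with probability l2 / (l1 + l2) */
		return random_below(l1 + l2) >= l1;
	}

	int main(void)
	{
		unsigned p1_wins = 0, trials = 100000;

		/* device 1: 1ms, device 2: 3ms -> squared weights 1 and 9, ~90% */
		for (unsigned i = 0; i < trials; i++)
			p1_wins += pick_p1(1, 3);

		printf("picked the faster device %.1f%% of the time\n",
		       100.0 * p1_wins / trials);
		return 0;
	}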
/*
@@ -138,86 +132,105 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
			       struct extent_ptr_decoded *pick,
			       int dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
const union bch_extent_entry *entry; bool have_dirty_ptrs = false, have_pick = false;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
unsigned csum_retry = 0;
bool have_csum_retries = false;
int ret = 0;
	if (k.k->type == KEY_TYPE_error)
		return -BCH_ERR_key_type_error;
if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned) struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return -BCH_ERR_extent_poisened; return -BCH_ERR_extent_poisened;
again:
rcu_read_lock(); rcu_read_lock();
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 pick_latency;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
have_dirty_ptrs |= !p.ptr.cached;
/* /*
* Unwritten extent: no need to actually read, treat it as a * Unwritten extent: no need to actually read, treat it as a
* hole and return 0s: * hole and return 0s:
*/ */
if (p.ptr.unwritten) { if (p.ptr.unwritten) {
ret = 0; rcu_read_unlock();
break; return 0;
} }
/* Are we being asked to read from a specific device? */ /* Are we being asked to read from a specific device? */
if (dev >= 0 && p.ptr.dev != dev) if (dev >= 0 && p.ptr.dev != dev)
continue; continue;
/*
* If there are any dirty pointers it's an error if we can't
* read:
*/
if (!ret && !p.ptr.cached)
ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue; continue;
if (unlikely(failed) && struct bch_dev_io_failures *f =
(f = bch2_dev_io_failures(failed, p.ptr.dev))) { unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES; if (unlikely(f)) {
p.crc_retry_nr = f->failed_csum_nr;
p.has_ec &= ~f->failed_ec;
if (p.has_ec && if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
!f->failed_ec && have_io_errors |= f->failed_io;
(f->failed_io || f->failed_csum_nr)) have_io_errors |= f->failed_ec;
}
have_csum_errors |= !!f->failed_csum_nr;
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true; p.do_ec_reconstruct = true;
else if (f->failed_io || else if (f->failed_io ||
f->failed_csum_nr > csum_retry) f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue; continue;
} }
have_missing_devs |= ca && !bch2_dev_is_online(ca);
if (!ca || !bch2_dev_is_online(ca)) { if (!ca || !bch2_dev_is_online(ca)) {
if (p.has_ec) if (!p.has_ec)
p.do_ec_reconstruct = true;
else
continue; continue;
p.do_ec_reconstruct = true;
} }
if (p.has_ec && bch2_force_reconstruct_read) if (bch2_force_reconstruct_read && p.has_ec)
p.do_ec_reconstruct = true; p.do_ec_reconstruct = true;
if (ret > 0 && !ptr_better(c, p, *pick)) u64 p_latency = dev_latency(ca);
continue; /*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
p_latency *= p_latency;
*pick = p; if (!have_pick ||
ret = 1; ptr_better(c,
p, p_latency, ca,
*pick, pick_latency)) {
*pick = p;
pick_latency = p_latency;
have_pick = true;
}
	}
	rcu_read_unlock();
if (unlikely(ret == -BCH_ERR_no_device_to_read_from && if (have_pick)
have_csum_retries && return 1;
csum_retry < BCH_MAX_CSUM_RETRIES)) { if (!have_dirty_ptrs)
csum_retry++; return 0;
goto again; if (have_missing_devs)
} return -BCH_ERR_no_device_to_read_from;
if (have_csum_errors)
return -BCH_ERR_data_read_csum_err;
if (have_io_errors)
return -BCH_ERR_data_read_io_err;
return ret; WARN_ONCE(1, "unhandled error case in %s\n", __func__);
return -EINVAL;
} }
/* KEY_TYPE_btree_ptr: */


@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
	__label__ out; \
 \
	(_ptr).has_ec = false; \
	(_ptr).do_ec_reconstruct = false; \
	(_ptr).crc_retry_nr = 0; \
 \
	__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
		switch (__extent_entry_type(_entry)) { \


@@ -21,19 +21,18 @@ struct bch_extent_crc_unpacked {
struct extent_ptr_decoded {
	bool has_ec;
-	unsigned do_ec_reconstruct;
+	bool do_ec_reconstruct;
+	u8 crc_retry_nr;
	struct bch_extent_crc_unpacked crc;
	struct bch_extent_ptr ptr;
	struct bch_extent_stripe_ptr ec;
};
-#define BCH_MAX_CSUM_RETRIES 3
struct bch_io_failures {
	u8 nr;
	struct bch_dev_io_failures {
		u8 dev;
-		unsigned failed_csum_nr:4,
+		unsigned failed_csum_nr:6,
			 failed_io:1,
			 failed_ec:1;
	} devs[BCH_REPLICAS_MAX + 1];


@@ -117,6 +117,9 @@ static int readpage_bio_extend(struct btree_trans *trans,
		unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
		/* ensure proper alignment */
		order = min(order, __ffs(folio_offset|BIT(31)));
		folio = xa_load(&iter->mapping->i_pages, folio_offset);
		if (folio && !xa_is_value(folio))
			break;
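The added clamp keeps the readahead folio order from exceeding the alignment of the starting page index, since an order-N folio must begin on an index that is a multiple of 1 << N; OR-ing in BIT(31) keeps __ffs() well-defined when the index is 0. A small standalone illustration of the arithmetic (not kernel code; __builtin_ctz stands in for __ffs):

	/* Illustration: the allocation order is clamped to the number of
	 * trailing zero bits of the target index. */
	#include <assert.h>
	#include <stdint.h>

	static unsigned trailing_zeros(uint32_t x)	/* stand-in for __ffs() */
	{
		return (unsigned)__builtin_ctz(x);
	}

	int main(void)
	{
		uint32_t folio_offset = 40;	/* 0b101000: multiple of 8, not 16 */
		unsigned order = 5;		/* wanted a 32-page folio */

		unsigned max_order = trailing_zeros(folio_offset | (1u << 31));
		if (order > max_order)
			order = max_order;

		assert(order == 3);		/* an 8-page folio fits the alignment */
		assert(folio_offset % (1u << order) == 0);
		return 0;
	}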


@@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
	return c ?: ERR_PTR(-ENOENT);
}
static int bch2_remount(struct super_block *sb, int *flags,
struct bch_opts opts)
{
struct bch_fs *c = sb->s_fs_info;
int ret = 0;
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
if (opts.read_only) {
bch2_fs_read_only(c);
sb->s_flags |= SB_RDONLY;
} else {
ret = bch2_fs_read_write(c);
if (ret) {
bch_err(c, "error going rw: %i", ret);
up_write(&c->state_lock);
ret = -EINVAL;
goto err;
}
sb->s_flags &= ~SB_RDONLY;
}
c->opts.read_only = opts.read_only;
up_write(&c->state_lock);
}
if (opt_defined(opts, errors))
c->opts.errors = opts.errors;
err:
return bch2_err_class(ret);
}
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
@@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
{
	struct super_block *sb = fc->root->d_sb;
	struct bch2_opts_parse *opts = fc->fs_private;
+	struct bch_fs *c = sb->s_fs_info;
+	int ret = 0;
-	return bch2_remount(sb, &fc->sb_flags, opts->opts);
+	opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
if (opts->opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
if (opts->opts.read_only) {
bch2_fs_read_only(c);
sb->s_flags |= SB_RDONLY;
} else {
ret = bch2_fs_read_write(c);
if (ret) {
bch_err(c, "error going rw: %i", ret);
up_write(&c->state_lock);
ret = -EINVAL;
goto err;
}
sb->s_flags &= ~SB_RDONLY;
}
c->opts.read_only = opts->opts.read_only;
up_write(&c->state_lock);
}
if (opt_defined(opts->opts, errors))
c->opts.errors = opts->opts.errors;
err:
return bch2_err_class(ret);
}
static const struct fs_context_operations bch2_context_ops = {


@@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
		       uid, gid, mode, rdev, parent);
}
static inline u32 bkey_generation(struct bkey_s_c k)
{
switch (k.k->type) {
case KEY_TYPE_inode:
case KEY_TYPE_inode_v2:
BUG();
case KEY_TYPE_inode_generation:
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
default:
return 0;
}
}
static struct bkey_i_inode_alloc_cursor *
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
@@ -1198,6 +1185,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
		opts->_name##_from_inode = true; \
	} else { \
		opts->_name = c->opts._name; \
		opts->_name##_from_inode = false; \
	}
	BCH_INODE_OPTS()
#undef x


@@ -25,8 +25,15 @@
#include "subvolume.h"
#include "trace.h"
#include <linux/random.h>
#include <linux/sched/mm.h>
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -59,7 +66,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
	}
	rcu_read_unlock();
-	return bch2_rand_range(nr * CONGESTED_MAX) < total;
+	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
@@ -97,14 +104,21 @@ static inline bool have_io_error(struct bch_io_failures *failed)
	return failed && failed->nr;
}
-static bool ptr_being_rewritten(struct bch_read_bio *orig,
-				unsigned dev,
-				unsigned flags)
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
-	if (!(flags & BCH_READ_data_update))
+	EBUG_ON(rbio->split);
return rbio->data_update
? container_of(rbio, struct data_update, rbio)
: NULL;
}
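rbio_data_update() works because struct data_update embeds its read bio as the member named rbio, so container_of() can step from the inner bio back to the enclosing update when the new data_update bit is set. A generic standalone illustration of that recovery (the struct layout below is a stand-in, not the real bcachefs types):

	/* Stand-in types: container_of() recovers the enclosing struct from a
	 * pointer to one of its members, the same trick rbio_data_update() uses. */
	#include <assert.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct read_bio { int ret; };

	struct data_update_stub {
		int		btree_id;
		struct read_bio	rbio;		/* embedded, like data_update.rbio */
	};

	int main(void)
	{
		struct data_update_stub u = { .btree_id = 7 };
		struct read_bio *rbio = &u.rbio;

		struct data_update_stub *back =
			container_of(rbio, struct data_update_stub, rbio);
		assert(back == &u && back->btree_id == 7);
		return 0;
	}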
static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
struct data_update *u = rbio_data_update(orig);
if (!u)
		return false;
struct data_update *u = container_of(orig, struct data_update, rbio);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
@ -193,7 +207,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
struct bpos pos, struct bpos pos,
struct extent_ptr_decoded *pick, struct extent_ptr_decoded *pick,
unsigned sectors, unsigned sectors,
unsigned flags,
struct bch_read_bio *orig, struct bch_read_bio *orig,
struct bch_io_failures *failed) struct bch_io_failures *failed)
{ {
@ -214,7 +227,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
unsigned ptr_bit = 1; unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) { bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) && if (bch2_dev_io_failures(failed, ptr->dev) &&
!ptr_being_rewritten(orig, ptr->dev, flags)) !ptr_being_rewritten(orig, ptr->dev))
update_opts.rewrite_ptrs |= ptr_bit; update_opts.rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1; ptr_bit <<= 1;
} }
@ -308,7 +321,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
k.k->type == KEY_TYPE_reflink_v k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink ? BTREE_ID_reflink
: BTREE_ID_extents, : BTREE_ID_extents,
k, pos, pick, sectors, flags, orig, failed); k, pos, pick, sectors, orig, failed);
if (!promote) if (!promote)
return NULL; return NULL;
@ -336,7 +349,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o
if (ret) if (ret)
		return ret;
-	if (rbio->flags & BCH_READ_data_update)
+	if (rbio->data_update)
		prt_str(out, "(internal move) ");
return 0; return 0;
@ -416,83 +429,6 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio); bio_endio(&rbio->bio);
} }
static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct btree_iter *iter)
{
if (rbio->flags & BCH_READ_data_update) {
struct data_update *u = container_of(rbio, struct data_update, rbio);
return bch2_bkey_get_iter(trans, iter,
u->btree_id, bkey_start_pos(&u->k.k->k), 0);
} else {
struct bpos pos = rbio->read_pos;
int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
if (ret)
return bkey_s_c_err(ret);
return bch2_bkey_get_iter(trans, iter,
BTREE_ID_extents, pos, 0);
}
}
static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bch_io_failures *failed)
{
struct btree_iter iter = {};
struct bkey_s_c k;
int ret = lockrestart_do(trans,
bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
if (!ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr)
if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
bch2_mark_io_failure(failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_csum_err);
}
bch2_trans_iter_exit(trans, &iter);
}
static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k, struct bch_io_failures *failed)
{
u64 flags = bch2_bkey_extent_flags(k);
if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return 0;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
/*
* Make sure we actually attempt to read and got checksum failures from
* every replica
*/
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
continue;
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
return PTR_ERR_OR_ZERO(new) ?:
bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
}
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, struct bvec_iter bvec_iter,
@ -530,9 +466,6 @@ err:
goto retry; goto retry;
if (ret) { if (ret) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);
rbio->bio.bi_status = BLK_STS_IOERR; rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret; rbio->ret = ret;
} }
@ -560,7 +493,8 @@ static void bch2_rbio_retry(struct work_struct *work)
bvec_iter_sectors(rbio->bvec_iter)); bvec_iter_sectors(rbio->bvec_iter));
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
mark_io_failure_if_current_extent_matches(trans, rbio, &failed); bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
if (!rbio->split) { if (!rbio->split) {
rbio->bio.bi_status = 0; rbio->bio.bi_status = 0;
@ -577,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work)
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;
-	int ret = flags & BCH_READ_data_update
+	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);
@ -591,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work)
bch2_inum_offset_err_msg_trans(trans, &buf, bch2_inum_offset_err_msg_trans(trans, &buf,
(subvol_inum) { subvol, read_pos.inode }, (subvol_inum) { subvol, read_pos.inode },
read_pos.offset << 9)); read_pos.offset << 9));
if (rbio->flags & BCH_READ_data_update) if (rbio->data_update)
prt_str(&buf, "(internal move) "); prt_str(&buf, "(internal move) ");
prt_str(&buf, "successful retry"); prt_str(&buf, "successful retry");
@ -647,7 +581,7 @@ static void bch2_read_io_err(struct work_struct *work)
bch_err_ratelimited(c, "%s", buf.buf); bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf); printbuf_exit(&buf);
bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status); bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
} }
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@ -734,7 +668,7 @@ static void bch2_read_csum_err(struct work_struct *work)
else else
bch_err_ratelimited(c, "%s", buf.buf); bch_err_ratelimited(c, "%s", buf.buf);
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR); bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf); printbuf_exit(&buf);
} }
@ -778,42 +712,6 @@ static void bch2_read_decrypt_err(struct work_struct *work)
printbuf_exit(&buf); printbuf_exit(&buf);
} }
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
static void corrupt_bio(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
bio_for_each_segment(bv, bio, iter) {
unsigned u64s = bv.bv_len / sizeof(u64);
if (offset < u64s) {
u64 *segment = bvec_kmap_local(&bv);
segment[offset] = get_random_u64();
kunmap_local(segment);
return;
}
offset -= u64s;
}
}
static inline void maybe_corrupt_bio(struct bio *bio)
{
if (bch2_read_corrupt_ratio &&
!get_random_u32_below(bch2_read_corrupt_ratio))
corrupt_bio(bio);
}
#else
static inline void maybe_corrupt_bio(struct bio *bio)
{
}
#endif
/* Inner part that may run in process context */ /* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work) static void __bch2_read_endio(struct work_struct *work)
{ {
@ -821,9 +719,10 @@ static void __bch2_read_endio(struct work_struct *work)
container_of(work, struct bch_read_bio, work); container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c; struct bch_fs *c = rbio->c;
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct bio *src = &rbio->bio; struct bch_read_bio *parent = bch2_rbio_parent(rbio);
struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bio *src = &rbio->bio;
struct bvec_iter dst_iter = rbio->bvec_iter; struct bio *dst = &parent->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc); struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags; unsigned nofs_flags;
@ -841,7 +740,7 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter; src->bi_iter = rbio->bvec_iter;
} }
maybe_corrupt_bio(src); bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
@ -853,7 +752,7 @@ static void __bch2_read_endio(struct work_struct *work)
*/ */
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce; rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace, bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
BLK_STS_IOERR); BLK_STS_IOERR);
goto out; goto out;
} }
@ -873,7 +772,7 @@ static void __bch2_read_endio(struct work_struct *work)
if (unlikely(rbio->narrow_crcs)) if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio); bch2_rbio_narrow_crcs(rbio);
if (likely(!(rbio->flags & BCH_READ_data_update))) { if (likely(!parent->data_update)) {
/* Adjust crc to point to subset of data we want: */ /* Adjust crc to point to subset of data we want: */
crc.offset += rbio->offset_into_extent; crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter); crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
@ -1043,6 +942,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_read_bio *rbio = NULL; struct bch_read_bio *rbio = NULL;
bool bounce = false, read_full = false, narrow_crcs = false; bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k); struct bpos data_pos = bkey_start_pos(k.k);
struct data_update *u = rbio_data_update(orig);
int ret = 0; int ret = 0;
if (bkey_extent_is_inline_data(k.k)) { if (bkey_extent_is_inline_data(k.k)) {
@ -1106,16 +1006,7 @@ retry_pick:
goto retry_pick; goto retry_pick;
} }
/* if (likely(!u)) {
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans);
else
bch2_trans_unlock_long(trans);
if (!(flags & BCH_READ_data_update)) {
if (!(flags & BCH_READ_last_fragment) || if (!(flags & BCH_READ_last_fragment) ||
bio_flagged(&orig->bio, BIO_CHAIN)) bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_must_clone; flags |= BCH_READ_must_clone;
@ -1138,12 +1029,10 @@ retry_pick:
bounce = true; bounce = true;
} }
} else { } else {
read_full = true;
/* /*
* can happen if we retry, and the extent we were going to read * can happen if we retry, and the extent we were going to read
* has been merged in the meantime: * has been merged in the meantime:
*/ */
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca) if (ca)
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
@ -1152,6 +1041,7 @@ retry_pick:
} }
iter.bi_size = pick.crc.compressed_size << 9; iter.bi_size = pick.crc.compressed_size << 9;
read_full = true;
} }
if (orig->opts.promote_target || have_io_error(failed)) if (orig->opts.promote_target || have_io_error(failed))
@ -1242,10 +1132,14 @@ retry_pick:
rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio; rbio->bio.bi_end_io = bch2_read_endio;
/* XXX: also nvme read recovery level */
if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
rbio->bio.bi_opf |= REQ_FUA;
if (rbio->bounce) if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio); trace_and_count(c, io_read_bounce, &rbio->bio);
if (!(flags & BCH_READ_data_update)) if (!u)
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
else else
this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
@ -1255,7 +1149,7 @@ retry_pick:
* If it's being moved internally, we don't want to flag it as a cache * If it's being moved internally, we don't want to flag it as a cache
* hit: * hit:
*/ */
if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) if (ca && pick.ptr.cached && !u)
bch2_bucket_io_time_reset(trans, pick.ptr.dev, bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ); PTR_BUCKET_NR(ca, &pick.ptr), READ);
@ -1264,6 +1158,15 @@ retry_pick:
trace_and_count(c, io_read_split, &orig->bio); trace_and_count(c, io_read_split, &orig->bio);
} }
/*
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans);
else
bch2_trans_unlock_long(trans);
if (likely(!rbio->pick.do_ec_reconstruct)) { if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) { if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
@ -1275,7 +1178,7 @@ retry_pick:
printbuf_exit(&buf); printbuf_exit(&buf);
bch2_rbio_error(rbio, bch2_rbio_error(rbio,
-BCH_ERR_data_read_device_offline, -BCH_ERR_data_read_retry_device_offline,
BLK_STS_IOERR); BLK_STS_IOERR);
goto out; goto out;
} }
@ -1302,7 +1205,7 @@ retry_pick:
} else { } else {
/* Attempting reconstruct read: */ /* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) { if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err, bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
BLK_STS_IOERR); BLK_STS_IOERR);
goto out; goto out;
} }
@ -1314,6 +1217,8 @@ out:
if (likely(!(flags & BCH_READ_in_retry))) { if (likely(!(flags & BCH_READ_in_retry))) {
return 0; return 0;
} else { } else {
bch2_trans_unlock(trans);
int ret; int ret;
rbio->context = RBIO_CONTEXT_UNBOUND; rbio->context = RBIO_CONTEXT_UNBOUND;
@ -1324,7 +1229,7 @@ out:
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick, bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_csum_err); ret == -BCH_ERR_data_read_retry_csum_err);
return ret; return ret;
} }
@ -1341,11 +1246,11 @@ hole:
this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
bvec_iter_sectors(iter)); bvec_iter_sectors(iter));
/* /*
* won't normally happen in the BCH_READ_data_update * won't normally happen in the data update (bch2_move_extent()) path,
* (bch2_move_extent()) path, but if we retry and the extent we wanted * but if we retry and the extent we wanted to read no longer exists we
* to read no longer exists we have to signal that: * have to signal that:
*/ */
if (flags & BCH_READ_data_update) if (u)
orig->ret = -BCH_ERR_data_read_key_overwritten; orig->ret = -BCH_ERR_data_read_key_overwritten;
zero_fill_bio_iter(&orig->bio, iter); zero_fill_bio_iter(&orig->bio, iter);
@ -1366,7 +1271,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bkey_s_c k; struct bkey_s_c k;
int ret; int ret;
BUG_ON(flags & BCH_READ_data_update); EBUG_ON(rbio->data_update);
bch2_bkey_buf_init(&sk); bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@ -1393,23 +1298,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
if (ret) if (ret)
goto err; goto err;
if (unlikely(flags & BCH_READ_in_retry)) {
struct data_update *u = flags & BCH_READ_data_update
? container_of(rbio, struct data_update, rbio)
: NULL;
if (u &&
!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}
if (!bkey_deleted(&sk.k->k) &&
!bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
failed->nr = 0;
}
s64 offset_into_extent = iter.pos.offset - s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k); bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent; unsigned sectors = k.k->size - offset_into_extent;
@ -1447,16 +1335,18 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes); swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err: err:
if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
flags |= BCH_READ_must_bounce;
if (ret && if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_data_read_retry)) !bch2_err_matches(ret, BCH_ERR_data_read_retry))
break; break;
} }
if (unlikely(ret)) { bch2_trans_iter_exit(trans, &iter);
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);
if (ret) {
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
lockrestart_do(trans, lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum, bch2_inum_offset_err_msg_trans(trans, &buf, inum,
@ -1472,7 +1362,6 @@ err:
bch2_rbio_done(rbio); bch2_rbio_done(rbio);
} }
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&sk, c);
return ret; return ret;
} }


@@ -36,7 +36,8 @@ struct bch_read_bio {
	u16			flags;
	union {
	struct {
-		u16		promote:1,
+		u16		data_update:1,
+				promote:1,
				bounce:1,
				split:1,
				have_ioref:1,
@@ -109,7 +110,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
	x(retry_if_stale) \
	x(may_promote) \
	x(user_mapped) \
-	x(data_update) \
	x(last_fragment) \
	x(must_bounce) \
	x(must_clone) \
@@ -163,12 +163,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
{
	struct bch_read_bio *rbio = to_rbio(bio);
	rbio->c		= orig->c;
	rbio->_state	= 0;
+	rbio->flags	= 0;
	rbio->ret	= 0;
	rbio->split	= true;
	rbio->parent	= orig;
	rbio->opts	= orig->opts;
	return rbio;
}
@@ -182,7 +183,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
	rbio->start_time = local_clock();
	rbio->c		= c;
	rbio->_state	= 0;
+	rbio->flags	= 0;
	rbio->ret	= 0;
	rbio->opts	= opts;
	rbio->bio.bi_end_io = end_io;
	return rbio;


@@ -34,6 +34,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_write_corrupt_ratio;
module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(write_corrupt_ratio, "");
#endif
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bounce = true; bounce = true;
} }
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
if (!bounce && write_corrupt_ratio) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
#endif
saved_iter = dst->bi_iter; saved_iter = dst->bi_iter;
do { do {
@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
init_append_extent(op, wp, version, crc); init_append_extent(op, wp, version, crc);
#ifdef CONFIG_BCACHEFS_DEBUG
if (write_corrupt_ratio) {
swap(dst->bi_iter.bi_size, dst_len);
bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
swap(dst->bi_iter.bi_size, dst_len);
}
#endif
if (dst != src) if (dst != src)
bio_advance(dst, dst_len); bio_advance(dst, dst_len);
bio_advance(src, src_len); bio_advance(src, src_len);
@ -1394,6 +1417,7 @@ retry:
bio->bi_private = &op->cl; bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE; bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl); closure_get(&op->cl);
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true); op->insert_keys.top, true);
@ -1718,20 +1742,26 @@ static const char * const bch2_write_flags[] = {
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{ {
prt_str(out, "pos: "); if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos); bch2_bpos_to_text(out, op->pos);
prt_newline(out); prt_newline(out);
printbuf_indent_add(out, 2); printbuf_indent_add(out, 2);
prt_str(out, "started: "); prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time); bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out); prt_newline(out);
prt_str(out, "flags: "); prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags); prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out); prt_newline(out);
prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2); printbuf_indent_sub(out, 2);
} }
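For context on the write_corrupt_ratio debug knob added above: when the ratio is non-zero the write path always bounces (so the caller's buffer stays intact) and then corrupts roughly one bio in every `ratio` on its way to disk, presumably so the read side's checksum-error handling gets exercised. A standalone sketch of that one-in-N gate, with illustrative names and rand() standing in for the kernel RNG:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Corrupt roughly one buffer in every `ratio`, the way the debug knob is
 * meant to; ratio == 0 disables injection entirely.
 */
static int maybe_corrupt(uint8_t *buf, size_t len, unsigned ratio)
{
	if (!ratio || rand() % ratio)
		return 0;

	buf[rand() % len] ^= 0xff;	/* flip one byte */
	return 1;
}

int main(void)
{
	uint8_t buf[4096] = { 0 };
	unsigned hits = 0;

	for (int i = 0; i < 100000; i++)
		hits += maybe_corrupt(buf, sizeof(buf), 1000);

	/* expect on the order of 100 corruptions for ratio == 1000 */
	printf("corrupted %u of 100000 buffers\n", hits);
	return 0;
}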
View File
@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
kvfree(new_buf); kvfree(new_buf);
} }
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static CLOSURE_CALLBACK(journal_write_done) static CLOSURE_CALLBACK(journal_write_done)
{ {
closure_type(w, struct journal_buf, io); closure_type(w, struct journal_buf, io);
View File
@ -101,13 +101,25 @@ static void move_free(struct moving_io *io)
static void move_write_done(struct bch_write_op *op) static void move_write_done(struct bch_write_op *op)
{ {
struct moving_io *io = container_of(op, struct moving_io, write.op); struct moving_io *io = container_of(op, struct moving_io, write.op);
struct bch_fs *c = op->c;
struct moving_context *ctxt = io->write.ctxt; struct moving_context *ctxt = io->write.ctxt;
if (io->write.op.error) if (op->error) {
ctxt->write_error = true; if (trace_io_move_write_fail_enabled()) {
struct printbuf buf = PRINTBUF;
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); bch2_write_op_to_text(&buf, op);
atomic_dec(&io->write.ctxt->write_ios); prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
trace_io_move_write_fail(c, buf.buf);
printbuf_exit(&buf);
}
this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
ctxt->write_error = true;
}
atomic_sub(io->write_sectors, &ctxt->write_sectors);
atomic_dec(&ctxt->write_ios);
move_free(io); move_free(io);
closure_put(&ctxt->cl); closure_put(&ctxt->cl);
} }
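move_write_done() now reports failed moves two ways: a persistent io_move_write_fail counter that is always bumped, and a formatted trace event that is only built when the tracepoint is enabled (the trace_..._enabled() guard avoids the printbuf work when nobody is listening). A small userspace sketch of that guard, with a plain bool standing in for the tracepoint state:

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-in for trace_io_move_write_fail_enabled(): only pay for
 * formatting the report when someone is actually listening.
 */
static bool trace_enabled = true;
static unsigned long write_fail_count;

static void report_write_failure(int err)
{
	if (trace_enabled) {
		char buf[64];

		snprintf(buf, sizeof(buf), "write failed: ret %d", err);
		printf("trace: %s\n", buf);
	}
	write_fail_count++;	/* the persistent counter is bumped either way */
}

int main(void)
{
	report_write_failure(-5);
	trace_enabled = false;
	report_write_failure(-5);
	printf("failures: %lu\n", write_fail_count);
	return 0;
}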
@ -359,7 +371,6 @@ int bch2_move_extent(struct moving_context *ctxt,
bkey_start_pos(k.k), bkey_start_pos(k.k),
iter->btree_id, k, 0, iter->btree_id, k, 0,
NULL, NULL,
BCH_READ_data_update|
BCH_READ_last_fragment, BCH_READ_last_fragment,
data_opts.scrub ? data_opts.read_dev : -1); data_opts.scrub ? data_opts.read_dev : -1);
return 0; return 0;
@ -580,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p && k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); s64 offset_into_extent = 0;
bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &reflink_iter);
k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
@ -599,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
* pointer - need to fixup iter->k * pointer - need to fixup iter->k
*/ */
extent_iter = &reflink_iter; extent_iter = &reflink_iter;
offset_into_extent = 0;
} }
if (!bkey_extent_is_direct_data(k.k)) if (!bkey_extent_is_direct_data(k.k))
@ -712,7 +724,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
struct btree_iter iter = {}, bp_iter = {}; struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk; struct bkey_buf sk;
struct bkey_s_c k; struct bkey_s_c k;
unsigned sectors_moved = 0;
struct bkey_buf last_flushed; struct bkey_buf last_flushed;
int ret = 0; int ret = 0;
@ -834,7 +845,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (ctxt->stats) if (ctxt->stats)
atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_seen);
sectors_moved += sectors;
next: next:
bch2_btree_iter_advance(&bp_iter); bch2_btree_iter_advance(&bp_iter);
} }
@ -1253,17 +1263,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out); prt_newline(out);
printbuf_indent_add(out, 2); printbuf_indent_add(out, 2);
prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
prt_printf(out, "bytes seen: "); prt_printf(out, "bytes seen:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out); prt_newline(out);
prt_printf(out, "bytes moved: "); prt_printf(out, "bytes moved:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out); prt_newline(out);
prt_printf(out, "bytes raced: "); prt_printf(out, "bytes raced:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out); prt_newline(out);
@ -1272,7 +1282,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{ {
struct moving_io *io; if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
bch2_move_stats_to_text(out, ctxt->stats); bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2); printbuf_indent_add(out, 2);
@ -1292,6 +1303,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
printbuf_indent_add(out, 2); printbuf_indent_add(out, 2);
mutex_lock(&ctxt->lock); mutex_lock(&ctxt->lock);
struct moving_io *io;
list_for_each_entry(io, &ctxt->ios, io_list) list_for_each_entry(io, &ctxt->ios, io_list)
bch2_data_update_inflight_to_text(out, &io->write); bch2_data_update_inflight_to_text(out, &io->write);
mutex_unlock(&ctxt->lock); mutex_unlock(&ctxt->lock);
View File
@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "Currently calculated wait:\t"); prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out); prt_newline(out);
rcu_read_lock();
struct task_struct *t = rcu_dereference(c->copygc_thread);
if (t)
get_task_struct(t);
rcu_read_unlock();
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
put_task_struct(t);
}
} }
static int bch2_copygc_thread(void *arg) static int bch2_copygc_thread(void *arg)
View File
@ -186,6 +186,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \ OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \ NULL, NULL) \
x(checksum_err_retry_nr, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(0, 32), \
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \ x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \ OPT_FN(bch2_opt_compression), \
View File
@ -26,9 +26,8 @@
/* bch_extent_rebalance: */ /* bch_extent_rebalance: */
static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{ {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry; const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry) bkey_extent_entry_for_each(ptrs, entry)
@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
return NULL; return NULL;
} }
static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
struct bch_io_opts *opts, struct bch_io_opts *opts,
struct bkey_s_c k, struct bkey_s_c k,
@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{ {
const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
if (!opts) if (!opts)
return 0; return 0;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry; const union bch_extent_entry *entry;
struct extent_ptr_decoded p; struct extent_ptr_decoded p;
u64 sectors = 0; u64 sectors = 0;
@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{ {
printbuf_tabstop_push(out, 32);
struct bch_fs_rebalance *r = &c->rebalance; struct bch_fs_rebalance *r = &c->rebalance;
/* print pending work */
struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
u64 v;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
prt_printf(out, "pending work:\t");
prt_human_readable_u64(out, v);
prt_printf(out, "\n\n");
prt_str(out, bch2_rebalance_state_strs[r->state]); prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out); prt_newline(out);
printbuf_indent_add(out, 2); printbuf_indent_add(out, 2);
@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
case BCH_REBALANCE_waiting: { case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now); u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: "); prt_printf(out, "io wait duration:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out); prt_newline(out);
prt_str(out, "io wait remaining: "); prt_printf(out, "io wait remaining:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out); prt_newline(out);
prt_str(out, "duration waited: "); prt_printf(out, "duration waited:\t");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out); prt_newline(out);
break; break;
@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
break; break;
} }
prt_newline(out); prt_newline(out);
rcu_read_lock();
struct task_struct *t = rcu_dereference(c->rebalance.thread);
if (t)
get_task_struct(t);
rcu_read_unlock();
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
put_task_struct(t);
}
printbuf_indent_sub(out, 2); printbuf_indent_sub(out, 2);
} }
View File
@ -899,7 +899,7 @@ use_clean:
* journal sequence numbers: * journal sequence numbers:
*/ */
if (!c->sb.clean) if (!c->sb.clean)
journal_seq += 8; journal_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) { if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
View File
@ -22,6 +22,7 @@ enum counters_flags {
x(io_move_write, 36, TYPE_SECTORS) \ x(io_move_write, 36, TYPE_SECTORS) \
x(io_move_finish, 37, TYPE_SECTORS) \ x(io_move_finish, 37, TYPE_SECTORS) \
x(io_move_fail, 38, TYPE_COUNTER) \ x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \
View File

@ -12,7 +12,6 @@
#include "super.h" #include "super.h"
#include <linux/crc32c.h> #include <linux/crc32c.h>
#include <crypto/hash.h>
#include <crypto/sha2.h> #include <crypto/sha2.h>
static inline enum bch_str_hash_type static inline enum bch_str_hash_type
@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
}; };
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE]; u8 digest[SHA256_DIGEST_SIZE];
desc->tfm = c->sha256; sha256((const u8 *)&bi->bi_hash_seed,
sizeof(bi->bi_hash_seed), digest);
crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
} }
View File
@ -365,10 +365,9 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0; return 0;
} }
static int bch2_sb_validate(struct bch_sb_handle *disk_sb, int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
enum bch_validate_flags flags, struct printbuf *out) enum bch_validate_flags flags, struct printbuf *out)
{ {
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi; struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id; enum bch_opt_id opt_id;
int ret; int ret;
@ -377,15 +376,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (ret) if (ret)
return ret; return ret;
if (sb->features[1] || u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
(le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { unsigned incompat_bit = 0;
prt_printf(out, "Filesystem has incompatible features"); if (incompat)
incompat_bit = __ffs64(incompat);
else if (sb->features[1])
incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
if (incompat_bit) {
prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
incompat_bit,
bch2_sb_features[BCH_FEATURE_NR - 1],
BCH_FEATURE_NR - 1);
return -BCH_ERR_invalid_sb_features; return -BCH_ERR_invalid_sb_features;
} }
if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
prt_printf(out, "Filesystem has incompatible version"); prt_str(out, "Filesystem has incompatible version ");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_str(out, ", current version ");
bch2_version_to_text(out, bcachefs_metadata_version_current);
return -BCH_ERR_invalid_sb_features; return -BCH_ERR_invalid_sb_features;
} }
@ -399,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_uuid; return -BCH_ERR_invalid_sb_uuid;
} }
if (!(flags & BCH_VALIDATE_write) &&
le64_to_cpu(sb->offset) != read_offset) {
prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
le64_to_cpu(sb->offset), read_offset);
return -BCH_ERR_invalid_sb_offset;
}
if (!sb->nr_devices || if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) { sb->nr_devices > BCH_SB_MEMBERS_MAX) {
prt_printf(out, "Bad number of member devices %u (max %u)", prt_printf(out, "Bad number of member devices %u (max %u)",
@ -457,6 +475,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
!BCH_SB_CSUM_ERR_RETRY_NR(sb))
SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
} }
#ifdef __KERNEL__ #ifdef __KERNEL__
@ -874,7 +896,7 @@ got_super:
sb->have_layout = true; sb->have_layout = true;
ret = bch2_sb_validate(sb, 0, &err); ret = bch2_sb_validate(sb->sb, offset, 0, &err);
if (ret) { if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf); path, err.buf);
@ -1031,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) { darray_for_each(online_devices, ca) {
printbuf_reset(&err); printbuf_reset(&err);
ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
if (ret) { if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out; goto out;
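The read-side offset check above is the change this update is named after: when a superblock is read (but not when one is being written out), the offset field stored inside it must match the location it was actually read from, which should catch a superblock that ended up at the wrong slot, e.g. via a raw copy. A userspace sketch of the check, with stand-in types and a plain uint64_t instead of the on-disk little-endian field:

#include <stdint.h>
#include <stdio.h>

/* Stand-in: a plain uint64_t instead of the on-disk __le64 field */
struct sb_stub {
	uint64_t offset;	/* sector this superblock claims to live at */
};

static int validate_sb_offset(const struct sb_stub *sb, uint64_t read_offset,
			      int writing)
{
	/*
	 * Only meaningful on read: at write time the field is about to be
	 * set to the destination slot anyway.
	 */
	if (!writing && sb->offset != read_offset) {
		fprintf(stderr, "bad sb offset (got %llu, read from %llu)\n",
			(unsigned long long) sb->offset,
			(unsigned long long) read_offset);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct sb_stub sb = { .offset = 8 };

	printf("read at 8:  %d\n", validate_sb_offset(&sb, 8, 0));
	printf("read at 16: %d\n", validate_sb_offset(&sb, 16, 0));
	return 0;
}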
View File
@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *); void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned); int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *); int bch2_write_super(struct bch_fs *);
View File
@ -75,9 +75,6 @@
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem"); MODULE_DESCRIPTION("bcachefs filesystem");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: crc64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash"); MODULE_SOFTDEP("pre: xxhash");
@ -1838,7 +1835,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_late; goto err_late;
up_write(&c->state_lock); up_write(&c->state_lock);
return 0; out:
printbuf_exit(&label);
printbuf_exit(&errbuf);
bch_err_fn(c, ret);
return ret;
err_unlock: err_unlock:
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
@ -1847,10 +1848,7 @@ err:
if (ca) if (ca)
bch2_dev_free(ca); bch2_dev_free(ca);
bch2_free_super(&sb); bch2_free_super(&sb);
printbuf_exit(&label); goto out;
printbuf_exit(&errbuf);
bch_err_fn(c, ret);
return ret;
err_late: err_late:
up_write(&c->state_lock); up_write(&c->state_lock);
ca = NULL; ca = NULL;
View File
@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup); write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates); write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos); read_attribute(gc_gens_pos);
write_attribute(read_fua_test);
read_attribute(uuid); read_attribute(uuid);
read_attribute(minor); read_attribute(minor);
@ -395,6 +396,71 @@ SHOW(bch2_fs)
return 0; return 0;
} }
static int read_fua_test(struct bch_fs *c)
{
int ret = 0;
unsigned bs = 4096;
struct bio *bio;
void *buf;
struct bch_dev *ca = bch2_dev_get_ioref(c, 0, READ);
if (!ca)
return -EINVAL;
bio = bio_kmalloc(1, GFP_KERNEL);
if (!bio) {
ret = -ENOMEM;
goto err;
}
buf = kmalloc(bs, GFP_KERNEL);
if (!buf)
goto err;
u64 start = ktime_get_ns();
for (unsigned i = 0; i < 1000; i++) {
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
bch2_bio_map(bio, buf, bs);
ret = submit_bio_wait(bio);
if (ret)
goto err;
}
u64 ns_nofua = ktime_get_ns() - start;
start = ktime_get_ns();
for (unsigned i = 0; i < 1000; i++) {
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
bch2_bio_map(bio, buf, bs);
ret = submit_bio_wait(bio);
if (ret)
goto err;
}
u64 ns_fua = ktime_get_ns() - start;
u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
start = ktime_get_ns();
for (unsigned i = 0; i < 1000; i++) {
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
bio->bi_iter.bi_sector = (get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
bch2_bio_map(bio, buf, bs);
ret = submit_bio_wait(bio);
if (ret)
goto err;
}
u64 ns_rand = ktime_get_ns() - start;
pr_info("ns nofua %llu", ns_nofua);
pr_info("ns fua %llu", ns_fua);
pr_info("ns random %llu", ns_rand);
err:
kfree(buf);
kfree(bio);
percpu_ref_put(&ca->io_ref);
bch_err_fn(c, ret);
return ret;
}
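read_fua_test() times three 1000-iteration loops (sequential reads, FUA reads, and random-offset reads) and logs the nanosecond totals, so the relative cost of REQ_FUA can be judged against ordinary and seek-bound reads. The random loop draws a byte offset below the device size, aligns it down to the 4096-byte block size, and converts it to 512-byte sectors; a sketch of that arithmetic (userspace, rand() standing in for get_random_u64_below(), power-of-two block size assumed):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Align a random byte offset down to bs (assumed power of two) and
 * convert it to 512-byte sectors, as the random-read loop does.
 */
static uint64_t random_aligned_sector(uint64_t dev_bytes, uint64_t bs)
{
	uint64_t off = (uint64_t) rand() % dev_bytes;	/* stand-in for get_random_u64_below() */

	return (off & ~(bs - 1)) >> 9;
}

int main(void)
{
	uint64_t dev_bytes = 1ULL << 30;	/* pretend 1 GiB device */

	for (int i = 0; i < 4; i++) {
		uint64_t sector = random_aligned_sector(dev_bytes, 4096);

		/* sector << 9 is a multiple of 4096 and below dev_bytes */
		printf("sector %llu (byte offset %llu)\n",
		       (unsigned long long) sector,
		       (unsigned long long) (sector << 9));
	}
	return 0;
}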
STORE(bch2_fs) STORE(bch2_fs)
{ {
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -451,6 +517,9 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_freelist_wakeup) if (attr == &sysfs_trigger_freelist_wakeup)
closure_wake_up(&c->freelist_wait); closure_wake_up(&c->freelist_wait);
if (attr == &sysfs_read_fua_test)
read_fua_test(c);
#ifdef CONFIG_BCACHEFS_TESTS #ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) { if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@ -580,6 +649,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_btree_key_cache_shrink, &sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup, &sysfs_trigger_freelist_wakeup,
&sysfs_trigger_btree_updates, &sysfs_trigger_btree_updates,
&sysfs_read_fua_test,
&sysfs_gc_gens_pos, &sysfs_gc_gens_pos,
View File
@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail,
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_write_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_start_fail, DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
View File
@ -653,21 +653,6 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
return 0; return 0;
} }
size_t bch2_rand_range(size_t max)
{
size_t rand;
if (!max)
return 0;
do {
rand = get_random_long();
rand &= roundup_pow_of_two(max) - 1;
} while (rand >= max);
return rand;
}
void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
{ {
struct bio_vec bv; struct bio_vec bv;
@ -698,6 +683,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
} }
} }
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_corrupt_bio(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
bio_for_each_segment(bv, bio, iter) {
unsigned u64s = bv.bv_len / sizeof(u64);
if (offset < u64s) {
u64 *segment = bvec_kmap_local(&bv);
segment[offset] = get_random_u64();
kunmap_local(segment);
return;
}
offset -= u64s;
}
}
#endif
#if 0 #if 0
void eytzinger1_test(void) void eytzinger1_test(void)
{ {
View File
@ -401,11 +401,21 @@ do { \
_ret; \ _ret; \
}) })
size_t bch2_rand_range(size_t);
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter); void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_corrupt_bio(struct bio *);
static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
{
if (ratio && !get_random_u32_below(ratio))
bch2_corrupt_bio(bio);
}
#else
#define bch2_maybe_corrupt_bio(...) do {} while (0)
#endif
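Taken together, bch2_maybe_corrupt_bio() fires bch2_corrupt_bio() with probability 1/ratio, and the latter picks one u64 slot uniformly across the whole bio, walking segment by segment until the chosen slot falls inside one, then overwrites it with random data. A userspace analogue over plain buffers (illustrative names, no bio machinery):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct seg {
	uint64_t	*data;
	size_t		u64s;
};

/*
 * Pick one u64 slot uniformly across all segments and scribble on it,
 * mirroring the segment walk in bch2_corrupt_bio().
 */
static void corrupt_one_u64(struct seg *segs, unsigned nr, size_t total_u64s)
{
	size_t slot = (size_t) rand() % total_u64s;

	for (unsigned i = 0; i < nr; i++) {
		if (slot < segs[i].u64s) {
			segs[i].data[slot] = ((uint64_t) rand() << 32) | rand();
			return;
		}
		slot -= segs[i].u64s;
	}
}

int main(void)
{
	uint64_t a[4] = { 0 }, b[8] = { 0 };
	struct seg segs[] = { { a, 4 }, { b, 8 } };

	corrupt_one_u64(segs, 2, 4 + 8);

	for (int i = 0; i < 4; i++)
		printf("a[%d] = %llu\n", i, (unsigned long long) a[i]);
	for (int i = 0; i < 8; i++)
		printf("b[%d] = %llu\n", i, (unsigned long long) b[i]);
	return 0;
}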
static inline void memcpy_u64s_small(void *dst, const void *src, static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s) unsigned u64s)
{ {