Mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-03-27 00:00:04 +03:00)
Update bcachefs sources to 4d28432bcc5f bcachefs: Validate bch_sb.offset field

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

commit c0836924b1, parent f42ee45c6e
Changed files:

.bcachefs_revision
include/
libbcachefs/: alloc_background.c, bcachefs.h, bcachefs_format.h, btree_cache.c,
btree_io.c, btree_iter.c, btree_update.h, btree_write_buffer.c, buckets.h,
checksum.c, checksum.h, data_update.c, ec.c, errcode.h, extents.c, extents.h,
extents_types.h, fs-io-buffered.c, fs.c, inode.c, io_read.c, io_read.h,
io_write.c, journal_io.c, move.c, movinggc.c, opts.h, rebalance.c, recovery.c,
sb-counters_format.h, str_hash.h, super-io.c, super-io.h, super.c, sysfs.c,
trace.h, util.c, util.h
.bcachefs_revision
@@ -1 +1 @@
-46af7258b951a79a66511172ab8772ad2dfaa4e3
+4d28432bcc5f91caf053f64a1cde1a6286adf4a6
@@ -7,6 +7,7 @@
#define _CRYPTO_SHA_H

#include <linux/types.h>
#include <sodium/crypto_hash_sha256.h>

#define SHA1_DIGEST_SIZE	20
#define SHA1_BLOCK_SIZE		64
@@ -112,4 +113,9 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
			       unsigned int len, u8 *hash);

static inline void sha256(const u8 *data, unsigned int len, u8 *out)
{
	crypto_hash_sha256(out, data, len);
}

#endif
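The new sha256() helper simply forwards to libsodium's crypto_hash_sha256(). A minimal userspace sketch of the same call, assuming libsodium is installed and linked with -lsodium (the buffer names are illustrative):

#include <sodium/crypto_hash_sha256.h>
#include <stdio.h>

int main(void)
{
	const unsigned char msg[] = "hello";
	unsigned char digest[crypto_hash_sha256_BYTES];	/* 32 bytes */

	/* same call the sha256() wrapper above makes */
	crypto_hash_sha256(digest, msg, sizeof(msg) - 1);

	for (unsigned i = 0; i < sizeof(digest); i++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}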
@@ -82,4 +82,71 @@ static inline s64 div_s64(s64 dividend, s32 divisor)
	return div_s64_rem(dividend, divisor, &remainder);
}

#ifndef mul_u32_u32
/*
 * Many a GCC version messes this up and generates a 64x64 mult :-(
 */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	return (u64)a * b;
}
#endif

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)

#ifndef mul_u64_u64_shr
static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
{
	return (u64)(((unsigned __int128)a * mul) >> shift);
}
#endif /* mul_u64_u64_shr */

#else

#ifndef mul_u64_u64_shr
static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
{
	union {
		u64 ll;
		struct {
#ifdef __BIG_ENDIAN
			u32 high, low;
#else
			u32 low, high;
#endif
		} l;
	} rl, rm, rn, rh, a0, b0;
	u64 c;

	a0.ll = a;
	b0.ll = b;

	rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
	rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
	rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
	rh.ll = mul_u32_u32(a0.l.high, b0.l.high);

	/*
	 * Each of these lines computes a 64-bit intermediate result into "c",
	 * starting at bits 32-95. The low 32-bits go into the result of the
	 * multiplication, the high 32-bits are carried into the next step.
	 */
	rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low;
	rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low;
	rh.l.high = (c >> 32) + rh.l.high;

	/*
	 * The 128-bit result of the multiplication is in rl.ll and rh.ll,
	 * shift it right and throw away the high part of the result.
	 */
	if (shift == 0)
		return rl.ll;
	if (shift < 64)
		return (rl.ll >> shift) | (rh.ll << (64 - shift));
	return rh.ll >> (shift & 63);
}
#endif /* mul_u64_u64_shr */

#endif

#endif /* _LINUX_MATH64_H */
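The fallback above composes the 128-bit product from four 32-bit partial products. As a quick sanity check of what the helper computes, here is a minimal userspace sketch (assuming a compiler with unsigned __int128; the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* same semantics as mul_u64_u64_shr(): the 128-bit product of a and b, shifted right */
static uint64_t mul_u64_u64_shr_ref(uint64_t a, uint64_t b, unsigned shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

int main(void)
{
	uint64_t range  = 1000;
	uint64_t rand64 = 0x123456789abcdef0ULL;	/* stand-in for a random word */

	/* shift == 64 keeps only the high half: a value in [0, range) */
	printf("%llu\n",
	       (unsigned long long)mul_u64_u64_shr_ref(range, rand64, 64));
	return 0;
}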
@@ -9,7 +9,9 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/math64.h>

#ifdef SYS_getrandom
static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
@@ -67,4 +69,19 @@ static inline u32 get_random_u32_below(u32 ceil)
	}
}

static inline u64 get_random_u64_below(u64 ceil)
{
	if (ceil <= 1)
		return 0;
	if (ceil <= U32_MAX)
		return get_random_u32_below(ceil);

	for (;;) {
		u64 rand = get_random_u64();
		u64 mult = ceil * rand;

		if (likely(mult >= -ceil % ceil))
			return mul_u64_u64_shr(ceil, rand, 64);
	}
}

#endif /* _LINUX_RANDOM_H */
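get_random_u64_below() is the multiply-shift bounded-random technique (often attributed to Lemire): the high 64 bits of ceil * rand are uniform in [0, ceil) once samples whose low 64 bits fall below 2^64 mod ceil are rejected. A userspace sketch of the same idea, assuming unsigned __int128 support; rand64() here is a hypothetical stand-in for a real entropy source:

#include <stdint.h>
#include <stdlib.h>

/* hypothetical entropy source - not cryptographic, illustration only */
static uint64_t rand64(void)
{
	return ((uint64_t)rand() << 32) ^ (uint64_t)rand();
}

static uint64_t random_below(uint64_t ceil)
{
	if (ceil <= 1)
		return 0;

	for (;;) {
		uint64_t r = rand64();
		unsigned __int128 prod = (unsigned __int128)ceil * r;

		/*
		 * Reject when the low half is below 2^64 % ceil (written as
		 * -ceil % ceil in unsigned arithmetic); otherwise the high
		 * half is an unbiased value in [0, ceil).
		 */
		if ((uint64_t)prod >= -ceil % ceil)
			return (uint64_t)(prod >> 64);
	}
}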
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
-			 c, alloc_v2_unpack_error,
+			 c, alloc_v3_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
@@ -979,7 +979,6 @@ struct bch_fs {
	mempool_t		compress_workspace[BCH_COMPRESSION_OPT_NR];
	size_t			zstd_workspace_size;

-	struct crypto_shash	*sha256;
	struct crypto_sync_skcipher *chacha20;
	struct crypto_shash	*poly1305;

@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
					struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR,	struct bch_sb, flags[6], 14, 20);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
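Each LE64_BITMASK() line above reserves a bit range inside one of the superblock's little-endian flags words and generates a getter/setter pair for it; BCH_SB_CSUM_ERR_RETRY_NR, for instance, occupies bits 14-19 of flags[6]. The helpers below are an illustration of what such an accessor reduces to, not the actual macro from bcachefs_format.h:

#include <stdint.h>

/* read bits [offset, end) of a 64-bit flags word; assumes 0 < end - offset <= 64 */
static inline uint64_t bitmask_get(uint64_t flags, unsigned offset, unsigned end)
{
	return (flags >> offset) & (~0ULL >> (64 - (end - offset)));
}

/* write v into bits [offset, end), leaving the other bits untouched */
static inline uint64_t bitmask_set(uint64_t flags, unsigned offset, unsigned end,
				   uint64_t v)
{
	uint64_t mask = (~0ULL >> (64 - (end - offset))) << offset;

	return (flags & ~mask) | ((v << offset) & mask);
}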
@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
+		cond_resched();
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
@ -2080,11 +2080,6 @@ static void btree_node_write_work(struct work_struct *work)
|
||||
container_of(work, struct btree_write_bio, work);
|
||||
struct bch_fs *c = wbio->wbio.c;
|
||||
struct btree *b = wbio->wbio.bio.bi_private;
|
||||
unsigned commit_flags =
|
||||
BCH_WATERMARK_interior_updates|
|
||||
BCH_TRANS_COMMIT_journal_reclaim|
|
||||
BCH_TRANS_COMMIT_no_enospc|
|
||||
BCH_TRANS_COMMIT_no_check_rw;
|
||||
u64 start_time = wbio->start_time;
|
||||
int ret = 0;
|
||||
|
||||
@ -2093,24 +2088,38 @@ static void btree_node_write_work(struct work_struct *work)
|
||||
wbio->wbio.used_mempool,
|
||||
wbio->data);
|
||||
|
||||
if (wbio->wbio.failed.nr) {
|
||||
ret = bch2_trans_do(c,
|
||||
bch2_btree_node_rewrite_key_get_iter(trans, b,
|
||||
commit_flags));
|
||||
} else if (!wbio->wbio.first_btree_write) {
|
||||
bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
|
||||
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
|
||||
|
||||
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
|
||||
ret = -BCH_ERR_btree_node_write_all_failed;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (wbio->wbio.first_btree_write) {
|
||||
if (wbio->wbio.failed.nr) {
|
||||
|
||||
}
|
||||
} else {
|
||||
ret = bch2_trans_do(c,
|
||||
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
|
||||
commit_flags, true));
|
||||
BCH_WATERMARK_interior_updates|
|
||||
BCH_TRANS_COMMIT_journal_reclaim|
|
||||
BCH_TRANS_COMMIT_no_enospc|
|
||||
BCH_TRANS_COMMIT_no_check_rw,
|
||||
!wbio->wbio.failed.nr));
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
set_btree_node_noevict(b);
|
||||
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
|
||||
"writing btree node: %s", bch2_err_str(ret));
|
||||
}
|
||||
|
||||
out:
|
||||
bio_put(&wbio->wbio.bio);
|
||||
btree_node_write_done(c, b, start_time);
|
||||
return;
|
||||
err:
|
||||
set_btree_node_noevict(b);
|
||||
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
|
||||
"writing btree node: %s", bch2_err_str(ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
static void btree_node_write_endio(struct bio *bio)
|
||||
|
@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
|
||||
bch2_btree_node_iter_peek_all(&l->iter, l->b));
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
|
||||
struct btree_path *path,
|
||||
struct btree_path_level *l,
|
||||
struct bkey *u)
|
||||
{
|
||||
struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
|
||||
bch2_btree_node_iter_peek(&l->iter, l->b));
|
||||
|
||||
path->pos = k.k ? k.k->p : l->b->key.k.p;
|
||||
trans->paths_sorted = false;
|
||||
bch2_btree_path_verify_level(trans, path, l - path->l);
|
||||
return k;
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
|
||||
struct btree_path *path,
|
||||
struct btree_path_level *l,
|
||||
|
@ -126,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
|
||||
|
||||
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
|
||||
|
||||
int bch2_btree_write_buffer_insert_err(struct btree_trans *,
|
||||
enum btree_id, struct bkey_i *);
|
||||
|
||||
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
|
||||
enum btree_id btree,
|
||||
struct bkey_i *k)
|
||||
{
|
||||
if (unlikely(!btree_type_uses_write_buffer(btree))) {
|
||||
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
|
||||
dump_stack();
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* Most updates skip the btree write buffer until journal replay is
|
||||
* finished because synchronization with journal replay relies on having
|
||||
|
@ -264,6 +264,22 @@ out:
|
||||
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
|
||||
}
|
||||
|
||||
int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
|
||||
enum btree_id btree, struct bkey_i *k)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
|
||||
prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
|
||||
bch2_btree_id_to_text(&buf, btree);
|
||||
prt_str(&buf, "\n");
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
|
||||
|
||||
bch2_fs_inconsistent(c, "%s", buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
return -EROFS;
|
||||
}
|
||||
|
||||
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
@ -312,7 +328,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
|
||||
darray_for_each(wb->sorted, i) {
|
||||
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
|
||||
|
||||
BUG_ON(!btree_type_uses_write_buffer(k->btree));
|
||||
if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
|
||||
ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
|
||||
goto err;
|
||||
}
|
||||
|
||||
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
|
||||
prefetch(&wb->flushing.keys.data[n->idx]);
|
||||
|
@@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)

static inline int gen_after(u8 a, u8 b)
{
-	int r = gen_cmp(a, b);
-
-	return r > 0 ? r : 0;
+	return max(0, gen_cmp(a, b));
}

static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
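gen_after() reports how far generation a is ahead of b (zero if it is not ahead), relying on gen_cmp() doing a wraparound-safe signed 8-bit comparison. A standalone sketch of that idea, with the assumption about gen_cmp() made explicit since its body is not part of this hunk:

#include <stdint.h>

/* assumed behaviour of gen_cmp(): signed difference of two 8-bit generations */
static inline int gen_cmp_sketch(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b);	/* a = 1, b = 255 compares as "a is 2 newer" */
}

static inline int gen_after_sketch(uint8_t a, uint8_t b)
{
	int cmp = gen_cmp_sketch(a, b);

	return cmp > 0 ? cmp : 0;	/* distance a is ahead of b, else 0 */
}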
@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
/*
|
||||
* This seems to be duplicating code in cmd_remove_passphrase() in
|
||||
* bcachefs-tools, but we might want to switch userspace to use this - and
|
||||
* perhaps add an ioctl for calling this at runtime, so we can take the
|
||||
* passphrase off of a mounted filesystem (which has come up).
|
||||
*/
|
||||
int bch2_disable_encryption(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
@ -725,6 +733,10 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* For enabling encryption on an existing filesystem: not hooked up yet, but it
|
||||
* should be
|
||||
*/
|
||||
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
||||
{
|
||||
struct bch_encrypted_key key;
|
||||
@ -781,6 +793,7 @@ err:
|
||||
memzero_explicit(&key, sizeof(key));
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
void bch2_fs_encryption_exit(struct bch_fs *c)
|
||||
{
|
||||
@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
|
||||
crypto_free_shash(c->poly1305);
|
||||
if (c->chacha20)
|
||||
crypto_free_sync_skcipher(c->chacha20);
|
||||
if (c->sha256)
|
||||
crypto_free_shash(c->sha256);
|
||||
}
|
||||
|
||||
int bch2_fs_encryption_init(struct bch_fs *c)
|
||||
@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
|
||||
struct bch_key key;
|
||||
int ret = 0;
|
||||
|
||||
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
|
||||
ret = PTR_ERR_OR_ZERO(c->sha256);
|
||||
if (ret) {
|
||||
c->sha256 = NULL;
|
||||
bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
|
||||
if (!crypt)
|
||||
goto out;
|
||||
|
@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
|
||||
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
||||
struct bch_key *);
|
||||
|
||||
#if 0
|
||||
int bch2_disable_encryption(struct bch_fs *);
|
||||
int bch2_enable_encryption(struct bch_fs *, bool);
|
||||
#endif
|
||||
|
||||
void bch2_fs_encryption_exit(struct bch_fs *);
|
||||
int bch2_fs_encryption_init(struct bch_fs *);
|
||||
|
@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
|
||||
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
||||
}
|
||||
|
||||
static bool can_allocate_without_blocking(struct bch_fs *c,
|
||||
struct data_update *m)
|
||||
{
|
||||
if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
|
||||
return false;
|
||||
|
||||
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
|
||||
? m->op.target
|
||||
: 0;
|
||||
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
|
||||
|
||||
darray_for_each(m->op.devs_have, i)
|
||||
__clear_bit(*i, devs.d);
|
||||
|
||||
rcu_read_lock();
|
||||
unsigned nr_replicas = 0, i;
|
||||
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
|
||||
struct bch_dev *ca = bch2_dev_rcu(c, i);
|
||||
|
||||
struct bch_dev_usage usage;
|
||||
bch2_dev_usage_read_fast(ca, &usage);
|
||||
|
||||
if (!dev_buckets_free(ca, usage, m->op.watermark))
|
||||
continue;
|
||||
|
||||
nr_replicas += ca->mi.durability;
|
||||
if (nr_replicas >= m->op.nr_replicas)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return nr_replicas >= m->op.nr_replicas;
|
||||
}
|
||||
|
||||
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
@ -700,22 +666,49 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
|
||||
}
|
||||
|
||||
rbio_init(&m->rbio.bio, c, *io_opts, NULL);
|
||||
m->rbio.data_update = true;
|
||||
m->rbio.bio.bi_iter.bi_size = buf_bytes;
|
||||
m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
|
||||
m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool can_write_extent(struct bch_fs *c,
|
||||
struct bch_devs_list *devs_have,
|
||||
unsigned target)
|
||||
static int can_write_extent(struct bch_fs *c, struct data_update *m)
|
||||
{
|
||||
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
|
||||
unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
|
||||
return -BCH_ERR_data_update_done_would_block;
|
||||
|
||||
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
|
||||
? m->op.target
|
||||
: 0;
|
||||
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
|
||||
|
||||
darray_for_each(*devs_have, i)
|
||||
darray_for_each(m->op.devs_have, i)
|
||||
__clear_bit(*i, devs.d);
|
||||
|
||||
return !bch2_is_zero(&devs, sizeof(devs));
|
||||
rcu_read_lock();
|
||||
unsigned nr_replicas = 0, i;
|
||||
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
|
||||
struct bch_dev *ca = bch2_dev_rcu(c, i);
|
||||
|
||||
struct bch_dev_usage usage;
|
||||
bch2_dev_usage_read_fast(ca, &usage);
|
||||
|
||||
if (!dev_buckets_free(ca, usage, m->op.watermark))
|
||||
continue;
|
||||
|
||||
nr_replicas += ca->mi.durability;
|
||||
if (nr_replicas >= m->op.nr_replicas)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!nr_replicas)
|
||||
return -BCH_ERR_data_update_done_no_rw_devs;
|
||||
if (nr_replicas < m->op.nr_replicas)
|
||||
return -BCH_ERR_insufficient_devices;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_data_update_init(struct btree_trans *trans,
|
||||
@ -799,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans,
|
||||
ptr_bit <<= 1;
|
||||
}
|
||||
|
||||
if (!can_write_extent(c, &m->op.devs_have,
|
||||
m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
|
||||
/*
|
||||
* Check if we have rw devices not in devs_have: this can happen
|
||||
* if we're trying to move data on a ro or failed device
|
||||
*
|
||||
* If we can't move it, we need to clear the rebalance_work bit,
|
||||
* if applicable
|
||||
*
|
||||
* Also, copygc should skip ro/failed devices:
|
||||
*/
|
||||
return -BCH_ERR_data_update_done_no_rw_devs;
|
||||
}
|
||||
|
||||
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
|
||||
|
||||
/*
|
||||
@ -852,11 +831,22 @@ int bch2_data_update_init(struct btree_trans *trans,
|
||||
goto out_bkey_buf_exit;
|
||||
}
|
||||
|
||||
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
|
||||
!can_allocate_without_blocking(c, m)) {
|
||||
ret = -BCH_ERR_data_update_done_would_block;
|
||||
/*
|
||||
* Check if the allocation will succeed, to avoid getting an error later
|
||||
* in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
|
||||
* read:
|
||||
*
|
||||
* This guards against
|
||||
* - BCH_WRITE_alloc_nowait allocations failing (promotes)
|
||||
* - Destination target full
|
||||
* - Device(s) in destination target offline
|
||||
* - Insufficient durability available in destination target
|
||||
* (i.e. trying to move a durability=2 replica to a target with a
|
||||
* single durability=2 device)
|
||||
*/
|
||||
ret = can_write_extent(c, m);
|
||||
if (ret)
|
||||
goto out_bkey_buf_exit;
|
||||
}
|
||||
|
||||
if (reserve_sectors) {
|
||||
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
|
||||
|
@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
|
||||
{
|
||||
m->sectors = le16_to_cpu(s->sectors);
|
||||
m->algorithm = s->algorithm;
|
||||
m->nr_blocks = s->nr_blocks;
|
||||
m->nr_redundant = s->nr_redundant;
|
||||
m->disk_label = s->disk_label;
|
||||
m->blocks_nonempty = 0;
|
||||
|
||||
for (unsigned i = 0; i < s->nr_blocks; i++)
|
||||
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
|
||||
}
|
||||
|
||||
int bch2_trigger_stripe(struct btree_trans *trans,
|
||||
enum btree_id btree, unsigned level,
|
||||
struct bkey_s_c old, struct bkey_s _new,
|
||||
@ -1320,6 +1307,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
||||
if (s->err) {
|
||||
if (!bch2_err_matches(s->err, EROFS))
|
||||
bch_err(c, "error creating stripe: error writing data buckets");
|
||||
ret = s->err;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -1328,6 +1316,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
||||
|
||||
if (ec_do_recov(c, &s->existing_stripe)) {
|
||||
bch_err(c, "error creating stripe: error reading existing stripe");
|
||||
ret = -BCH_ERR_ec_block_read;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -1353,6 +1342,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
||||
|
||||
if (ec_nr_failed(&s->new_stripe)) {
|
||||
bch_err(c, "error creating stripe: error writing redundancy buckets");
|
||||
ret = -BCH_ERR_ec_block_write;
|
||||
goto err;
|
||||
}
|
||||
|
||||
|
@ -231,6 +231,7 @@
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_csum) \
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_offset) \
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
|
||||
@ -273,21 +274,25 @@
|
||||
x(EIO, stripe_reconstruct) \
|
||||
x(EIO, key_type_error) \
|
||||
x(EIO, extent_poisened) \
|
||||
x(EIO, no_device_to_read_from) \
|
||||
x(EIO, missing_indirect_extent) \
|
||||
x(EIO, invalidate_stripe_to_dev) \
|
||||
x(EIO, no_encryption_key) \
|
||||
x(EIO, insufficient_journal_devices) \
|
||||
x(EIO, device_offline) \
|
||||
x(EIO, EIO_fault_injected) \
|
||||
x(EIO, ec_block_read) \
|
||||
x(EIO, ec_block_write) \
|
||||
x(EIO, data_read) \
|
||||
x(BCH_ERR_data_read, no_device_to_read_from) \
|
||||
x(BCH_ERR_data_read, data_read_io_err) \
|
||||
x(BCH_ERR_data_read, data_read_csum_err) \
|
||||
x(BCH_ERR_data_read, data_read_retry) \
|
||||
x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \
|
||||
x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
|
||||
x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
|
||||
x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
|
||||
x(BCH_ERR_data_read, data_read_decompress_err) \
|
||||
x(BCH_ERR_data_read, data_read_decrypt_err) \
|
||||
x(BCH_ERR_data_read, data_read_ptr_stale_race) \
|
||||
|
@ -28,6 +28,8 @@
|
||||
#include "trace.h"
|
||||
#include "util.h"
|
||||
|
||||
#include <linux/random.h>
|
||||
|
||||
static const char * const bch2_extent_flags_strs[] = {
|
||||
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
|
||||
BCH_EXTENT_FLAGS()
|
||||
@ -94,38 +96,30 @@ static inline int dev_failed(struct bch_dev *ca)
|
||||
*/
|
||||
static inline bool ptr_better(struct bch_fs *c,
|
||||
const struct extent_ptr_decoded p1,
|
||||
const struct extent_ptr_decoded p2)
|
||||
u64 p1_latency,
|
||||
struct bch_dev *ca1,
|
||||
const struct extent_ptr_decoded p2,
|
||||
u64 p2_latency)
|
||||
{
|
||||
if (likely(!p1.do_ec_reconstruct &&
|
||||
!p2.do_ec_reconstruct)) {
|
||||
struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
|
||||
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
|
||||
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
|
||||
|
||||
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
|
||||
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
|
||||
if (unlikely(failed_delta))
|
||||
return failed_delta < 0;
|
||||
|
||||
if (failed_delta)
|
||||
return failed_delta < 0;
|
||||
|
||||
u64 l1 = dev_latency(ca1);
|
||||
u64 l2 = dev_latency(ca2);
|
||||
|
||||
/*
|
||||
* Square the latencies, to bias more in favor of the faster
|
||||
* device - we never want to stop issuing reads to the slower
|
||||
* device altogether, so that we can update our latency numbers:
|
||||
*/
|
||||
l1 *= l1;
|
||||
l2 *= l2;
|
||||
|
||||
/* Pick at random, biased in favor of the faster device: */
|
||||
|
||||
return bch2_rand_range(l1 + l2) > l1;
|
||||
}
|
||||
|
||||
if (bch2_force_reconstruct_read)
|
||||
if (unlikely(bch2_force_reconstruct_read))
|
||||
return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
|
||||
|
||||
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
|
||||
if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
|
||||
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
|
||||
|
||||
int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
|
||||
if (unlikely(crc_retry_delta))
|
||||
return crc_retry_delta < 0;
|
||||
|
||||
/* Pick at random, biased in favor of the faster device: */
|
||||
|
||||
return get_random_u64_below(p1_latency + p2_latency) > p1_latency;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -138,86 +132,105 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
|
||||
struct extent_ptr_decoded *pick,
|
||||
int dev)
|
||||
{
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
const union bch_extent_entry *entry;
|
||||
struct extent_ptr_decoded p;
|
||||
struct bch_dev_io_failures *f;
|
||||
unsigned csum_retry = 0;
|
||||
bool have_csum_retries = false;
|
||||
int ret = 0;
|
||||
bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
|
||||
bool have_dirty_ptrs = false, have_pick = false;
|
||||
|
||||
if (k.k->type == KEY_TYPE_error)
|
||||
return -BCH_ERR_key_type_error;
|
||||
|
||||
if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
|
||||
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
|
||||
return -BCH_ERR_extent_poisened;
|
||||
again:
|
||||
|
||||
rcu_read_lock();
|
||||
const union bch_extent_entry *entry;
|
||||
struct extent_ptr_decoded p;
|
||||
u64 pick_latency;
|
||||
|
||||
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
||||
have_dirty_ptrs |= !p.ptr.cached;
|
||||
|
||||
/*
|
||||
* Unwritten extent: no need to actually read, treat it as a
|
||||
* hole and return 0s:
|
||||
*/
|
||||
if (p.ptr.unwritten) {
|
||||
ret = 0;
|
||||
break;
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Are we being asked to read from a specific device? */
|
||||
if (dev >= 0 && p.ptr.dev != dev)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If there are any dirty pointers it's an error if we can't
|
||||
* read:
|
||||
*/
|
||||
if (!ret && !p.ptr.cached)
|
||||
ret = -BCH_ERR_no_device_to_read_from;
|
||||
|
||||
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
|
||||
|
||||
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
|
||||
continue;
|
||||
|
||||
if (unlikely(failed) &&
|
||||
(f = bch2_dev_io_failures(failed, p.ptr.dev))) {
|
||||
have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
|
||||
struct bch_dev_io_failures *f =
|
||||
unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
|
||||
if (unlikely(f)) {
|
||||
p.crc_retry_nr = f->failed_csum_nr;
|
||||
p.has_ec &= ~f->failed_ec;
|
||||
|
||||
if (p.has_ec &&
|
||||
!f->failed_ec &&
|
||||
(f->failed_io || f->failed_csum_nr))
|
||||
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
|
||||
have_io_errors |= f->failed_io;
|
||||
have_io_errors |= f->failed_ec;
|
||||
}
|
||||
have_csum_errors |= !!f->failed_csum_nr;
|
||||
|
||||
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
|
||||
p.do_ec_reconstruct = true;
|
||||
else if (f->failed_io ||
|
||||
f->failed_csum_nr > csum_retry)
|
||||
f->failed_csum_nr > c->opts.checksum_err_retry_nr)
|
||||
continue;
|
||||
}
|
||||
|
||||
have_missing_devs |= ca && !bch2_dev_is_online(ca);
|
||||
|
||||
if (!ca || !bch2_dev_is_online(ca)) {
|
||||
if (p.has_ec)
|
||||
p.do_ec_reconstruct = true;
|
||||
else
|
||||
if (!p.has_ec)
|
||||
continue;
|
||||
p.do_ec_reconstruct = true;
|
||||
}
|
||||
|
||||
if (p.has_ec && bch2_force_reconstruct_read)
|
||||
if (bch2_force_reconstruct_read && p.has_ec)
|
||||
p.do_ec_reconstruct = true;
|
||||
|
||||
if (ret > 0 && !ptr_better(c, p, *pick))
|
||||
continue;
|
||||
u64 p_latency = dev_latency(ca);
|
||||
/*
|
||||
* Square the latencies, to bias more in favor of the faster
|
||||
* device - we never want to stop issuing reads to the slower
|
||||
* device altogether, so that we can update our latency numbers:
|
||||
*/
|
||||
p_latency *= p_latency;
|
||||
|
||||
*pick = p;
|
||||
ret = 1;
|
||||
if (!have_pick ||
|
||||
ptr_better(c,
|
||||
p, p_latency, ca,
|
||||
*pick, pick_latency)) {
|
||||
*pick = p;
|
||||
pick_latency = p_latency;
|
||||
have_pick = true;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
|
||||
have_csum_retries &&
|
||||
csum_retry < BCH_MAX_CSUM_RETRIES)) {
|
||||
csum_retry++;
|
||||
goto again;
|
||||
}
|
||||
if (have_pick)
|
||||
return 1;
|
||||
if (!have_dirty_ptrs)
|
||||
return 0;
|
||||
if (have_missing_devs)
|
||||
return -BCH_ERR_no_device_to_read_from;
|
||||
if (have_csum_errors)
|
||||
return -BCH_ERR_data_read_csum_err;
|
||||
if (have_io_errors)
|
||||
return -BCH_ERR_data_read_io_err;
|
||||
|
||||
return ret;
|
||||
WARN_ONCE(1, "unhandled error case in %s\n", __func__);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* KEY_TYPE_btree_ptr: */
|
||||
|
@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
|
||||
({ \
|
||||
__label__ out; \
|
||||
\
|
||||
(_ptr).has_ec = false; \
|
||||
(_ptr).do_ec_reconstruct = false; \
|
||||
(_ptr).has_ec = false; \
|
||||
(_ptr).do_ec_reconstruct = false; \
|
||||
(_ptr).crc_retry_nr = 0; \
|
||||
\
|
||||
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
|
||||
switch (__extent_entry_type(_entry)) { \
|
||||
|
@ -21,19 +21,18 @@ struct bch_extent_crc_unpacked {
|
||||
|
||||
struct extent_ptr_decoded {
|
||||
bool has_ec;
|
||||
unsigned do_ec_reconstruct;
|
||||
bool do_ec_reconstruct;
|
||||
u8 crc_retry_nr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bch_extent_ptr ptr;
|
||||
struct bch_extent_stripe_ptr ec;
|
||||
};
|
||||
|
||||
#define BCH_MAX_CSUM_RETRIES 3
|
||||
|
||||
struct bch_io_failures {
|
||||
u8 nr;
|
||||
struct bch_dev_io_failures {
|
||||
u8 dev;
|
||||
unsigned failed_csum_nr:4,
|
||||
unsigned failed_csum_nr:6,
|
||||
failed_io:1,
|
||||
failed_ec:1;
|
||||
} devs[BCH_REPLICAS_MAX + 1];
|
||||
|
@ -117,6 +117,9 @@ static int readpage_bio_extend(struct btree_trans *trans,
|
||||
|
||||
unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
|
||||
|
||||
/* ensure proper alignment */
|
||||
order = min(order, __ffs(folio_offset|BIT(31)));
|
||||
|
||||
folio = xa_load(&iter->mapping->i_pages, folio_offset);
|
||||
if (folio && !xa_is_value(folio))
|
||||
break;
|
||||
|
@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
|
||||
return c ?: ERR_PTR(-ENOENT);
|
||||
}
|
||||
|
||||
static int bch2_remount(struct super_block *sb, int *flags,
|
||||
struct bch_opts opts)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
int ret = 0;
|
||||
|
||||
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
|
||||
|
||||
if (opts.read_only != c->opts.read_only) {
|
||||
down_write(&c->state_lock);
|
||||
|
||||
if (opts.read_only) {
|
||||
bch2_fs_read_only(c);
|
||||
|
||||
sb->s_flags |= SB_RDONLY;
|
||||
} else {
|
||||
ret = bch2_fs_read_write(c);
|
||||
if (ret) {
|
||||
bch_err(c, "error going rw: %i", ret);
|
||||
up_write(&c->state_lock);
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
sb->s_flags &= ~SB_RDONLY;
|
||||
}
|
||||
|
||||
c->opts.read_only = opts.read_only;
|
||||
|
||||
up_write(&c->state_lock);
|
||||
}
|
||||
|
||||
if (opt_defined(opts, errors))
|
||||
c->opts.errors = opts.errors;
|
||||
err:
|
||||
return bch2_err_class(ret);
|
||||
}
|
||||
|
||||
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct bch_fs *c = root->d_sb->s_fs_info;
|
||||
@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
|
||||
{
|
||||
struct super_block *sb = fc->root->d_sb;
|
||||
struct bch2_opts_parse *opts = fc->fs_private;
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
int ret = 0;
|
||||
|
||||
return bch2_remount(sb, &fc->sb_flags, opts->opts);
|
||||
opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
|
||||
|
||||
if (opts->opts.read_only != c->opts.read_only) {
|
||||
down_write(&c->state_lock);
|
||||
|
||||
if (opts->opts.read_only) {
|
||||
bch2_fs_read_only(c);
|
||||
|
||||
sb->s_flags |= SB_RDONLY;
|
||||
} else {
|
||||
ret = bch2_fs_read_write(c);
|
||||
if (ret) {
|
||||
bch_err(c, "error going rw: %i", ret);
|
||||
up_write(&c->state_lock);
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
sb->s_flags &= ~SB_RDONLY;
|
||||
}
|
||||
|
||||
c->opts.read_only = opts->opts.read_only;
|
||||
|
||||
up_write(&c->state_lock);
|
||||
}
|
||||
|
||||
if (opt_defined(opts->opts, errors))
|
||||
c->opts.errors = opts->opts.errors;
|
||||
err:
|
||||
return bch2_err_class(ret);
|
||||
}
|
||||
|
||||
static const struct fs_context_operations bch2_context_ops = {
|
||||
|
@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
||||
uid, gid, mode, rdev, parent);
|
||||
}
|
||||
|
||||
static inline u32 bkey_generation(struct bkey_s_c k)
|
||||
{
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_inode:
|
||||
case KEY_TYPE_inode_v2:
|
||||
BUG();
|
||||
case KEY_TYPE_inode_generation:
|
||||
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static struct bkey_i_inode_alloc_cursor *
|
||||
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
|
||||
{
|
||||
@ -1198,6 +1185,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
|
||||
opts->_name##_from_inode = true; \
|
||||
} else { \
|
||||
opts->_name = c->opts._name; \
|
||||
opts->_name##_from_inode = false; \
|
||||
}
|
||||
BCH_INODE_OPTS()
|
||||
#undef x
|
||||
|
@ -25,8 +25,15 @@
|
||||
#include "subvolume.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
static unsigned bch2_read_corrupt_ratio;
|
||||
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
|
||||
MODULE_PARM_DESC(read_corrupt_ratio, "");
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
||||
|
||||
static bool bch2_target_congested(struct bch_fs *c, u16 target)
|
||||
@ -59,7 +66,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return bch2_rand_range(nr * CONGESTED_MAX) < total;
|
||||
return get_random_u32_below(nr * CONGESTED_MAX) < total;
|
||||
}
|
||||
|
||||
#else
|
||||
@ -97,14 +104,21 @@ static inline bool have_io_error(struct bch_io_failures *failed)
|
||||
return failed && failed->nr;
|
||||
}
|
||||
|
||||
static bool ptr_being_rewritten(struct bch_read_bio *orig,
|
||||
unsigned dev,
|
||||
unsigned flags)
|
||||
static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
|
||||
{
|
||||
if (!(flags & BCH_READ_data_update))
|
||||
EBUG_ON(rbio->split);
|
||||
|
||||
return rbio->data_update
|
||||
? container_of(rbio, struct data_update, rbio)
|
||||
: NULL;
|
||||
}
|
||||
|
||||
static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
|
||||
{
|
||||
struct data_update *u = rbio_data_update(orig);
|
||||
if (!u)
|
||||
return false;
|
||||
|
||||
struct data_update *u = container_of(orig, struct data_update, rbio);
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
|
||||
unsigned i = 0;
|
||||
bkey_for_each_ptr(ptrs, ptr) {
|
||||
@ -193,7 +207,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
|
||||
struct bpos pos,
|
||||
struct extent_ptr_decoded *pick,
|
||||
unsigned sectors,
|
||||
unsigned flags,
|
||||
struct bch_read_bio *orig,
|
||||
struct bch_io_failures *failed)
|
||||
{
|
||||
@ -214,7 +227,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
|
||||
unsigned ptr_bit = 1;
|
||||
bkey_for_each_ptr(ptrs, ptr) {
|
||||
if (bch2_dev_io_failures(failed, ptr->dev) &&
|
||||
!ptr_being_rewritten(orig, ptr->dev, flags))
|
||||
!ptr_being_rewritten(orig, ptr->dev))
|
||||
update_opts.rewrite_ptrs |= ptr_bit;
|
||||
ptr_bit <<= 1;
|
||||
}
|
||||
@ -308,7 +321,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
|
||||
k.k->type == KEY_TYPE_reflink_v
|
||||
? BTREE_ID_reflink
|
||||
: BTREE_ID_extents,
|
||||
k, pos, pick, sectors, flags, orig, failed);
|
||||
k, pos, pick, sectors, orig, failed);
|
||||
if (!promote)
|
||||
return NULL;
|
||||
|
||||
@ -336,7 +349,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (rbio->flags & BCH_READ_data_update)
|
||||
if (rbio->data_update)
|
||||
prt_str(out, "(internal move) ");
|
||||
|
||||
return 0;
|
||||
@ -416,83 +429,6 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
|
||||
bio_endio(&rbio->bio);
|
||||
}
|
||||
|
||||
static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
|
||||
struct bch_read_bio *rbio,
|
||||
struct btree_iter *iter)
|
||||
{
|
||||
if (rbio->flags & BCH_READ_data_update) {
|
||||
struct data_update *u = container_of(rbio, struct data_update, rbio);
|
||||
|
||||
return bch2_bkey_get_iter(trans, iter,
|
||||
u->btree_id, bkey_start_pos(&u->k.k->k), 0);
|
||||
} else {
|
||||
struct bpos pos = rbio->read_pos;
|
||||
int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
|
||||
if (ret)
|
||||
return bkey_s_c_err(ret);
|
||||
|
||||
return bch2_bkey_get_iter(trans, iter,
|
||||
BTREE_ID_extents, pos, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
|
||||
struct bch_read_bio *rbio,
|
||||
struct bch_io_failures *failed)
|
||||
{
|
||||
struct btree_iter iter = {};
|
||||
struct bkey_s_c k;
|
||||
int ret = lockrestart_do(trans,
|
||||
bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
|
||||
|
||||
if (!ret) {
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
|
||||
bkey_for_each_ptr(ptrs, ptr)
|
||||
if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
|
||||
bch2_mark_io_failure(failed, &rbio->pick,
|
||||
rbio->ret == -BCH_ERR_data_read_csum_err);
|
||||
}
|
||||
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
}
|
||||
|
||||
static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
|
||||
struct bkey_s_c k, struct bch_io_failures *failed)
|
||||
{
|
||||
u64 flags = bch2_bkey_extent_flags(k);
|
||||
if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
|
||||
return 0;
|
||||
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
|
||||
/*
|
||||
* Make sure we actually attempt to read and got checksum failures from
|
||||
* every replica
|
||||
*/
|
||||
|
||||
rcu_read_lock();
|
||||
bkey_for_each_ptr(ptrs, ptr) {
|
||||
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
|
||||
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
|
||||
continue;
|
||||
|
||||
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
|
||||
if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
|
||||
bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
|
||||
return PTR_ERR_OR_ZERO(new) ?:
|
||||
bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
|
||||
bch2_trans_commit(trans, NULL, NULL, 0);
|
||||
}
|
||||
|
||||
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
|
||||
struct bch_read_bio *rbio,
|
||||
struct bvec_iter bvec_iter,
|
||||
@ -530,9 +466,6 @@ err:
|
||||
goto retry;
|
||||
|
||||
if (ret) {
|
||||
if (ret == -BCH_ERR_no_device_to_read_from && failed)
|
||||
maybe_poison_extent(trans, &iter, k, failed);
|
||||
|
||||
rbio->bio.bi_status = BLK_STS_IOERR;
|
||||
rbio->ret = ret;
|
||||
}
|
||||
@ -560,7 +493,8 @@ static void bch2_rbio_retry(struct work_struct *work)
|
||||
bvec_iter_sectors(rbio->bvec_iter));
|
||||
|
||||
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
|
||||
mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
|
||||
bch2_mark_io_failure(&failed, &rbio->pick,
|
||||
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
|
||||
|
||||
if (!rbio->split) {
|
||||
rbio->bio.bi_status = 0;
|
||||
@ -577,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work)
|
||||
flags &= ~BCH_READ_last_fragment;
|
||||
flags |= BCH_READ_must_clone;
|
||||
|
||||
int ret = flags & BCH_READ_data_update
|
||||
int ret = rbio->data_update
|
||||
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
|
||||
: __bch2_read(trans, rbio, iter, inum, &failed, flags);
|
||||
|
||||
@ -591,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work)
|
||||
bch2_inum_offset_err_msg_trans(trans, &buf,
|
||||
(subvol_inum) { subvol, read_pos.inode },
|
||||
read_pos.offset << 9));
|
||||
if (rbio->flags & BCH_READ_data_update)
|
||||
if (rbio->data_update)
|
||||
prt_str(&buf, "(internal move) ");
|
||||
prt_str(&buf, "successful retry");
|
||||
|
||||
@ -647,7 +581,7 @@ static void bch2_read_io_err(struct work_struct *work)
|
||||
bch_err_ratelimited(c, "%s", buf.buf);
|
||||
|
||||
printbuf_exit(&buf);
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
|
||||
}
|
||||
|
||||
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
|
||||
@ -734,7 +668,7 @@ static void bch2_read_csum_err(struct work_struct *work)
|
||||
else
|
||||
bch_err_ratelimited(c, "%s", buf.buf);
|
||||
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
|
||||
printbuf_exit(&buf);
|
||||
}
|
||||
|
||||
@ -778,42 +712,6 @@ static void bch2_read_decrypt_err(struct work_struct *work)
|
||||
printbuf_exit(&buf);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
static unsigned bch2_read_corrupt_ratio;
|
||||
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
|
||||
MODULE_PARM_DESC(read_corrupt_ratio, "");
|
||||
|
||||
static void corrupt_bio(struct bio *bio)
|
||||
{
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
unsigned u64s = bv.bv_len / sizeof(u64);
|
||||
|
||||
if (offset < u64s) {
|
||||
u64 *segment = bvec_kmap_local(&bv);
|
||||
segment[offset] = get_random_u64();
|
||||
kunmap_local(segment);
|
||||
return;
|
||||
}
|
||||
offset -= u64s;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void maybe_corrupt_bio(struct bio *bio)
|
||||
{
|
||||
if (bch2_read_corrupt_ratio &&
|
||||
!get_random_u32_below(bch2_read_corrupt_ratio))
|
||||
corrupt_bio(bio);
|
||||
}
|
||||
#else
|
||||
static inline void maybe_corrupt_bio(struct bio *bio)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Inner part that may run in process context */
|
||||
static void __bch2_read_endio(struct work_struct *work)
|
||||
{
|
||||
@ -821,9 +719,10 @@ static void __bch2_read_endio(struct work_struct *work)
|
||||
container_of(work, struct bch_read_bio, work);
|
||||
struct bch_fs *c = rbio->c;
|
||||
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
|
||||
struct bio *src = &rbio->bio;
|
||||
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
|
||||
struct bvec_iter dst_iter = rbio->bvec_iter;
|
||||
struct bch_read_bio *parent = bch2_rbio_parent(rbio);
|
||||
struct bio *src = &rbio->bio;
|
||||
struct bio *dst = &parent->bio;
|
||||
struct bvec_iter dst_iter = rbio->bvec_iter;
|
||||
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
|
||||
struct nonce nonce = extent_nonce(rbio->version, crc);
|
||||
unsigned nofs_flags;
|
||||
@ -841,7 +740,7 @@ static void __bch2_read_endio(struct work_struct *work)
|
||||
src->bi_iter = rbio->bvec_iter;
|
||||
}
|
||||
|
||||
maybe_corrupt_bio(src);
|
||||
bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
|
||||
|
||||
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
|
||||
bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
|
||||
@ -853,7 +752,7 @@ static void __bch2_read_endio(struct work_struct *work)
|
||||
*/
|
||||
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
|
||||
rbio->flags |= BCH_READ_must_bounce;
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
|
||||
BLK_STS_IOERR);
|
||||
goto out;
|
||||
}
|
||||
@ -873,7 +772,7 @@ static void __bch2_read_endio(struct work_struct *work)
|
||||
if (unlikely(rbio->narrow_crcs))
|
||||
bch2_rbio_narrow_crcs(rbio);
|
||||
|
||||
if (likely(!(rbio->flags & BCH_READ_data_update))) {
|
||||
if (likely(!parent->data_update)) {
|
||||
/* Adjust crc to point to subset of data we want: */
|
||||
crc.offset += rbio->offset_into_extent;
|
||||
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
|
||||
@ -1043,6 +942,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
||||
struct bch_read_bio *rbio = NULL;
|
||||
bool bounce = false, read_full = false, narrow_crcs = false;
|
||||
struct bpos data_pos = bkey_start_pos(k.k);
|
||||
struct data_update *u = rbio_data_update(orig);
|
||||
int ret = 0;
|
||||
|
||||
if (bkey_extent_is_inline_data(k.k)) {
|
||||
@ -1106,16 +1006,7 @@ retry_pick:
|
||||
goto retry_pick;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlock the iterator while the btree node's lock is still in
|
||||
* cache, before doing the IO:
|
||||
*/
|
||||
if (!(flags & BCH_READ_in_retry))
|
||||
bch2_trans_unlock(trans);
|
||||
else
|
||||
bch2_trans_unlock_long(trans);
|
||||
|
||||
if (!(flags & BCH_READ_data_update)) {
|
||||
if (likely(!u)) {
|
||||
if (!(flags & BCH_READ_last_fragment) ||
|
||||
bio_flagged(&orig->bio, BIO_CHAIN))
|
||||
flags |= BCH_READ_must_clone;
|
||||
@ -1138,12 +1029,10 @@ retry_pick:
|
||||
bounce = true;
|
||||
}
|
||||
} else {
|
||||
read_full = true;
|
||||
/*
|
||||
* can happen if we retry, and the extent we were going to read
|
||||
* has been merged in the meantime:
|
||||
*/
|
||||
struct data_update *u = container_of(orig, struct data_update, rbio);
|
||||
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
|
||||
if (ca)
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
@ -1152,6 +1041,7 @@ retry_pick:
|
||||
}
|
||||
|
||||
iter.bi_size = pick.crc.compressed_size << 9;
|
||||
read_full = true;
|
||||
}
|
||||
|
||||
if (orig->opts.promote_target || have_io_error(failed))
|
||||
@ -1242,10 +1132,14 @@ retry_pick:
|
||||
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
|
||||
rbio->bio.bi_end_io = bch2_read_endio;
|
||||
|
||||
/* XXX: also nvme read recovery level */
|
||||
if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
|
||||
rbio->bio.bi_opf |= REQ_FUA;
|
||||
|
||||
if (rbio->bounce)
|
||||
trace_and_count(c, io_read_bounce, &rbio->bio);
|
||||
|
||||
if (!(flags & BCH_READ_data_update))
|
||||
if (!u)
|
||||
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
|
||||
else
|
||||
this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
|
||||
@ -1255,7 +1149,7 @@ retry_pick:
|
||||
* If it's being moved internally, we don't want to flag it as a cache
|
||||
* hit:
|
||||
*/
|
||||
if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update))
|
||||
if (ca && pick.ptr.cached && !u)
|
||||
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
|
||||
PTR_BUCKET_NR(ca, &pick.ptr), READ);
|
||||
|
||||
@ -1264,6 +1158,15 @@ retry_pick:
|
||||
trace_and_count(c, io_read_split, &orig->bio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlock the iterator while the btree node's lock is still in
|
||||
* cache, before doing the IO:
|
||||
*/
|
||||
if (!(flags & BCH_READ_in_retry))
|
||||
bch2_trans_unlock(trans);
|
||||
else
|
||||
bch2_trans_unlock_long(trans);
|
||||
|
||||
if (likely(!rbio->pick.do_ec_reconstruct)) {
|
||||
if (unlikely(!rbio->have_ioref)) {
|
||||
struct printbuf buf = PRINTBUF;
|
||||
@ -1275,7 +1178,7 @@ retry_pick:
|
||||
printbuf_exit(&buf);
|
||||
|
||||
bch2_rbio_error(rbio,
|
||||
-BCH_ERR_data_read_device_offline,
|
||||
-BCH_ERR_data_read_retry_device_offline,
|
||||
BLK_STS_IOERR);
|
||||
goto out;
|
||||
}
|
||||
@ -1302,7 +1205,7 @@ retry_pick:
|
||||
} else {
|
||||
/* Attempting reconstruct read: */
|
||||
if (bch2_ec_read_extent(trans, rbio, k)) {
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
|
||||
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
|
||||
BLK_STS_IOERR);
|
||||
goto out;
|
||||
}
|
||||
@ -1314,6 +1217,8 @@ out:
|
||||
if (likely(!(flags & BCH_READ_in_retry))) {
|
||||
return 0;
|
||||
} else {
|
||||
bch2_trans_unlock(trans);
|
||||
|
||||
int ret;
|
||||
|
||||
rbio->context = RBIO_CONTEXT_UNBOUND;
|
||||
@ -1324,7 +1229,7 @@ out:
|
||||
|
||||
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
|
||||
bch2_mark_io_failure(failed, &pick,
|
||||
ret == -BCH_ERR_data_read_csum_err);
|
||||
ret == -BCH_ERR_data_read_retry_csum_err);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -1341,11 +1246,11 @@ hole:
|
||||
this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
|
||||
bvec_iter_sectors(iter));
|
||||
/*
|
||||
* won't normally happen in the BCH_READ_data_update
|
||||
* (bch2_move_extent()) path, but if we retry and the extent we wanted
|
||||
* to read no longer exists we have to signal that:
|
||||
* won't normally happen in the data update (bch2_move_extent()) path,
|
||||
* but if we retry and the extent we wanted to read no longer exists we
|
||||
* have to signal that:
|
||||
*/
|
||||
if (flags & BCH_READ_data_update)
|
||||
if (u)
|
||||
orig->ret = -BCH_ERR_data_read_key_overwritten;
|
||||
|
||||
zero_fill_bio_iter(&orig->bio, iter);
|
||||
@ -1366,7 +1271,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
|
||||
BUG_ON(flags & BCH_READ_data_update);
|
||||
EBUG_ON(rbio->data_update);
|
||||
|
||||
bch2_bkey_buf_init(&sk);
|
||||
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
|
||||
@ -1393,23 +1298,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (unlikely(flags & BCH_READ_in_retry)) {
|
||||
struct data_update *u = flags & BCH_READ_data_update
|
||||
? container_of(rbio, struct data_update, rbio)
|
||||
: NULL;
|
||||
|
||||
if (u &&
|
||||
!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
|
||||
/* extent we wanted to read no longer exists: */
|
||||
ret = -BCH_ERR_data_read_key_overwritten;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (!bkey_deleted(&sk.k->k) &&
|
||||
!bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
|
||||
failed->nr = 0;
|
||||
}
|
||||
|
||||
s64 offset_into_extent = iter.pos.offset -
|
||||
bkey_start_offset(k.k);
|
||||
unsigned sectors = k.k->size - offset_into_extent;
|
||||
@ -1447,16 +1335,18 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
|
||||
swap(bvec_iter.bi_size, bytes);
|
||||
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
|
||||
err:
|
||||
if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
|
||||
flags |= BCH_READ_must_bounce;
|
||||
|
||||
if (ret &&
|
||||
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
|
||||
!bch2_err_matches(ret, BCH_ERR_data_read_retry))
|
||||
break;
|
||||
}
|
||||
|
||||
if (unlikely(ret)) {
|
||||
if (ret == -BCH_ERR_no_device_to_read_from && failed)
|
||||
maybe_poison_extent(trans, &iter, k, failed);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (ret) {
|
||||
struct printbuf buf = PRINTBUF;
|
||||
lockrestart_do(trans,
|
||||
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
|
||||
@ -1472,7 +1362,6 @@ err:
|
||||
bch2_rbio_done(rbio);
|
||||
}
|
||||
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
bch2_bkey_buf_exit(&sk, c);
|
||||
return ret;
|
||||
}
|
||||
|
@ -36,7 +36,8 @@ struct bch_read_bio {
|
||||
u16 flags;
|
||||
union {
|
||||
struct {
|
||||
u16 promote:1,
|
||||
u16 data_update:1,
|
||||
promote:1,
|
||||
bounce:1,
|
||||
split:1,
|
||||
have_ioref:1,
|
||||
@ -109,7 +110,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
|
||||
x(retry_if_stale) \
|
||||
x(may_promote) \
|
||||
x(user_mapped) \
|
||||
x(data_update) \
|
||||
x(last_fragment) \
|
||||
x(must_bounce) \
|
||||
x(must_clone) \
|
||||
@ -163,12 +163,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
|
||||
{
|
||||
struct bch_read_bio *rbio = to_rbio(bio);
|
||||
|
||||
rbio->c = orig->c;
|
||||
rbio->_state = 0;
|
||||
rbio->ret = 0;
|
||||
rbio->split = true;
|
||||
rbio->parent = orig;
|
||||
rbio->opts = orig->opts;
|
||||
rbio->c = orig->c;
|
||||
rbio->_state = 0;
|
||||
rbio->flags = 0;
|
||||
rbio->ret = 0;
|
||||
rbio->split = true;
|
||||
rbio->parent = orig;
|
||||
rbio->opts = orig->opts;
|
||||
return rbio;
|
||||
}
|
||||
|
||||
@ -182,7 +183,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
||||
rbio->start_time = local_clock();
|
||||
rbio->c = c;
|
||||
rbio->_state = 0;
|
||||
rbio->ret = 0;
|
||||
rbio->flags = 0;
|
||||
rbio->ret = 0;
|
||||
rbio->opts = opts;
|
||||
rbio->bio.bi_end_io = end_io;
|
||||
return rbio;
|
||||
|
@ -34,6 +34,12 @@
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
static unsigned bch2_write_corrupt_ratio;
|
||||
module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
|
||||
MODULE_PARM_DESC(write_corrupt_ratio, "");
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
||||
|
||||
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
|
||||
@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
||||
bounce = true;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
|
||||
if (!bounce && write_corrupt_ratio) {
|
||||
dst = bch2_write_bio_alloc(c, wp, src,
|
||||
&page_alloc_failed,
|
||||
ec_buf);
|
||||
bounce = true;
|
||||
}
|
||||
#endif
|
||||
saved_iter = dst->bi_iter;
|
||||
|
||||
do {
|
||||
@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
||||
|
||||
init_append_extent(op, wp, version, crc);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
if (write_corrupt_ratio) {
|
||||
swap(dst->bi_iter.bi_size, dst_len);
|
||||
bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
|
||||
swap(dst->bi_iter.bi_size, dst_len);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (dst != src)
|
||||
bio_advance(dst, dst_len);
|
||||
bio_advance(src, src_len);
|
||||
@ -1394,6 +1417,7 @@ retry:
|
||||
bio->bi_private = &op->cl;
|
||||
bio->bi_opf |= REQ_OP_WRITE;
|
||||
closure_get(&op->cl);
|
||||
|
||||
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
|
||||
op->insert_keys.top, true);
|
||||
|
||||
@ -1718,20 +1742,26 @@ static const char * const bch2_write_flags[] = {

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
	prt_str(out, "pos: ");
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_printf(out, "pos:\t");
	bch2_bpos_to_text(out, op->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_str(out, "started: ");
	prt_printf(out, "started:\t");
	bch2_pr_time_units(out, local_clock() - op->start_time);
	prt_newline(out);

	prt_str(out, "flags: ");
	prt_printf(out, "flags:\t");
	prt_bitflags(out, bch2_write_flags, op->flags);
	prt_newline(out);

	prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
	prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
	prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);

	prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));

	printbuf_indent_sub(out, 2);
}

@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
	kvfree(new_buf);
}

static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}

static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(w, struct journal_buf, io);

@ -101,13 +101,25 @@ static void move_free(struct moving_io *io)

static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct bch_fs *c = op->c;
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;
	if (op->error) {
		if (trace_io_move_write_fail_enabled()) {
			struct printbuf buf = PRINTBUF;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_dec(&io->write.ctxt->write_ios);
			bch2_write_op_to_text(&buf, op);
			prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
			trace_io_move_write_fail(c, buf.buf);
			printbuf_exit(&buf);
		}
		this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);

		ctxt->write_error = true;
	}

	atomic_sub(io->write_sectors, &ctxt->write_sectors);
	atomic_dec(&ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

@ -359,7 +371,6 @@ int bch2_move_extent(struct moving_context *ctxt,
			 bkey_start_pos(k.k),
			 iter->btree_id, k, 0,
			 NULL,
			 BCH_READ_data_update|
			 BCH_READ_last_fragment,
			 data_opts.scrub ? data_opts.read_dev : -1);
	return 0;

@ -580,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
		    k.k->type == KEY_TYPE_reflink_p &&
		    REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
			struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
			s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
			s64 offset_into_extent = 0;

			bch2_trans_iter_exit(trans, &reflink_iter);
			k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);

@ -599,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
			 * pointer - need to fixup iter->k
			 */
			extent_iter = &reflink_iter;
			offset_into_extent = 0;
		}

		if (!bkey_extent_is_direct_data(k.k))

@ -712,7 +724,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
	struct btree_iter iter = {}, bp_iter = {};
	struct bkey_buf sk;
	struct bkey_s_c k;
	unsigned sectors_moved = 0;
	struct bkey_buf last_flushed;
	int ret = 0;

@ -834,7 +845,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,

		if (ctxt->stats)
			atomic64_add(sectors, &ctxt->stats->sectors_seen);
		sectors_moved += sectors;
next:
		bch2_btree_iter_advance(&bp_iter);
	}

@ -1253,17 +1263,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
	prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
	prt_printf(out, "bytes seen: ");
	prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
	prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
	prt_printf(out, "bytes seen:\t");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
	prt_newline(out);

	prt_printf(out, "bytes moved: ");
	prt_printf(out, "bytes moved:\t");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
	prt_newline(out);

	prt_printf(out, "bytes raced: ");
	prt_printf(out, "bytes raced:\t");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
	prt_newline(out);

@ -1272,7 +1282,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	struct moving_io *io;
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	bch2_move_stats_to_text(out, ctxt->stats);
	printbuf_indent_add(out, 2);

@ -1292,6 +1303,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	struct moving_io *io;
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_data_update_inflight_to_text(out, &io->write);
	mutex_unlock(&ctxt->lock);

@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
	prt_printf(out, "Currently calculated wait:\t");
	prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
	prt_newline(out);

	rcu_read_lock();
	struct task_struct *t = rcu_dereference(c->copygc_thread);
	if (t)
		get_task_struct(t);
	rcu_read_unlock();

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}
}

static int bch2_copygc_thread(void *arg)

@ -186,6 +186,11 @@ enum fsck_err_opts {
	  OPT_STR(__bch2_csum_opts), \
	  BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
	  NULL, NULL) \
	x(checksum_err_retry_nr, u8, \
	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
	  OPT_UINT(0, 32), \
	  BCH_SB_CSUM_ERR_RETRY_NR, 3, \
	  NULL, NULL) \
	x(compression, u8, \
	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
	  OPT_FN(bch2_opt_compression), \

@ -26,9 +26,8 @@

/* bch_extent_rebalance: */

static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;

	bkey_extent_entry_for_each(ptrs, entry)

@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
	return NULL;
}

static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
	return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}

static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
						    struct bch_io_opts *opts,
						    struct bkey_s_c k,

@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,

u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
	if (!opts)
		return 0;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg)

void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);

	struct bch_fs_rebalance *r = &c->rebalance;

	/* print pending work */
	struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
	u64 v;
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);

	prt_printf(out, "pending work:\t");
	prt_human_readable_u64(out, v);
	prt_printf(out, "\n\n");

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_str(out, "io wait duration: ");
		prt_printf(out, "io wait duration:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
		prt_newline(out);

		prt_str(out, "io wait remaining: ");
		prt_printf(out, "io wait remaining:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
		prt_newline(out);

		prt_str(out, "duration waited: ");
		prt_printf(out, "duration waited:\t");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;

@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
		break;
	}
	prt_newline(out);

	rcu_read_lock();
	struct task_struct *t = rcu_dereference(c->rebalance.thread);
	if (t)
		get_task_struct(t);
	rcu_read_unlock();

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}

	printbuf_indent_sub(out, 2);
}

@ -899,7 +899,7 @@ use_clean:
	 * journal sequence numbers:
	 */
	if (!c->sb.clean)
		journal_seq += 8;
		journal_seq += JOURNAL_BUF_NR * 4;

	if (blacklist_seq != journal_seq) {
		ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",

@ -22,6 +22,7 @@ enum counters_flags {
	x(io_move_write, 36, TYPE_SECTORS) \
	x(io_move_finish, 37, TYPE_SECTORS) \
	x(io_move_fail, 38, TYPE_COUNTER) \
	x(io_move_write_fail, 82, TYPE_COUNTER) \
	x(io_move_start_fail, 39, TYPE_COUNTER) \
	x(bucket_invalidate, 3, TYPE_COUNTER) \
	x(bucket_discard, 4, TYPE_COUNTER) \

@ -12,7 +12,6 @@
#include "super.h"

#include <linux/crc32c.h>
#include <crypto/hash.h>
#include <crypto/sha2.h>

static inline enum bch_str_hash_type

@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
	};

	if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
		SHASH_DESC_ON_STACK(desc, c->sha256);
		u8 digest[SHA256_DIGEST_SIZE];

		desc->tfm = c->sha256;

		crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
				    sizeof(bi->bi_hash_seed), digest);
		sha256((const u8 *)&bi->bi_hash_seed,
		       sizeof(bi->bi_hash_seed), digest);
		memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
	}

@ -365,10 +365,9 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
	return 0;
}

static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
			    enum bch_validate_flags flags, struct printbuf *out)
int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
		     enum bch_validate_flags flags, struct printbuf *out)
{
	struct bch_sb *sb = disk_sb->sb;
	struct bch_sb_field_members_v1 *mi;
	enum bch_opt_id opt_id;
	int ret;

@ -377,15 +376,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
	if (ret)
		return ret;

	if (sb->features[1] ||
	    (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
		prt_printf(out, "Filesystem has incompatible features");
	u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
	unsigned incompat_bit = 0;
	if (incompat)
		incompat_bit = __ffs64(incompat);
	else if (sb->features[1])
		incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));

	if (incompat_bit) {
		prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
			   incompat_bit,
			   bch2_sb_features[BCH_FEATURE_NR - 1],
			   BCH_FEATURE_NR - 1);
		return -BCH_ERR_invalid_sb_features;
	}

	if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
	    BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
		prt_printf(out, "Filesystem has incompatible version");
		prt_str(out, "Filesystem has incompatible version ");
		bch2_version_to_text(out, le16_to_cpu(sb->version));
		prt_str(out, ", current version ");
		bch2_version_to_text(out, bcachefs_metadata_version_current);
		return -BCH_ERR_invalid_sb_features;
	}

@ -399,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
		return -BCH_ERR_invalid_sb_uuid;
	}

	if (!(flags & BCH_VALIDATE_write) &&
	    le64_to_cpu(sb->offset) != read_offset) {
		prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
			   le64_to_cpu(sb->offset), read_offset);
		return -BCH_ERR_invalid_sb_offset;
	}

	if (!sb->nr_devices ||
	    sb->nr_devices > BCH_SB_MEMBERS_MAX) {
		prt_printf(out, "Bad number of member devices %u (max %u)",

@ -457,6 +475,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
	if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
		SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);

	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
	    !BCH_SB_CSUM_ERR_RETRY_NR(sb))
		SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}

#ifdef __KERNEL__

@ -874,7 +896,7 @@ got_super:

	sb->have_layout = true;

	ret = bch2_sb_validate(sb, 0, &err);
	ret = bch2_sb_validate(sb->sb, offset, 0, &err);
	if (ret) {
		bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
				path, err.buf);

@ -1031,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
	darray_for_each(online_devices, ca) {
		printbuf_reset(&err);

		ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
		ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
		if (ret) {
			bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
			goto out;

@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);

int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);

int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);

@ -75,9 +75,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: crc64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");

@ -1838,7 +1835,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
		goto err_late;

	up_write(&c->state_lock);
	return 0;
out:
	printbuf_exit(&label);
	printbuf_exit(&errbuf);
	bch_err_fn(c, ret);
	return ret;

err_unlock:
	mutex_unlock(&c->sb_lock);

@ -1847,10 +1848,7 @@ err:
	if (ca)
		bch2_dev_free(ca);
	bch2_free_super(&sb);
	printbuf_exit(&label);
	printbuf_exit(&errbuf);
	bch_err_fn(c, ret);
	return ret;
	goto out;
err_late:
	up_write(&c->state_lock);
	ca = NULL;

@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
write_attribute(read_fua_test);

read_attribute(uuid);
read_attribute(minor);

@ -395,6 +396,71 @@ SHOW(bch2_fs)
	return 0;
}

static int read_fua_test(struct bch_fs *c)
{
	int ret = 0;
	unsigned bs = 4096;
	struct bio *bio;
	void *buf;

	struct bch_dev *ca = bch2_dev_get_ioref(c, 0, READ);
	if (!ca)
		return -EINVAL;

	bio = bio_kmalloc(1, GFP_KERNEL);
	if (!bio) {
		ret = -ENOMEM;
		goto err;
	}

	buf = kmalloc(bs, GFP_KERNEL);
	if (!buf)
		goto err;

	u64 start = ktime_get_ns();
	for (unsigned i = 0; i < 1000; i++) {
		bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
		bch2_bio_map(bio, buf, bs);
		ret = submit_bio_wait(bio);
		if (ret)
			goto err;
	}
	u64 ns_nofua = ktime_get_ns() - start;

	start = ktime_get_ns();
	for (unsigned i = 0; i < 1000; i++) {
		bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
		bch2_bio_map(bio, buf, bs);
		ret = submit_bio_wait(bio);
		if (ret)
			goto err;
	}
	u64 ns_fua = ktime_get_ns() - start;

	u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);

	start = ktime_get_ns();
	for (unsigned i = 0; i < 1000; i++) {
		bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
		bio->bi_iter.bi_sector = (get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
		bch2_bio_map(bio, buf, bs);
		ret = submit_bio_wait(bio);
		if (ret)
			goto err;
	}
	u64 ns_rand = ktime_get_ns() - start;

	pr_info("ns nofua %llu", ns_nofua);
	pr_info("ns fua %llu", ns_fua);
	pr_info("ns random %llu", ns_rand);
err:
	kfree(buf);
	kfree(bio);
	percpu_ref_put(&ca->io_ref);
	bch_err_fn(c, ret);
	return ret;
}

STORE(bch2_fs)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

@ -451,6 +517,9 @@ STORE(bch2_fs)
	if (attr == &sysfs_trigger_freelist_wakeup)
		closure_wake_up(&c->freelist_wait);

	if (attr == &sysfs_read_fua_test)
		read_fua_test(c);

#ifdef CONFIG_BCACHEFS_TESTS
	if (attr == &sysfs_perf_test) {
		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;

@ -580,6 +649,7 @@ struct attribute *bch2_fs_internal_files[] = {
	&sysfs_trigger_btree_key_cache_shrink,
	&sysfs_trigger_freelist_wakeup,
	&sysfs_trigger_btree_updates,
	&sysfs_read_fua_test,

	&sysfs_gc_gens_pos,

@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail,
	TP_ARGS(c, str)
);

DEFINE_EVENT(fs_str, io_move_write_fail,
	TP_PROTO(struct bch_fs *c, const char *str),
	TP_ARGS(c, str)
);

DEFINE_EVENT(fs_str, io_move_start_fail,
	TP_PROTO(struct bch_fs *c, const char *str),
	TP_ARGS(c, str)

@ -653,21 +653,6 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
	return 0;
}

size_t bch2_rand_range(size_t max)
{
	size_t rand;

	if (!max)
		return 0;

	do {
		rand = get_random_long();
		rand &= roundup_pow_of_two(max) - 1;
	} while (rand >= max);

	return rand;
}

void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
{
	struct bio_vec bv;

@ -698,6 +683,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
	}
}

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_corrupt_bio(struct bio *bio)
{
	struct bvec_iter iter;
	struct bio_vec bv;
	unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));

	bio_for_each_segment(bv, bio, iter) {
		unsigned u64s = bv.bv_len / sizeof(u64);

		if (offset < u64s) {
			u64 *segment = bvec_kmap_local(&bv);
			segment[offset] = get_random_u64();
			kunmap_local(segment);
			return;
		}
		offset -= u64s;
	}
}
#endif

#if 0
void eytzinger1_test(void)
{

@ -401,11 +401,21 @@ do { \
	_ret; \
})

size_t bch2_rand_range(size_t);

void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_corrupt_bio(struct bio *);

static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
{
	if (ratio && !get_random_u32_below(ratio))
		bch2_corrupt_bio(bio);
}
#else
#define bch2_maybe_corrupt_bio(...) do {} while (0)
#endif

static inline void memcpy_u64s_small(void *dst, const void *src,
				     unsigned u64s)
{