diff --git a/.bcachefs_revision b/.bcachefs_revision index 7d7555ff..e778bec6 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -46af7258b951a79a66511172ab8772ad2dfaa4e3 +4d28432bcc5f91caf053f64a1cde1a6286adf4a6 diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 8a46202b..b6183bd0 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -7,6 +7,7 @@ #define _CRYPTO_SHA_H #include <linux/types.h> +#include <sodium/crypto_hash_sha256.h> #define SHA1_DIGEST_SIZE 20 #define SHA1_BLOCK_SIZE 64 @@ -112,4 +113,9 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash); + +static inline void sha256(const u8 *data, unsigned int len, u8 *out) +{ + crypto_hash_sha256(out, data, len); +} #endif diff --git a/include/linux/math64.h b/include/linux/math64.h index 5eb6f064..13efcc08 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -82,4 +82,71 @@ static inline s64 div_s64(s64 dividend, s32 divisor) return div_s64_rem(dividend, divisor, &remainder); } +#ifndef mul_u32_u32 +/* + * Many a GCC version messes this up and generates a 64x64 mult :-( + */ +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + return (u64)a * b; +} +#endif + +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) + +#ifndef mul_u64_u64_shr +static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) +{ + return (u64)(((unsigned __int128)a * mul) >> shift); +} +#endif /* mul_u64_u64_shr */ + +#else + +#ifndef mul_u64_u64_shr +static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) +{ + union { + u64 ll; + struct { +#ifdef __BIG_ENDIAN + u32 high, low; +#else + u32 low, high; +#endif + } l; + } rl, rm, rn, rh, a0, b0; + u64 c; + + a0.ll = a; + b0.ll = b; + + rl.ll = mul_u32_u32(a0.l.low, b0.l.low); + rm.ll = mul_u32_u32(a0.l.low, b0.l.high); + rn.ll = mul_u32_u32(a0.l.high, b0.l.low); + 
rh.ll = mul_u32_u32(a0.l.high, b0.l.high); + + /* + * Each of these lines computes a 64-bit intermediate result into "c", + * starting at bits 32-95. The low 32-bits go into the result of the + * multiplication, the high 32-bits are carried into the next step. + */ + rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; + rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; + rh.l.high = (c >> 32) + rh.l.high; + + /* + * The 128-bit result of the multiplication is in rl.ll and rh.ll, + * shift it right and throw away the high part of the result. + */ + if (shift == 0) + return rl.ll; + if (shift < 64) + return (rl.ll >> shift) | (rh.ll << (64 - shift)); + return rh.ll >> (shift & 63); +} +#endif /* mul_u64_u64_shr */ + +#endif + #endif /* _LINUX_MATH64_H */ diff --git a/include/linux/random.h b/include/linux/random.h index 3203d13c..9b2bb59a 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,7 +9,9 @@ #include <unistd.h> #include <sys/syscall.h> #include <linux/bug.h> +#include <linux/kernel.h> #include <linux/log2.h> +#include <linux/math64.h> #ifdef SYS_getrandom static inline int getrandom(void *buf, size_t buflen, unsigned int flags) @@ -67,4 +69,19 @@ static inline u32 get_random_u32_below(u32 ceil) } } +static inline u64 get_random_u64_below(u64 ceil) +{ + if (ceil <= 1) + return 0; + if (ceil <= U32_MAX) + return get_random_u32_below(ceil); + + for (;;) { + u64 rand = get_random_u64(); + u64 mult = ceil * rand; + if (likely(mult >= -ceil % ceil)) + return mul_u64_u64_shr(ceil, rand, 64); + } +} + #endif /* _LINUX_RANDOM_H */ diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index ecad4a78..4dfcf3e6 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v2_unpack_error, + c, alloc_v3_unpack_error, "unpack error"); fsck_err: 
return ret; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index b432bb6e..0ea593e8 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -979,7 +979,6 @@ struct bch_fs { mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; - struct crypto_shash *sha256; struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 7a5b0d21..e96d8776 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +/* one free bit */ LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); +LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 1ec1f90e..54666027 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) btree_node_write_in_flight(b)); btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 6638bb1f..6abc9f17 100644 --- a/libbcachefs/btree_io.c +++ 
b/libbcachefs/btree_io.c @@ -2080,11 +2080,6 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; - unsigned commit_flags = - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw; u64 start_time = wbio->start_time; int ret = 0; @@ -2093,24 +2088,38 @@ static void btree_node_write_work(struct work_struct *work) wbio->wbio.used_mempool, wbio->data); - if (wbio->wbio.failed.nr) { - ret = bch2_trans_do(c, - bch2_btree_node_rewrite_key_get_iter(trans, b, - commit_flags)); - } else if (!wbio->wbio.first_btree_write) { + bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { + ret = -BCH_ERR_btree_node_write_all_failed; + goto err; + } + + if (wbio->wbio.first_btree_write) { + if (wbio->wbio.failed.nr) { + + } + } else { ret = bch2_trans_do(c, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - commit_flags, true)); + BCH_WATERMARK_interior_updates| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw, + !wbio->wbio.failed.nr)); + if (ret) + goto err; } - - if (ret) { - set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); - } - +out: bio_put(&wbio->wbio.bio); btree_node_write_done(c, b, start_time); + return; +err: + set_btree_node_noevict(b); + bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, + "writing btree node: %s", bch2_err_str(ret)); + goto out; } static void btree_node_write_endio(struct bio *bio) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index e32fce4f..7542c6f9 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -562,20 +562,6 @@ static inline struct bkey_s_c 
btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 8f22ef9a..47d8690f 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -126,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct btree_trans *, + enum btree_id, struct bkey_i *); + static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k) { + if (unlikely(!btree_type_uses_write_buffer(btree))) { + int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); + dump_stack(); + return ret; + } /* * Most updates skip the btree write buffer until journal replay is * finished because synchronization with journal replay relies on having diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index b56c4987..2c09d19d 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -264,6 +264,22 @@ out: BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } +int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "attempting to do write 
buffer update on non wb btree="); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, "\n"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EROFS; +} + static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -312,7 +328,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - BUG_ON(!btree_type_uses_write_buffer(k->btree)); + if (unlikely(!btree_type_uses_write_buffer(k->btree))) { + ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + goto err; + } for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 6aeec1c0..c5363256 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ? r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 23a38357..7f9e4c59 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c) return 0; } +#if 0 + +/* + * This seems to be duplicating code in cmd_remove_passphrase() in + * bcachefs-tools, but we might want to switch userspace to use this - and + * perhaps add an ioctl for calling this at runtime, so we can take the + * passphrase off of a mounted filesystem (which has come up). 
+ */ int bch2_disable_encryption(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; @@ -725,6 +733,10 @@ out: return ret; } +/* + * For enabling encryption on an existing filesystem: not hooked up yet, but it + * should be + */ int bch2_enable_encryption(struct bch_fs *c, bool keyed) { struct bch_encrypted_key key; @@ -781,6 +793,7 @@ err: memzero_explicit(&key, sizeof(key)); return ret; } +#endif void bch2_fs_encryption_exit(struct bch_fs *c) { @@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c) crypto_free_shash(c->poly1305); if (c->chacha20) crypto_free_sync_skcipher(c->chacha20); - if (c->sha256) - crypto_free_shash(c->sha256); } int bch2_fs_encryption_init(struct bch_fs *c) @@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c) struct bch_key key; int ret = 0; - c->sha256 = crypto_alloc_shash("sha256", 0, 0); - ret = PTR_ERR_OR_ZERO(c->sha256); - if (ret) { - c->sha256 = NULL; - bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); - goto out; - } - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 43b9d71f..4ac251c8 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); +#if 0 int bch2_disable_encryption(struct bch_fs *); int bch2_enable_encryption(struct bch_fs *, bool); +#endif void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 522574bc..08bb7f30 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -static bool can_allocate_without_blocking(struct bch_fs *c, - struct data_update 
*m) -{ - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return false; - - unsigned target = m->op.flags & BCH_WRITE_only_specified_devs - ? m->op.target - : 0; - struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - - darray_for_each(m->op.devs_have, i) - __clear_bit(*i, devs.d); - - rcu_read_lock(); - unsigned nr_replicas = 0, i; - for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu(c, i); - - struct bch_dev_usage usage; - bch2_dev_usage_read_fast(ca, &usage); - - if (!dev_buckets_free(ca, usage, m->op.watermark)) - continue; - - nr_replicas += ca->mi.durability; - if (nr_replicas >= m->op.nr_replicas) - break; - } - rcu_read_unlock(); - - return nr_replicas >= m->op.nr_replicas; -} - int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, struct bch_io_opts *io_opts) { @@ -700,22 +666,49 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, } rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.data_update = true; m->rbio.bio.bi_iter.bi_size = buf_bytes; m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); return 0; } -static bool can_write_extent(struct bch_fs *c, - struct bch_devs_list *devs_have, - unsigned target) +static int can_write_extent(struct bch_fs *c, struct data_update *m) { + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return -BCH_ERR_data_update_done_would_block; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? 
m->op.target + : 0; struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - darray_for_each(*devs_have, i) + darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); - return !bch2_is_zero(&devs, sizeof(devs)); + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu(c, i); + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + if (!nr_replicas) + return -BCH_ERR_data_update_done_no_rw_devs; + if (nr_replicas < m->op.nr_replicas) + return -BCH_ERR_insufficient_devices; + return 0; } int bch2_data_update_init(struct btree_trans *trans, @@ -799,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } - if (!can_write_extent(c, &m->op.devs_have, - m->op.flags & BCH_WRITE_only_specified_devs ? 
m->op.target : 0)) { - /* - * Check if we have rw devices not in devs_have: this can happen - * if we're trying to move data on a ro or failed device - * - * If we can't move it, we need to clear the rebalance_work bit, - * if applicable - * - * Also, copygc should skip ro/failed devices: - */ - return -BCH_ERR_data_update_done_no_rw_devs; - } - unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* @@ -852,11 +831,22 @@ int bch2_data_update_init(struct btree_trans *trans, goto out_bkey_buf_exit; } - if ((m->op.flags & BCH_WRITE_alloc_nowait) && - !can_allocate_without_blocking(c, m)) { - ret = -BCH_ERR_data_update_done_would_block; + /* + * Check if the allocation will succeed, to avoid getting an error later + * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless + * read: + * + * This guards against + * - BCH_WRITE_alloc_nowait allocations failing (promotes) + * - Destination target full + * - Device(s) in destination target offline + * - Insufficient durability available in destination target + * (i.e. 
trying to move a durability=2 replica to a target with a + * single durability=2 device) + */ + ret = can_write_extent(c, m); + if (ret) goto out_bkey_buf_exit; - } if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 865cc53a..c73ba73f 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } -static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) -{ - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->disk_label = s->disk_label; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); -} - int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -1320,6 +1307,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); + ret = s->err; goto err; } @@ -1328,6 +1316,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); + ret = -BCH_ERR_ec_block_read; goto err; } @@ -1353,6 +1342,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); + ret = -BCH_ERR_ec_block_write; goto err; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 531fe575..cb27de6f 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -231,6 +231,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, 
invalid_sb_offset) \ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ @@ -273,21 +274,25 @@ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ x(EIO, extent_poisened) \ - x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ x(EIO, device_offline) \ x(EIO, EIO_fault_injected) \ + x(EIO, ec_block_read) \ + x(EIO, ec_block_write) \ x(EIO, data_read) \ + x(BCH_ERR_data_read, no_device_to_read_from) \ + x(BCH_ERR_data_read, data_read_io_err) \ + x(BCH_ERR_data_read, data_read_csum_err) \ x(BCH_ERR_data_read, data_read_retry) \ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ - x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \ - x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \ - x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ x(BCH_ERR_data_read, data_read_decompress_err) \ x(BCH_ERR_data_read, data_read_decrypt_err) \ x(BCH_ERR_data_read, data_read_ptr_stale_race) \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index f62ee96b..1da754a8 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -28,6 +28,8 @@ #include "trace.h" #include "util.h" +#include <linux/random.h> + static const char * const bch2_extent_flags_strs[] = { #define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, BCH_EXTENT_FLAGS() @@ -94,38 +96,30 @@ static inline int dev_failed(struct bch_dev *ca) 
*/ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) + u64 p1_latency, + struct bch_dev *ca1, + const struct extent_ptr_decoded p2, + u64 p2_latency) { - if (likely(!p1.do_ec_reconstruct && - !p2.do_ec_reconstruct)) { - struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev); - struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - int failed_delta = dev_failed(ca1) - dev_failed(ca2); + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + if (unlikely(failed_delta)) + return failed_delta < 0; - if (failed_delta) - return failed_delta < 0; - - u64 l1 = dev_latency(ca1); - u64 l2 = dev_latency(ca2); - - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - l1 *= l1; - l2 *= l2; - - /* Pick at random, biased in favor of the faster device: */ - - return bch2_rand_range(l1 + l2) > l1; - } - - if (bch2_force_reconstruct_read) + if (unlikely(bch2_force_reconstruct_read)) return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - return p1.do_ec_reconstruct < p2.do_ec_reconstruct; + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; + + int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; + if (unlikely(crc_retry_delta)) + return crc_retry_delta < 0; + + /* Pick at random, biased in favor of the faster device: */ + + return get_random_u64_below(p1_latency + p2_latency) > p1_latency; } /* @@ -138,86 +132,105 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded *pick, int dev) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_dev_io_failures *f; - unsigned csum_retry = 0; - bool have_csum_retries = false; - int ret 
= 0; + bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; + bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; - if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned) + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) return -BCH_ERR_extent_poisened; -again: + rcu_read_lock(); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 pick_latency; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + have_dirty_ptrs |= !p.ptr.cached; + /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { - ret = 0; - break; + rcu_read_unlock(); + return 0; } /* Are we being asked to read from a specific device? */ if (dev >= 0 && p.ptr.dev != dev) continue; - /* - * If there are any dirty pointers it's an error if we can't - * read: - */ - if (!ret && !p.ptr.cached) - ret = -BCH_ERR_no_device_to_read_from; - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - if (unlikely(failed) && - (f = bch2_dev_io_failures(failed, p.ptr.dev))) { - have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES; + struct bch_dev_io_failures *f = + unlikely(failed) ? 
bch2_dev_io_failures(failed, p.ptr.dev) : NULL; + if (unlikely(f)) { + p.crc_retry_nr = f->failed_csum_nr; + p.has_ec &= ~f->failed_ec; - if (p.has_ec && - !f->failed_ec && - (f->failed_io || f->failed_csum_nr)) + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; + + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) p.do_ec_reconstruct = true; else if (f->failed_io || - f->failed_csum_nr > csum_retry) + f->failed_csum_nr > c->opts.checksum_err_retry_nr) continue; } + have_missing_devs |= ca && !bch2_dev_is_online(ca); + if (!ca || !bch2_dev_is_online(ca)) { - if (p.has_ec) - p.do_ec_reconstruct = true; - else + if (!p.has_ec) continue; + p.do_ec_reconstruct = true; } - if (p.has_ec && bch2_force_reconstruct_read) + if (bch2_force_reconstruct_read && p.has_ec) p.do_ec_reconstruct = true; - if (ret > 0 && !ptr_better(c, p, *pick)) - continue; + u64 p_latency = dev_latency(ca); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + p_latency *= p_latency; - *pick = p; - ret = 1; + if (!have_pick || + ptr_better(c, + p, p_latency, ca, + *pick, pick_latency)) { + *pick = p; + pick_latency = p_latency; + have_pick = true; + } } rcu_read_unlock(); - if (unlikely(ret == -BCH_ERR_no_device_to_read_from && - have_csum_retries && - csum_retry < BCH_MAX_CSUM_RETRIES)) { - csum_retry++; - goto again; - } + if (have_pick) + return 1; + if (!have_dirty_ptrs) + return 0; + if (have_missing_devs) + return -BCH_ERR_no_device_to_read_from; + if (have_csum_errors) + return -BCH_ERR_data_read_csum_err; + if (have_io_errors) + return -BCH_ERR_data_read_io_err; - return ret; + WARN_ONCE(1, "unhandled error case in %s\n", __func__); + return -EINVAL; } /* KEY_TYPE_btree_ptr: */ diff --git a/libbcachefs/extents.h 
b/libbcachefs/extents.h index b4058502..e78a39e7 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).has_ec = false; \ - (_ptr).do_ec_reconstruct = false; \ + (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ + (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index f8b8e598..e51529dc 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -21,19 +21,18 @@ struct bch_extent_crc_unpacked { struct extent_ptr_decoded { bool has_ec; - unsigned do_ec_reconstruct; + bool do_ec_reconstruct; + u8 crc_retry_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; }; -#define BCH_MAX_CSUM_RETRIES 3 - struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - unsigned failed_csum_nr:4, + unsigned failed_csum_nr:6, failed_io:1, failed_ec:1; } devs[BCH_REPLICAS_MAX + 1]; diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 881b3051..5ab1c73c 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -117,6 +117,9 @@ static int readpage_bio_extend(struct btree_trans *trans, unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); + /* ensure proper alignment */ + order = min(order, __ffs(folio_offset|BIT(31))); + folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 459ca825..17ac9c55 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = 
sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; - return bch2_remount(sb, &fc->sb_flags, opts->opts); + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if (opts->opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; + + up_write(&c->state_lock); + } + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 04ec0520..7aca010e 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid, gid, mode, rdev, parent); } -static inline u32 
bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { @@ -1198,6 +1185,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, opts->_name##_from_inode = true; \ } else { \ opts->_name = c->opts._name; \ + opts->_name##_from_inode = false; \ } BCH_INODE_OPTS() #undef x diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 652dbc58..4fb279f1 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -25,8 +25,15 @@ #include "subvolume.h" #include "trace.h" +#include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -59,7 +66,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) } rcu_read_unlock(); - return bch2_rand_range(nr * CONGESTED_MAX) < total; + return get_random_u32_below(nr * CONGESTED_MAX) < total; } #else @@ -97,14 +104,21 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } -static bool ptr_being_rewritten(struct bch_read_bio *orig, - unsigned dev, - unsigned flags) +static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) { - if (!(flags & BCH_READ_data_update)) + EBUG_ON(rbio->split); + + return rbio->data_update + ? 
container_of(rbio, struct data_update, rbio) + : NULL; +} + +static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) +{ + struct data_update *u = rbio_data_update(orig); + if (!u) return false; - struct data_update *u = container_of(orig, struct data_update, rbio); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -193,7 +207,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct bpos pos, struct extent_ptr_decoded *pick, unsigned sectors, - unsigned flags, struct bch_read_bio *orig, struct bch_io_failures *failed) { @@ -214,7 +227,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev) && - !ptr_being_rewritten(orig, ptr->dev, flags)) + !ptr_being_rewritten(orig, ptr->dev)) update_opts.rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } @@ -308,7 +321,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? 
BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, sectors, flags, orig, failed); + k, pos, pick, sectors, orig, failed); if (!promote) return NULL; @@ -336,7 +349,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o if (ret) return ret; - if (rbio->flags & BCH_READ_data_update) + if (rbio->data_update) prt_str(out, "(internal move) "); return 0; @@ -416,83 +429,6 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static struct bkey_s_c get_rbio_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct btree_iter *iter) -{ - if (rbio->flags & BCH_READ_data_update) { - struct data_update *u = container_of(rbio, struct data_update, rbio); - - return bch2_bkey_get_iter(trans, iter, - u->btree_id, bkey_start_pos(&u->k.k->k), 0); - } else { - struct bpos pos = rbio->read_pos; - int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot); - if (ret) - return bkey_s_c_err(ret); - - return bch2_bkey_get_iter(trans, iter, - BTREE_ID_extents, pos, 0); - } -} - -static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bch_io_failures *failed) -{ - struct btree_iter iter = {}; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = get_rbio_extent(trans, rbio, &iter))); - - if (!ret) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) - bch2_mark_io_failure(failed, &rbio->pick, - rbio->ret == -BCH_ERR_data_read_csum_err); - } - - bch2_trans_iter_exit(trans, &iter); -} - -static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, struct bch_io_failures *failed) -{ - u64 flags = bch2_bkey_extent_flags(k); - if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - /* - * Make sure we actually 
attempt to read and got checksum failures from - * every replica - */ - - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) - continue; - - struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev); - if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - - struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0, - bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); - return PTR_ERR_OR_ZERO(new) ?: - bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -} - static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -530,9 +466,6 @@ err: goto retry; if (ret) { - if (ret == -BCH_ERR_no_device_to_read_from && failed) - maybe_poison_extent(trans, &iter, k, failed); - rbio->bio.bi_status = BLK_STS_IOERR; rbio->ret = ret; } @@ -560,7 +493,8 @@ static void bch2_rbio_retry(struct work_struct *work) bvec_iter_sectors(rbio->bvec_iter)); if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) - mark_io_failure_if_current_extent_matches(trans, rbio, &failed); + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); if (!rbio->split) { rbio->bio.bi_status = 0; @@ -577,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; - int ret = flags & BCH_READ_data_update + int ret = rbio->data_update ? 
bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) : __bch2_read(trans, rbio, iter, inum, &failed, flags); @@ -591,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work) bch2_inum_offset_err_msg_trans(trans, &buf, (subvol_inum) { subvol, read_pos.inode }, read_pos.offset << 9)); - if (rbio->flags & BCH_READ_data_update) + if (rbio->data_update) prt_str(&buf, "(internal move) "); prt_str(&buf, "successful retry"); @@ -647,7 +581,7 @@ static void bch2_read_io_err(struct work_struct *work) bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -734,7 +668,7 @@ static void bch2_read_csum_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -778,42 +712,6 @@ static void bch2_read_decrypt_err(struct work_struct *work) printbuf_exit(&buf); } -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_read_corrupt_ratio; -module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(read_corrupt_ratio, ""); - -static void corrupt_bio(struct bio *bio) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); - - bio_for_each_segment(bv, bio, iter) { - unsigned u64s = bv.bv_len / sizeof(u64); - - if (offset < u64s) { - u64 *segment = bvec_kmap_local(&bv); - segment[offset] = get_random_u64(); - kunmap_local(segment); - return; - } - offset -= u64s; - } -} - -static inline void maybe_corrupt_bio(struct bio *bio) -{ - if (bch2_read_corrupt_ratio && - !get_random_u32_below(bch2_read_corrupt_ratio)) - corrupt_bio(bio); -} -#else -static inline void 
maybe_corrupt_bio(struct bio *bio) -{ -} -#endif - /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -821,9 +719,10 @@ static void __bch2_read_endio(struct work_struct *work) container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bio *src = &rbio->bio; + struct bio *dst = &parent->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; @@ -841,7 +740,7 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } - maybe_corrupt_bio(src); + bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; @@ -853,7 +752,7 @@ static void __bch2_read_endio(struct work_struct *work) */ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace, + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, BLK_STS_IOERR); goto out; } @@ -873,7 +772,7 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (likely(!(rbio->flags & BCH_READ_data_update))) { + if (likely(!parent->data_update)) { /* Adjust crc to point to subset of data we want: */ crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); @@ -1043,6 +942,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct 
bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); + struct data_update *u = rbio_data_update(orig); int ret = 0; if (bkey_extent_is_inline_data(k.k)) { @@ -1106,16 +1006,7 @@ retry_pick: goto retry_pick; } - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - if (!(flags & BCH_READ_in_retry)) - bch2_trans_unlock(trans); - else - bch2_trans_unlock_long(trans); - - if (!(flags & BCH_READ_data_update)) { + if (likely(!u)) { if (!(flags & BCH_READ_last_fragment) || bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_must_clone; @@ -1138,12 +1029,10 @@ retry_pick: bounce = true; } } else { - read_full = true; /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); @@ -1152,6 +1041,7 @@ retry_pick: } iter.bi_size = pick.crc.compressed_size << 9; + read_full = true; } if (orig->opts.promote_target || have_io_error(failed)) @@ -1242,10 +1132,14 @@ retry_pick: rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; + /* XXX: also nvme read recovery level */ + if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) + rbio->bio.bi_opf |= REQ_FUA; + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); - if (!(flags & BCH_READ_data_update)) + if (!u) this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); else this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); @@ -1255,7 +1149,7 @@ retry_pick: * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) + if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, 
PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1264,6 +1158,15 @@ retry_pick: trace_and_count(c, io_read_split, &orig->bio); } + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + if (!(flags & BCH_READ_in_retry)) + bch2_trans_unlock(trans); + else + bch2_trans_unlock_long(trans); + if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; @@ -1275,7 +1178,7 @@ retry_pick: printbuf_exit(&buf); bch2_rbio_error(rbio, - -BCH_ERR_data_read_device_offline, + -BCH_ERR_data_read_retry_device_offline, BLK_STS_IOERR); goto out; } @@ -1302,7 +1205,7 @@ retry_pick: } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err, + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, BLK_STS_IOERR); goto out; } @@ -1314,6 +1217,8 @@ out: if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { + bch2_trans_unlock(trans); + int ret; rbio->context = RBIO_CONTEXT_UNBOUND; @@ -1324,7 +1229,7 @@ out: if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(failed, &pick, - ret == -BCH_ERR_data_read_csum_err); + ret == -BCH_ERR_data_read_retry_csum_err); return ret; } @@ -1341,11 +1246,11 @@ hole: this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], bvec_iter_sectors(iter)); /* - * won't normally happen in the BCH_READ_data_update - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: + * won't normally happen in the data update (bch2_move_extent()) path, + * but if we retry and the extent we wanted to read no longer exists we + * have to signal that: */ - if (flags & BCH_READ_data_update) + if (u) orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); @@ -1366,7 +1271,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_s_c 
k; int ret; - BUG_ON(flags & BCH_READ_data_update); + EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, @@ -1393,23 +1298,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, if (ret) goto err; - if (unlikely(flags & BCH_READ_in_retry)) { - struct data_update *u = flags & BCH_READ_data_update - ? container_of(rbio, struct data_update, rbio) - : NULL; - - if (u && - !bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { - /* extent we wanted to read no longer exists: */ - ret = -BCH_ERR_data_read_key_overwritten; - goto err; - } - - if (!bkey_deleted(&sk.k->k) && - !bkey_and_val_eq(k, bkey_i_to_s_c(sk.k))) - failed->nr = 0; - } - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); unsigned sectors = k.k->size - offset_into_extent; @@ -1447,16 +1335,18 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: + if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) + flags |= BCH_READ_must_bounce; + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } - if (unlikely(ret)) { - if (ret == -BCH_ERR_no_device_to_read_from && failed) - maybe_poison_extent(trans, &iter, k, failed); + bch2_trans_iter_exit(trans, &iter); + if (ret) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, @@ -1472,7 +1362,6 @@ err: bch2_rbio_done(rbio); } - bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h index edcf50a4..cd219504 100644 --- a/libbcachefs/io_read.h +++ b/libbcachefs/io_read.h @@ -36,7 +36,8 @@ struct bch_read_bio { u16 flags; union { struct { - u16 promote:1, + u16 data_update:1, + promote:1, bounce:1, split:1, have_ioref:1, @@ -109,7 +110,6 @@ static inline int 
bch2_read_indirect_extent(struct btree_trans *trans, x(retry_if_stale) \ x(may_promote) \ x(user_mapped) \ - x(data_update) \ x(last_fragment) \ x(must_bounce) \ x(must_clone) \ @@ -163,12 +163,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, { struct bch_read_bio *rbio = to_rbio(bio); - rbio->c = orig->c; - rbio->_state = 0; - rbio->ret = 0; - rbio->split = true; - rbio->parent = orig; - rbio->opts = orig->opts; + rbio->c = orig->c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; return rbio; } @@ -182,7 +183,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; - rbio->ret = 0; + rbio->flags = 0; + rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index dbfcb28f..a2e6b305 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -34,6 +34,12 @@ #include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_write_corrupt_ratio; +module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(write_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, @@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bounce = true; } +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); + if (!bounce && write_corrupt_ratio) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } +#endif saved_iter = dst->bi_iter; do { @@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, init_append_extent(op, wp, version, crc); +#ifdef CONFIG_BCACHEFS_DEBUG + if 
(write_corrupt_ratio) { + swap(dst->bi_iter.bi_size, dst_len); + bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); + swap(dst->bi_iter.bi_size, dst_len); + } +#endif + if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); @@ -1394,6 +1417,7 @@ retry: bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); @@ -1718,20 +1742,26 @@ static const char * const bch2_write_flags[] = { void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { - prt_str(out, "pos: "); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "started: "); + prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); - prt_str(out, "flags: "); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 331c9d76..cf2700b0 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a3096e2a..55e17c2d 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ 
-101,13 +101,25 @@ static void move_free(struct moving_io *io) static void move_write_done(struct bch_write_op *op) { struct moving_io *io = container_of(op, struct moving_io, write.op); + struct bch_fs *c = op->c; struct moving_context *ctxt = io->write.ctxt; - if (io->write.op.error) - ctxt->write_error = true; + if (op->error) { + if (trace_io_move_write_fail_enabled()) { + struct printbuf buf = PRINTBUF; - atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_dec(&io->write.ctxt->write_ios); + bch2_write_op_to_text(&buf, op); + prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); + trace_io_move_write_fail(c, buf.buf); + printbuf_exit(&buf); + } + this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); + + ctxt->write_error = true; + } + + atomic_sub(io->write_sectors, &ctxt->write_sectors); + atomic_dec(&ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } @@ -359,7 +371,6 @@ int bch2_move_extent(struct moving_context *ctxt, bkey_start_pos(k.k), iter->btree_id, k, 0, NULL, - BCH_READ_data_update| BCH_READ_last_fragment, data_opts.scrub ? 
data_opts.read_dev : -1); return 0; @@ -580,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + s64 offset_into_extent = 0; bch2_trans_iter_exit(trans, &reflink_iter); k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); @@ -599,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, * pointer - need to fixup iter->k */ extent_iter = &reflink_iter; + offset_into_extent = 0; } if (!bkey_extent_is_direct_data(k.k)) @@ -712,7 +724,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; @@ -834,7 +845,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; next: bch2_btree_iter_advance(&bp_iter); } @@ -1253,17 +1263,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, "bytes raced: "); + 
prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1272,7 +1282,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1292,6 +1303,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index fa19fc44..5126c870 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index afb89d31..baa9c11a 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -186,6 +186,11 @@ enum fsck_err_opts { OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ + x(checksum_err_retry_nr, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, 32), \ + BCH_SB_CSUM_ERR_RETRY_NR, 3, \ + NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ diff --git a/libbcachefs/rebalance.c 
b/libbcachefs/rebalance.c index 58f6d97e..29a56938 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; @@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, }; + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); 
@@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 71c786cd..a6e26733 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -899,7 +899,7 @@ use_clean: * journal sequence numbers: */ if (!c->sb.clean) - journal_seq += 8; + journal_seq += JOURNAL_BUF_NR * 4; if (blacklist_seq != journal_seq) { ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index c82a8910..fa27ec59 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -22,6 +22,7 @@ enum counters_flags { x(io_move_write, 36, TYPE_SECTORS) \ x(io_move_finish, 37, TYPE_SECTORS) \ x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ diff --git 
a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index f645a454..575ad1e0 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -12,7 +12,6 @@ #include "super.h" #include <linux/crc32c.h> -#include <crypto/hash.h> #include <crypto/sha2.h> static inline enum bch_str_hash_type @@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; - desc->tfm = c->sha256; - - crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); + sha256((const u8 *)&bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); } diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index ee32d043..f2e44282 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -365,10 +365,9 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - enum bch_validate_flags flags, struct printbuf *out) +int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; int ret; @@ -377,15 +376,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (ret) return ret; - if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - prt_printf(out, "Filesystem has incompatible features"); + u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); + unsigned incompat_bit = 0; + if (incompat) + incompat_bit = __ffs64(incompat); + else if (sb->features[1]) + incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); + + if (incompat_bit) { + prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", + incompat_bit, + 
bch2_sb_features[BCH_FEATURE_NR - 1], + BCH_FEATURE_NR - 1); return -BCH_ERR_invalid_sb_features; } if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_printf(out, "Filesystem has incompatible version"); + prt_str(out, "Filesystem has incompatible version "); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_str(out, ", current version "); + bch2_version_to_text(out, bcachefs_metadata_version_current); return -BCH_ERR_invalid_sb_features; } @@ -399,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_uuid; } + if (!(flags & BCH_VALIDATE_write) && + le64_to_cpu(sb->offset) != read_offset) { + prt_printf(out, "Bad sb offset (got %llu, read from %llu)", + le64_to_cpu(sb->offset), read_offset); + return -BCH_ERR_invalid_sb_offset; + } + if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", @@ -457,6 +475,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && + !BCH_SB_CSUM_ERR_RETRY_NR(sb)) + SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); } #ifdef __KERNEL__ @@ -874,7 +896,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, 0, &err); + ret = bch2_sb_validate(sb->sb, offset, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -1031,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; diff --git 
a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 167dd98f..78f708a6 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); +int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index cffad3b6..8e928b3d 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -75,9 +75,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: crc64"); -MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); @@ -1838,7 +1835,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_late; up_write(&c->state_lock); - return 0; +out: + printbuf_exit(&label); + printbuf_exit(&errbuf); + bch_err_fn(c, ret); + return ret; err_unlock: mutex_unlock(&c->sb_lock); @@ -1847,10 +1848,7 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - printbuf_exit(&label); - printbuf_exit(&errbuf); - bch_err_fn(c, ret); - return ret; + goto out; err_late: up_write(&c->state_lock); ca = NULL; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 2ed3f755..5b8463ae 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_btree_updates); read_attribute(gc_gens_pos); +write_attribute(read_fua_test); read_attribute(uuid); read_attribute(minor); @@ -395,6 +396,71 @@ 
SHOW(bch2_fs) return 0; } +static int read_fua_test(struct bch_fs *c) +{ + int ret = 0; + unsigned bs = 4096; + struct bio *bio; + void *buf = NULL; + + struct bch_dev *ca = bch2_dev_get_ioref(c, 0, READ); + if (!ca) + return -EINVAL; + + bio = bio_kmalloc(1, GFP_KERNEL); + if (!bio) { + ret = -ENOMEM; + goto err; + } + + buf = kmalloc(bs, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto err; + } + + u64 start = ktime_get_ns(); + for (unsigned i = 0; i < 1000; i++) { + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ); + bch2_bio_map(bio, buf, bs); + ret = submit_bio_wait(bio); + if (ret) + goto err; + } + u64 ns_nofua = ktime_get_ns() - start; + + start = ktime_get_ns(); + for (unsigned i = 0; i < 1000; i++) { + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); + bch2_bio_map(bio, buf, bs); + ret = submit_bio_wait(bio); + if (ret) + goto err; + } + u64 ns_fua = ktime_get_ns() - start; + + u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); + + start = ktime_get_ns(); + for (unsigned i = 0; i < 1000; i++) { + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ); + bio->bi_iter.bi_sector = (get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; + bch2_bio_map(bio, buf, bs); + ret = submit_bio_wait(bio); + if (ret) + goto err; + } + u64 ns_rand = ktime_get_ns() - start; + + pr_info("ns nofua %llu", ns_nofua); + pr_info("ns fua %llu", ns_fua); + pr_info("ns random %llu", ns_rand); +err: + kfree(buf); + kfree(bio); + percpu_ref_put(&ca->io_ref); + bch_err_fn(c, ret); + return ret; +} + STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -451,6 +517,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_freelist_wakeup) closure_wake_up(&c->freelist_wait); + if (attr == &sysfs_read_fua_test) + read_fua_test(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -580,6 +649,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_btree_key_cache_shrink, 
&sysfs_trigger_freelist_wakeup, &sysfs_trigger_btree_updates, + &sysfs_read_fua_test, &sysfs_gc_gens_pos, diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index c8669a6b..519d00d6 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, io_move_write_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + DEFINE_EVENT(fs_str, io_move_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 50a90e48..bf555ae7 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -653,21 +653,6 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) return 0; } -size_t bch2_rand_range(size_t max) -{ - size_t rand; - - if (!max) - return 0; - - do { - rand = get_random_long(); - rand &= roundup_pow_of_two(max) - 1; - } while (rand >= max); - - return rand; -} - void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) { struct bio_vec bv; @@ -698,6 +683,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} +#endif + #if 0 void eytzinger1_test(void) { diff --git a/libbcachefs/util.h b/libbcachefs/util.h index e7c3541b..f0e360eb 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -401,11 +401,21 @@ do { \ _ret; \ }) -size_t bch2_rand_range(size_t); - void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct 
bvec_iter); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *); + +static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) +{ + if (ratio && !get_random_u32_below(ratio)) + bch2_corrupt_bio(bio); +} +#else +#define bch2_maybe_corrupt_bio(...) do {} while (0) +#endif + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) {