diff --git a/.bcachefs_revision b/.bcachefs_revision index e02e3d5c..b81d691d 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -dbe591cee299957e282eb7857edea35050b1d8b5 +e2e7dcddb3660e90a972473bb10de570964754d7 diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c index 2d900f1e..d8714888 100644 --- a/c_src/cmd_format.c +++ b/c_src/cmd_format.c @@ -207,9 +207,8 @@ int cmd_format(int argc, char *argv[]) force = true; break; case O_fs_size: - if (bch2_strtoull_h(optarg, &dev_opts.opts.fs_size)) + if (bch2_strtoull_h(optarg, &dev_opts.fs_size)) die("invalid filesystem size"); - dev_opts.opts.fs_size_defined = true; unconsumed_dev_option = true; break; case O_superblock_size: @@ -233,8 +232,7 @@ int cmd_format(int argc, char *argv[]) darray_push(&device_paths, optarg); dev_opts.path = optarg; darray_push(&devices, dev_opts); - dev_opts.opts.fs_size = 0; - dev_opts.opts.fs_size_defined = 0; + dev_opts.fs_size = 0; unconsumed_dev_option = false; break; case O_quiet: diff --git a/c_src/cmd_fsck.c b/c_src/cmd_fsck.c index 3a33ca40..859ec731 100644 --- a/c_src/cmd_fsck.c +++ b/c_src/cmd_fsck.c @@ -326,7 +326,7 @@ kernel_fsck_err: } else { userland_fsck: printf("Running userspace offline fsck\n"); - ret = bch2_parse_mount_opts(NULL, &opts, &parse_later, opts_str.buf); + ret = bch2_parse_mount_opts(NULL, &opts, &parse_later, opts_str.buf, false); if (ret) return ret; diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c index a0328ca8..8155a2b3 100644 --- a/c_src/cmd_migrate.c +++ b/c_src/cmd_migrate.c @@ -228,9 +228,10 @@ static int migrate_fs(const char *fs_path, printf("Creating new filesystem on %s in space reserved at %s\n", dev->path, file_path); - dev->opts.fs_size = get_size(dev->bdev->bd_fd); - dev->opts.bucket_size = bch2_pick_bucket_size(fs_opts, devs); - dev->nbuckets = dev->opts.fs_size / dev->opts.bucket_size; + dev->fs_size = get_size(dev->bdev->bd_fd); + opt_set(dev->opts, bucket_size, bch2_pick_bucket_size(fs_opts, devs)); + + dev->nbuckets = dev->fs_size / dev->opts.bucket_size; bch2_check_bucket_size(fs_opts, dev); diff --git a/c_src/libbcachefs.c b/c_src/libbcachefs.c index 081a8176..0d19b411 100644 --- a/c_src/libbcachefs.c +++ b/c_src/libbcachefs.c @@ -78,13 +78,13 @@ u64 bch2_pick_bucket_size(struct bch_opts opts, dev_opts_list devs) u64 min_dev_size = BCH_MIN_NR_NBUCKETS * bucket_size; darray_for_each(devs, i) - if (i->opts.fs_size < min_dev_size) + if (i->fs_size < min_dev_size) die("cannot format %s, too small (%llu bytes, min %llu)", - i->path, i->opts.fs_size, min_dev_size); + i->path, i->fs_size, min_dev_size); u64 total_fs_size = 0; darray_for_each(devs, i) - total_fs_size += i->opts.fs_size; + total_fs_size += i->fs_size; struct sysinfo info; si_meminfo(&info); @@ -181,8 +181,8 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, /* get device size, if it wasn't specified: */ darray_for_each(devs, i) - if (!opt_defined(i->opts, fs_size)) - opt_set(i->opts, fs_size, get_size(i->bdev->bd_fd)); + if (!i->fs_size) + i->fs_size = get_size(i->bdev->bd_fd); /* calculate bucket sizes: */ u64 fs_bucket_size = bch2_pick_bucket_size(fs_opts, devs); @@ -190,10 +190,10 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, darray_for_each(devs, i) if (!opt_defined(i->opts, bucket_size)) opt_set(i->opts, bucket_size, - min(fs_bucket_size, dev_max_bucket_size(i->opts.fs_size))); + min(fs_bucket_size, dev_max_bucket_size(i->fs_size))); darray_for_each(devs, i) { - i->nbuckets = i->opts.fs_size / i->opts.bucket_size; + i->nbuckets = i->fs_size / i->opts.bucket_size; bch2_check_bucket_size(fs_opts, i); } @@ -292,7 +292,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, bch2_sb_members_cpy_v2_v1(&sb); darray_for_each(devs, i) { - u64 size_sectors = i->opts.fs_size >> 9; + u64 size_sectors = i->fs_size >> 9; sb.sb->dev_idx = i - devs.data; diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h index 141a6e8f..619bbbd5 100644 --- a/c_src/libbcachefs.h +++ b/c_src/libbcachefs.h @@ -66,6 +66,7 @@ struct dev_opts { u64 sb_end; u64 nbuckets; + u64 fs_size; const char *label; /* make this a bch_opt */ diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c index 63aa0937..72ea11b8 100644 --- a/c_src/posix_to_bcachefs.c +++ b/c_src/posix_to_bcachefs.c @@ -6,8 +6,8 @@ #include "posix_to_bcachefs.h" #include "libbcachefs/alloc_foreground.h" #include "libbcachefs/buckets.h" -#include "libbcachefs/fs-common.h" #include "libbcachefs/io_write.h" +#include "libbcachefs/namei.h" #include "libbcachefs/str_hash.h" #include "libbcachefs/xattr.h" diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 3ecc3dd1..451c323d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -67,6 +67,7 @@ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #define fallthrough __attribute__((__fallthrough__)) #define __noreturn __attribute__((__noreturn__)) +#define __no_kmsan_checks #ifndef __counted_by #define __counted_by(nr) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2e2406dc..1e0615fe 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -12,6 +12,7 @@ #include <linux/byteorder.h> #include <linux/compiler.h> #include <linux/dcache.h> +#include <linux/kmsan-checks.h> #include <linux/math.h> #include <linux/minmax.h> diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h new file mode 100644 index 00000000..e1082dc4 --- /dev/null +++ b/include/linux/kmsan-checks.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN checks to be used for one-off annotations in subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#ifndef _LINUX_KMSAN_CHECKS_H +#define _LINUX_KMSAN_CHECKS_H + +#include <linux/types.h> + +#ifdef CONFIG_KMSAN + +/** + * kmsan_poison_memory() - Mark the memory range as uninitialized. + * @address: address to start with. + * @size: size of buffer to poison. + * @flags: GFP flags for allocations done by this function. + * + * Until other data is written to this range, KMSAN will treat it as + * uninitialized. Error reports for this memory will reference the call site of + * kmsan_poison_memory() as origin. + */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags); + +/** + * kmsan_unpoison_memory() - Mark the memory range as initialized. + * @address: address to start with. + * @size: size of buffer to unpoison. + * + * Until other data is written to this range, KMSAN will treat it as + * initialized. + */ +void kmsan_unpoison_memory(const void *address, size_t size); + +/** + * kmsan_check_memory() - Check the memory range for being initialized. + * @address: address to start with. + * @size: size of buffer to check. + * + * If any piece of the given range is marked as uninitialized, KMSAN will report + * an error. + */ +void kmsan_check_memory(const void *address, size_t size); + +/** + * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace. + * @to: destination address in the userspace. + * @from: source address in the kernel. + * @to_copy: number of bytes to copy. + * @left: number of bytes not copied. + * + * If this is a real userspace data transfer, KMSAN checks the bytes that were + * actually copied to ensure there was no information leak. If @to belongs to + * the kernel space (which is possible for compat syscalls), KMSAN just copies + * the metadata. + */ +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left); + +/** + * kmsan_memmove() - Notify KMSAN about a data copy within kernel. + * @to: destination address in the kernel. + * @from: source address in the kernel. + * @size: number of bytes to copy. + * + * Invoked after non-instrumented version (e.g. implemented using assembly + * code) of memmove()/memcpy() is called, in order to copy KMSAN's metadata. + */ +void kmsan_memmove(void *to, const void *from, size_t to_copy); + +#else + +static inline void kmsan_poison_memory(const void *address, size_t size, + gfp_t flags) +{ +} +static inline void kmsan_unpoison_memory(const void *address, size_t size) +{ +} +static inline void kmsan_check_memory(const void *address, size_t size) +{ +} +static inline void kmsan_copy_to_user(void __user *to, const void *from, + size_t to_copy, size_t left) +{ +} + +static inline void kmsan_memmove(void *to, const void *from, size_t to_copy) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 54e0cc37..5fb396be 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s s64 delta_sectors, s64 delta_fragmented, unsigned flags) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = data_type, - }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + d, dev_data_type, + .dev = ca->dev_idx, + .data_type = data_type); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, @@ -837,7 +835,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -EIO; + return -BCH_ERR_trigger_alloc; struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -1031,7 +1029,7 @@ fsck_err: invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -EIO; + ret = -BCH_ERR_trigger_alloc; goto err; } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 1759c15a..0cac6534 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) void bch2_open_bucket_write_error(struct bch_fs *c, struct open_buckets *obs, - unsigned dev) + unsigned dev, int err) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, obs, ob, i) if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob); + bch2_ec_bucket_cancel(c, ob, err); } static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) @@ -631,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index baf5dc16..69ec6a01 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -82,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, } void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned); + struct open_buckets *, unsigned, int); void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index c9dfc365..20c497f0 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -50,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); + bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, @@ -782,7 +784,7 @@ enum alloc_sector_counter { ALLOC_SECTORS_NR }; -static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: @@ -791,9 +793,10 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: + case BCH_DATA_parity: return ALLOC_stripe; default: - BUG(); + return -1; } } @@ -844,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (bp.v->bucket_gen != a->gen) continue; - sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); + if (alloc_counter < 0) + continue; + + sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 054e2d5e..08263290 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 6abc9f17..2ba33ffc 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -2117,8 +2117,14 @@ out: return; err: set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); + + if (!bch2_err_matches(ret, EROFS)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); + bch2_btree_pos_to_text(&buf, c, b); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + } goto out; } @@ -2135,10 +2141,14 @@ static void btree_node_write_endio(struct bio *bio) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, wbio->submit_time, !bio->bi_status); - if (ca && bio->bi_status) - bch_err_dev_ratelimited(ca, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)); + if (ca && bio->bi_status) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "btree write error: %s\n ", + bch2_blk_status_to_str(bio->bi_status)); + bch2_btree_pos_to_text(&buf, c, b); + bch_err_dev_ratelimited(ca, "%s", buf.buf); + printbuf_exit(&buf); + } if (bio->bi_status) { unsigned long flags; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index b96157f3..8823eec6 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + +__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index caef65ad..94eb2b73 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } @@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index d50dc31d..7d7e52dd 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index b3e346b5..bd2eb42e 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -512,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 47d8690f..d2e1c043 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index d3e0cf01..67f1e320 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index bb7742cf..e56ef623 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -724,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans, .replicas.nr_required = 1, }; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -760,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -775,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -789,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -876,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -893,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -914,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; + sectors[0] = -sectors[0]; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; - - return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 7f9e4c59..37266890 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_recompute_checksum; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 31467f77..85fc9034 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc.uncompressed_size << 9; void *workspace; - int ret; + int ret = 0, ret2; enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; @@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, else ret = -BCH_ERR_compression_workspace_not_initialized; if (ret) - goto out; + goto err; } src_data = bio_map_or_bounce(c, src, READ); @@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch (crc.compression_type) { case BCH_COMPRESSION_TYPE_lz4_old: case BCH_COMPRESSION_TYPE_lz4: - ret = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret != dst_len) - goto err; + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_lz4; break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); + ret2 = zlib_inflate(&strm, Z_FINISH); mempool_free(workspace, workspace_pool); - if (ret != Z_STREAM_END) - goto err; + if (ret2 != Z_STREAM_END) + ret = -BCH_ERR_decompress_gzip; break; } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t real_src_len = le32_to_cpup(src_data.b); - if (real_src_len > src_len - 4) + if (real_src_len > src_len - 4) { + ret = -BCH_ERR_decompress_zstd_src_len_bad; goto err; + } workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = zstd_decompress_dctx(ctx, + ret2 = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); mempool_free(workspace, workspace_pool); - if (ret != dst_len) - goto err; + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_zstd; break; } default: BUG(); } - ret = 0; +err: fsck_err: -out: bio_unmap_or_unbounce(c, src_data); return ret; -err: - ret = -EIO; - goto out; } int bch2_bio_uncompress_inplace(struct bch_write_op *op, @@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "extent too big to decompress"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EIO; + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); + return -BCH_ERR_decompress_exceeded_max_encoded_extent; } data = __bounce_alloc(c, dst_len, WRITE); - if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "decompression error"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = -EIO; + ret = __bio_uncompress(c, bio, data.b, *crc); + + if (c->opts.no_data_io) + ret = 0; + + if (ret) { + bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); goto err; } @@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -EIO; + return -BCH_ERR_decompress_exceeded_max_encoded_extent; dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 08bb7f30..0ec273da 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -354,7 +354,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); - ret = -EIO; + ret = -BCH_ERR_invalid_bkey; goto out; } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index f4c283d1..d7f9f793 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -729,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) return ret < 0 ? ret : 0; } + +/* fsck */ + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; + } + ret = -BCH_ERR_ENOENT_inode; +found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index a6e15a01..0880772b 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -82,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + #endif /* _BCACHEFS_DIRENT_H */ diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index b32e91ba..8a8de614 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -135,6 +135,12 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) +static const unsigned bch2_accounting_type_nr_counters[] = { +#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x +}; + int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -193,6 +199,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), c, accounting_key_junk_at_end, "junk at end of accounting key"); + + bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], + c, accounting_key_nr_counters_wrong, + "accounting key with %u counters, should be %u", + bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); fsck_err: return ret; } diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h index f4372caf..abb1f620 100644 --- a/libbcachefs/disk_accounting.h +++ b/libbcachefs/disk_accounting.h @@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, struct bkey_s_c_accounting src) { - EBUG_ON(dst->k.u64s != src.k->u64s); - - for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + for (unsigned i = 0; + i < min(bch2_accounting_counters(&dst->k), + bch2_accounting_counters(src.k)); + i++) dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) dst->k.bversion = src.k->bversion; } @@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, diff --git a/libbcachefs/disk_accounting_format.h b/libbcachefs/disk_accounting_format.h index 7b6e6c97..8269af1d 100644 --- a/libbcachefs/disk_accounting_format.h +++ b/libbcachefs/disk_accounting_format.h @@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } +/* + * field 1: name + * field 2: id + * field 3: number of counters (max 3) + */ + #define BCH_DISK_ACCOUNTING_TYPES() \ - x(nr_inodes, 0) \ - x(persistent_reserved, 1) \ - x(replicas, 2) \ - x(dev_data_type, 3) \ - x(compression, 4) \ - x(snapshot, 5) \ - x(btree, 6) \ - x(rebalance_work, 7) \ - x(inum, 8) + x(nr_inodes, 0, 1) \ + x(persistent_reserved, 1, 1) \ + x(replicas, 2, 1) \ + x(dev_data_type, 3, 3) \ + x(compression, 4, 3) \ + x(snapshot, 5, 1) \ + x(btree, 6, 1) \ + x(rebalance_work, 7, 1) \ + x(inum, 8, 3) enum disk_accounting_type { -#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, +#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, BCH_DISK_ACCOUNTING_TYPES() #undef x BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +/* + * No subtypes - number of inodes in the entire filesystem + * + * XXX: perhaps we could add a per-subvolume counter? + */ +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +/* + * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the + * reservation: + */ +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +/* + * device, data type counter fields: + * [ + * nr_buckets + * live sectors (in buckets of that data type) + * sectors of internal fragmentation + * ] + * + * XXX: live sectors should've been done differently, you can have multiple data + * types in the same bucket (user, stripe, cached) and this collapses them to + * the bucket data type, and makes the internal fragmentation counter redundant + */ +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; +/* + * Compression type fields: + * [ + * number of extents + * uncompressed size + * compressed size + * ] + * + * Compression ratio, average extent size (fragmentation). + */ struct bch_acct_compression { __u8 type; }; +/* + * On disk usage by snapshot id; counts same values as replicas counter, but + * aggregated differently + */ struct bch_acct_snapshot { __u32 id; } __packed; @@ -137,10 +178,27 @@ struct bch_acct_btree { __u32 id; } __packed; +/* + * inum counter fields: + * [ + * number of extents + * sum of extent sizes - bkey size + * this field is similar to inode.bi_sectors, except here extents in + * different snapshots but the same inode number are all collapsed to the + * same counter + * sum of on disk size - same values tracked by replicas counters + * ] + * + * This tracks on disk fragmentation. + */ struct bch_acct_inum { __u64 inum; } __packed; +/* + * Simple counter of the amount of data (on disk sectors) rebalance needs to + * move, extents counted here are also in the rebalance_work btree. + */ struct bch_acct_rebalance_work { }; @@ -149,10 +207,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index c73ba73f..f2b9225f 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1124,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_erasure_coding_found_btree_node; } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1190,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -EIO; + return -BCH_ERR_ENOENT_dev_not_found; struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1227,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; - ret = bch2_btree_write_buffer_flush_sync(trans); + int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; - for (i = 0; i < nr_data; i++) { + for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); - return ret; } @@ -1451,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int ec_stripe_new_set_pending(c, h); } -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; - s->err = -EIO; + s->err = err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 8f2228e5..62d27e04 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -249,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index c179954a..101806d7 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -116,6 +116,7 @@ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_bucket_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ @@ -207,6 +208,7 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ @@ -267,6 +269,7 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, journal_shutdown) \ x(EIO, journal_flush_err) \ + x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ @@ -275,6 +278,7 @@ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ @@ -290,7 +294,19 @@ x(EIO, EIO_fault_injected) \ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ - x(EIO, data_read) \ + x(EIO, recompute_checksum) \ + x(EIO, decompress) \ + x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ + x(BCH_ERR_decompress, decompress_lz4) \ + x(BCH_ERR_decompress, decompress_gzip) \ + x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ + x(BCH_ERR_decompress, decompress_zstd) \ + x(EIO, data_write) \ + x(BCH_ERR_data_write, data_write_io) \ + x(BCH_ERR_data_write, data_write_csum) \ + x(BCH_ERR_data_write, data_write_invalid_ptr) \ + x(BCH_ERR_data_write, data_write_misaligned) \ + x(BCH_ERR_decompress, data_read) \ x(BCH_ERR_data_read, no_device_to_read_from) \ x(BCH_ERR_data_read, data_read_io_err) \ x(BCH_ERR_data_read, data_read_csum_err) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 6d68c89a..207f35d3 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -3,8 +3,8 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 04946d99..ca2073db 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -136,12 +136,8 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisened; - rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; @@ -592,29 +588,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 5ab1c73c..a03e2c78 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -225,11 +225,11 @@ static void bchfs_read(struct btree_trans *trans, bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); + swap(rbio->bio.bi_iter.bi_size, bytes); if (flags & BCH_READ_last_fragment) break; - swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); err: if (ret && diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 5b47b94f..e3a3230f 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -5,8 +5,8 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" -#include "fs-common.h" #include "fs-ioctl.h" +#include "namei.h" #include "quota.h" #include <linux/compat.h> diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 4453dd2f..94e97e28 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -11,7 +11,6 @@ #include "errcode.h" #include "extents.h" #include "fs.h" -#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" @@ -22,6 +21,7 @@ #include "io_read.h" #include "journal.h" #include "keylist.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "snapshot.h" @@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -2177,7 +2179,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) /* Some options can't be parsed until after the fs is started: */ opts = bch2_opts_empty(); - ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); if (ret) goto err_stop_fs; @@ -2331,6 +2333,8 @@ static int bch2_fs_parse_param(struct fs_context *fc, int ret = bch2_parse_one_mount_opt(c, &opts->opts, &opts->parse_later, param->key, param->string); + if (ret) + pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); return bch2_err_class(ret); } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 0e85131d..f955b8f9 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -10,10 +10,10 @@ #include "dirent.h" #include "error.h" #include "fs.h" -#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "namei.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" @@ -23,13 +23,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { @@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = -BCH_ERR_ENOENT_inode; -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, struct bch_inode_unpacked *inode) { @@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} - /* * Find any subvolume associated with a tree of snapshots * We can't rely on master_subvol - it might have been deleted. @@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + bch2_fsck_remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa trans_was_restarted(trans, restart_count); } -noinline_for_stack -static int check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!target->bi_dir && - !target->bi_dir_offset) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = check_dirent_inode_dirent(trans, iter, d, target); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { @@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + return bch2_fsck_remove_dirent(trans, d.k->p); ret = 0; goto out; } @@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); if (ret) goto err; out: @@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = __remove_dirent(trans, d.k->p); + ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) goto err; } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) goto err; } @@ -3240,7 +3021,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); if (!IS_ERR(optstr)) kfree(optstr); @@ -3348,7 +3129,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); if (!IS_ERR(optstr)) kfree(optstr); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 7aca010e..80051073 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } @@ -1079,7 +1078,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1243,7 +1242,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 428b9be6..f82cfbf4 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; return S_ISDIR(inode->bi_mode) || + inode->bi_subvol || (!inode->bi_nlink && inode_has_bp); } diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 4fb279f1..a04dffa4 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -295,6 +295,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -429,6 +436,71 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } + + bch2_trans_iter_exit(trans, &iter); +} + +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), + BTREE_ITER_intent); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + goto out; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ + if (u && !ret) + bch2_bkey_buf_copy(&u->k, c, new); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -462,7 +534,8 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { @@ -486,13 +559,21 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + struct btree_trans *trans = bch2_trans_get(c); + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); + trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); @@ -513,7 +594,7 @@ static void bch2_rbio_retry(struct work_struct *work) int ret = rbio->data_update ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, flags); + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); if (ret) { rbio->ret = ret; @@ -534,6 +615,7 @@ static void bch2_rbio_retry(struct work_struct *work) } bch2_rbio_done(rbio); + bch2_bkey_buf_exit(&sk, c); bch2_trans_put(trans); } @@ -958,6 +1040,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return -BCH_ERR_extent_poisened; retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -966,6 +1052,16 @@ retry_pick: goto hole; if (unlikely(ret < 0)) { + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); @@ -1263,12 +1359,15 @@ out_read_done: int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, + struct bkey_buf *prev_read, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; EBUG_ON(rbio->data_update); @@ -1279,7 +1378,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1311,6 +1410,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: @@ -1326,13 +1431,14 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags, -1); + swap(bvec_iter.bi_size, bytes); + if (ret) goto err; if (flags & BCH_READ_last_fragment) break; - swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) @@ -1344,9 +1450,7 @@ err: break; } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { + if (unlikely(ret)) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, @@ -1362,6 +1466,7 @@ err: bch2_rbio_done(rbio); } + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h index cd219504..1a85b092 100644 --- a/libbcachefs/io_read.h +++ b/libbcachefs/io_read.h @@ -137,12 +137,15 @@ static inline void bch2_read_extent(struct btree_trans *trans, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags, -1); + int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags, -1); + /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ + WARN(ret, "unhandled error from __bch2_read_extent()"); } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) @@ -152,7 +155,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped)); diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index a2e6b305..07b55839 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -402,61 +402,36 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset, const char *fmt, ...) +void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) { - if (op->subvol) - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9)); - else { - struct bpos pos = op->pos; - pos.offset = offset; - lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); - } + struct printbuf buf = PRINTBUF; - prt_str(out, "write error: "); - - va_list args; - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - - if (op->flags & BCH_WRITE_move) { - struct data_update *u = container_of(op, struct data_update, op); - - prt_printf(out, "\n from internal move "); - bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); - } -} - -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset, - const char *fmt, ...) -{ - if (op->subvol) - bch2_inum_offset_err_msg(op->c, out, + if (op->subvol) { + bch2_inum_offset_err_msg(op->c, &buf, (subvol_inum) { op->subvol, op->pos.inode, }, offset << 9); - else { + } else { struct bpos pos = op->pos; pos.offset = offset; - bch2_inum_snap_offset_err_msg(op->c, out, pos); + bch2_inum_snap_offset_err_msg(op->c, &buf, pos); } - prt_str(out, "write error: "); + prt_str(&buf, "write error: "); va_list args; va_start(args, fmt); - prt_vprintf(out, fmt, args); + prt_vprintf(&buf, fmt, args); va_end(args); if (op->flags & BCH_WRITE_move) { struct data_update *u = container_of(op, struct data_update, op); - prt_printf(out, "\n from internal move "); - bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + prt_printf(&buf, "\n from internal move "); + bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); } + + bch_err_ratelimited(op->c, "%s", buf.buf); + printbuf_exit(&buf); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -554,7 +529,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; + return -BCH_ERR_data_write_io; } if (dst != src) @@ -598,11 +573,8 @@ static void __bch2_write_index(struct bch_write_op *op) if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k), + bch2_write_op_error(op, bkey_start_offset(&insert->k), "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); } if (ret) @@ -611,7 +583,7 @@ static void __bch2_write_index(struct bch_write_op *op) out: /* If some a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; @@ -837,7 +809,6 @@ static int bch2_write_rechecksum(struct bch_fs *c, { struct bio *bio = &op->wbio.bio; struct bch_extent_crc_unpacked new_crc; - int ret; /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ @@ -845,10 +816,10 @@ static int bch2_write_rechecksum(struct bch_fs *c, bch2_csum_type_is_encryption(new_csum_type)) new_csum_type = op->crc.csum_type; - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); + int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); if (ret) return ret; @@ -858,44 +829,12 @@ static int bch2_write_rechecksum(struct bch_fs *c, return 0; } -static int bch2_write_decrypt(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this case) after - * it's decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_data_encoded)) - return PREP_ENCODED_OK; + struct bch_csum csum; + int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -906,12 +845,13 @@ static enum prep_encoded_ret { (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + op->csum_type != op->crc.csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } - return PREP_ENCODED_DO_WRITE; + return 1; } /* @@ -919,20 +859,24 @@ static enum prep_encoded_ret { * is, we have to decompress it: */ if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if (bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); + struct nonce nonce = extent_nonce(op->version, op->crc); + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + goto csum_err; - if (bch2_bio_uncompress_inplace(op, bio)) - return PREP_ENCODED_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type)) { + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + ret = bch2_bio_uncompress_inplace(op, bio); + if (ret) + return ret; } /* @@ -944,22 +888,44 @@ static enum prep_encoded_ret { * If the data is checksummed and we're only writing a subset, * rechecksum and adjust bio to point to currently live data: */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + if (op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type) && + (op->compression_opt || op->crc.csum_type != op->csum_type)) { + struct nonce nonce = extent_nonce(op->version, op->crc); + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + goto csum_err; - return PREP_ENCODED_OK; + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + return 0; +csum_err: + bch2_write_op_error(op, op->pos.offset, + "error verifying existing checksum while moving existing data (memory corruption?)\n" + " expected %0llx:%0llx got %0llx:%0llx type %s", + op->crc.csum.hi, + op->crc.csum.lo, + csum.hi, + csum.lo, + op->crc.csum_type < BCH_CSUM_NR + ? __bch2_csum_types[op->crc.csum_type] + : "(unknown)"); + return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -974,29 +940,28 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bool page_alloc_failed = false; int ret, more = 0; + if (op->incompressible) + op->compression_opt = 0; + BUG_ON(!bio_sectors(src)); ec_buf = bch2_writepoint_ec_buf(c, wp); - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; + if (unlikely(op->flags & BCH_WRITE_data_encoded)) { + ret = bch2_write_prep_encoded_data(op, wp); + if (ret < 0) + goto err; + if (ret) { + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; } if (ec_buf || @@ -1089,12 +1054,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, * data can't be modified (by userspace) while it's in * flight. */ - if (bch2_rechecksum_bio(c, src, version, op->crc, + ret = bch2_rechecksum_bio(c, src, version, op->crc, &crc, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), - op->csum_type)) - goto csum_err; + op->csum_type); + if (ret) + goto err; /* * rchecksum_bio sets compression_type on crc from op->crc, * this isn't always correct as sometimes we're changing @@ -1104,12 +1070,12 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.nonce = nonce; } else { if ((op->flags & BCH_WRITE_data_encoded) && - bch2_rechecksum_bio(c, src, version, op->crc, + (ret = bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), - op->crc.csum_type)) - goto csum_err; + op->crc.csum_type))) + goto err; crc.compressed_size = dst_len >> 9; crc.uncompressed_size = src_len >> 9; @@ -1168,16 +1134,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, do_write: *_dst = dst; return more; -csum_err: - { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "error verifying existing checksum while rewriting existing data (memory corruption?)"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = -EIO; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); @@ -1255,38 +1211,35 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (ret) { - op->error = ret; + if (ret) break; - } } bch2_trans_put(trans); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); + } + + if (ret) + op->error = ret; } static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -EIO; + op->error = -BCH_ERR_data_write_io; } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1436,11 +1389,8 @@ err: darray_exit(&buckets); if (ret) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, + bch2_write_op_error(op, op->pos.offset, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); op->error = ret; op->flags |= BCH_WRITE_submitted; } @@ -1480,7 +1430,7 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; @@ -1558,13 +1508,9 @@ err: op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_alloc_nowait)) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, + if (!(op->flags & BCH_WRITE_alloc_nowait)) + bch2_write_op_error(op, op->pos.offset, "%s(): %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } op->error = ret; break; } @@ -1691,11 +1637,8 @@ CLOSURE_CALLBACK(bch2_write) wbio_init(bio)->put_bio = false; if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "misaligned write"); - printbuf_exit(&buf); - op->error = -EIO; + bch2_write_op_error(op, op->pos.offset, "misaligned write"); + op->error = -BCH_ERR_data_write_misaligned; goto err; } diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h index 62773053..b8ab19a1 100644 --- a/libbcachefs/io_write.h +++ b/libbcachefs/io_write.h @@ -14,13 +14,8 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -__printf(5, 6) -void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64, const char *, ...); - -__printf(4, 5) -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64, - const char *, ...); +__printf(3, 4) +void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); #define BCH_WRITE_FLAGS() \ x(alloc_nowait) \ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index ce730269..ecb97d43 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -62,8 +62,7 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_newline(out); } - prt_printf(out, "expires:\t"); - prt_printf(out, "%li jiffies\n", buf->expires - jiffies); + prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); prt_printf(out, "flags:\t"); if (buf->noflush) @@ -142,6 +141,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) bool stuck = false; struct printbuf buf = PRINTBUF; + buf.atomic++; + if (!(error == -BCH_ERR_journal_full || error == -BCH_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || @@ -172,7 +173,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", bch2_err_str(error)); bch2_journal_debug_to_text(&buf, j); - bch_err(c, "%s", buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); @@ -726,10 +727,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, remaining_wait)) return ret; + bch_err(c, "Journal stuck? Waited for 10 seconds, err %s", bch2_err_str(ret)); struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); - bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", - buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); closure_wait_event(&j->async_wait, @@ -1510,7 +1511,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index cf2700b0..4ed6137f 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1624,7 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); - err = -EIO; + err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 3ed31492..5d1547aa 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } bch2_journal_do_discards(j); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 8fcdc698..66d1c055 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -126,26 +126,40 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { + struct bch_fs *c = io->write.op.c; struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; if (ctxt->stats) { - if (io->write.rbio.bio.bi_status) + if (rbio->bio.bi_status) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_uncorrected); - else if (io->write.rbio.saw_error) + else if (rbio->saw_error) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.ret || - io->write.rbio.bio.bi_status || - io->write.data_opts.scrub)) { + /* + * If the extent has been bitrotted, we're going to have to give it a + * new checksum in order to move it - but the poison bit will ensure + * that userspace still gets the appropriate error. + */ + if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && + (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + + rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, + nonce, &rbio->bio); + rbio->ret = 0; + } + + if (unlikely(rbio->ret || io->write.data_opts.scrub)) { move_free(io); return; } if (trace_io_move_write_enabled()) { - struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); @@ -528,6 +542,37 @@ int bch2_move_ratelimit(struct moving_context *ctxt) return 0; } +/* + * Move requires non extents iterators, and there's also no need for it to + * signal indirect_extent_missing_error: + */ +static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_reflink_p p) +{ + if (unlikely(REFLINK_P_ERROR(p.v))) + return bkey_s_c_null; + + struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); + + bch2_trans_iter_init(trans, iter, + BTREE_ID_reflink, reflink_pos, + BTREE_ITER_not_extents); + + struct bkey_s_c k = bch2_btree_iter_peek(iter); + if (!k.k || bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { + bch2_trans_iter_exit(trans, iter); + return bkey_s_c_null; + } + + return k; +} + static int bch2_move_data_btree(struct moving_context *ctxt, struct bpos start, struct bpos end, @@ -592,17 +637,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt, k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - s64 offset_into_extent = 0; bch2_trans_iter_exit(trans, &reflink_iter); - k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (bkey_deleted(k.k)) + if (!k.k) goto next_nondata; /* @@ -611,7 +655,6 @@ static int bch2_move_data_btree(struct moving_context *ctxt, * pointer - need to fixup iter->k */ extent_iter = &reflink_iter; - offset_into_extent = 0; } if (!bkey_extent_is_direct_data(k.k)) diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h index 82e473ed..807f779f 100644 --- a/libbcachefs/move_types.h +++ b/libbcachefs/move_types.h @@ -32,7 +32,7 @@ struct bch_move_stats { struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { diff --git a/libbcachefs/fs-common.c b/libbcachefs/namei.c similarity index 75% rename from libbcachefs/fs-common.c rename to libbcachefs/namei.c index fbc3da59..93246ad3 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/namei.c @@ -4,8 +4,8 @@ #include "acl.h" #include "btree_update.h" #include "dirent.h" -#include "fs-common.h" #include "inode.h" +#include "namei.h" #include "subvolume.h" #include "xattr.h" @@ -564,6 +564,8 @@ err: return ret; } +/* inum_to_path */ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -654,3 +656,179 @@ disconnected: prt_str_reversed(path, "(disconnected)"); goto out; } + +/* fsck */ + +static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; + int ret = 0; + + if (inode_points_to_dirent(target, d)) + return 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + return __bch2_fsck_write_inode(trans, target); + } + + if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; + + struct bkey_s_c_dirent bp_dirent = + bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), + 0, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + bool backpointer_exists = !ret; + ret = 0; + + if (!backpointer_exists) { + if (fsck_err(trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + } + } else { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (S_ISDIR(target->bi_mode) || target->bi_subvol) { + /* + * XXX: verify connectivity of the other dirent + * up to the root before removing this one + * + * Additionally, bch2_lookup would need to cope with the + * dirent it found being removed - or should we remove + * the other one, even though the inode points to it? + */ + if (in_fsck) { + if (fsck_err(trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) + ret = bch2_fsck_remove_dirent(trans, d.k->p); + } else { + bch2_fs_inconsistent(c, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf); + } + + goto out; + } else { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(!target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } + } + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int __bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); + if (ret) + goto err; + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + trans, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} diff --git a/libbcachefs/fs-common.h b/libbcachefs/namei.h similarity index 61% rename from libbcachefs/fs-common.h rename to libbcachefs/namei.h index 2b59210b..2e6f6364 100644 --- a/libbcachefs/fs-common.h +++ b/libbcachefs/namei.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_COMMON_H -#define _BCACHEFS_FS_COMMON_H +#ifndef _BCACHEFS_NAMEI_H +#define _BCACHEFS_NAMEI_H #include "dirent.h" @@ -44,4 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -#endif /* _BCACHEFS_FS_COMMON_H */ +int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *, bool); + +static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static inline int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + if (likely(inode_points_to_dirent(target, d) && + d.v->d_type == inode_d_type(target))) + return 0; + + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); +} + +#endif /* _BCACHEFS_NAMEI_H */ diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 24a46103..e5c42e20 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -44,7 +44,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -static const char * const __bch2_csum_types[] = { +const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; @@ -219,10 +219,10 @@ typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); typedef u64 (*member_opt_get_fn)(const struct bch_member *); typedef void (*member_opt_set_fn)(struct bch_member *, u64); -static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; -static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; -static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; -static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; +__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; +__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; +__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; +__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; #define type_compatible_or_null(_p, _type) \ __builtin_choose_expr( \ @@ -551,14 +551,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, goto bad_opt; ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { - prt_printf(parse_later, "%s=%s,", name, val); - if (parse_later->allocation_failure) { - ret = -ENOMEM; - goto out; + if (ret == -BCH_ERR_option_needs_open_fs) { + ret = 0; + + if (parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) + ret = -ENOMEM; } - ret = 0; goto out; } @@ -569,28 +570,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, bch2_opt_set_by_id(opts, id, v); ret = 0; - goto out; - -bad_opt: - pr_err("Bad mount option %s", name); - ret = -BCH_ERR_option_name; - goto out; - -bad_val: - pr_err("Invalid mount option %s", err.buf); - ret = -BCH_ERR_option_value; - out: printbuf_exit(&err); return ret; +bad_opt: + ret = -BCH_ERR_option_name; + goto out; +bad_val: + ret = -BCH_ERR_option_value; + goto out; } int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, char *options) + struct printbuf *parse_later, char *options, + bool ignore_unknown) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; - int ret; + int ret = 0; if (!options) return 0; @@ -615,14 +612,14 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, val = opt; ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); - if (ret < 0) - goto out; + if (ret == -BCH_ERR_option_name && ignore_unknown) + ret = 0; + if (ret) { + pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); + break; + } } - ret = 0; - goto out; - -out: kfree(copied_opts_start); return ret; } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8d1dc881..4d063130 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -16,6 +16,7 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; +extern const char * const __bch2_csum_types[]; extern const char * const __bch2_csum_opts[]; extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; @@ -499,11 +500,6 @@ enum fsck_err_opts { OPT_STR(bch2_member_states), \ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ "state", "rw,ro,failed,spare") \ - x(fs_size, u64, \ - OPT_DEVICE|OPT_HIDDEN, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_MEMBER_OPT, 0, \ - "size", "Size of filesystem on device") \ x(bucket_size, u32, \ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ @@ -640,7 +636,7 @@ int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, - char *); + char *, bool); /* inode opts: */ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 29a56938..10c6a7fd 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -95,6 +95,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | bch2_bkey_ptrs_need_move(c, opts, ptrs); } @@ -107,6 +110,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) if (!opts) return 0; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index a6e26733..266c5770 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -13,12 +13,12 @@ #include "disk_accounting.h" #include "errcode.h" #include "error.h" -#include "fs-common.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index fa27ec59..5c4e5de7 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -16,6 +16,7 @@ enum counters_flags { x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 67455beb..1736abea 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -311,13 +311,14 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ x(dirent_cf_name_too_big, 304, 0) \ x(dirent_stray_data_after_cf_name, 305, 0) \ - x(MAX, 307, 0) + x(MAX, 308, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/super.c b/libbcachefs/super.c index d662adfb..99f9a0aa 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1990,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 251ba822..8c200b55 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_btree_updates); read_attribute(gc_gens_pos); +__sysfs_attribute(read_fua_test, 0400); read_attribute(uuid); read_attribute(minor); @@ -310,6 +311,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); } +static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bio *bio = NULL; + void *buf = NULL; + unsigned bs = c->opts.block_size, iters; + u64 end, test_duration = NSEC_PER_SEC * 2; + struct bch2_time_stats stats_nofua, stats_fua, stats_random; + int ret = 0; + + bch2_time_stats_init_no_pcpu(&stats_nofua); + bch2_time_stats_init_no_pcpu(&stats_fua); + bch2_time_stats_init_no_pcpu(&stats_random); + + if (!bch2_dev_get_ioref(c, ca->dev_idx, READ)) { + prt_str(out, "offline\n"); + return 0; + } + + struct block_device *bdev = ca->disk_sb.bdev; + + bio = bio_kmalloc(1, GFP_KERNEL); + if (!bio) { + ret = -ENOMEM; + goto err; + } + + buf = kmalloc(bs, GFP_KERNEL); + if (!buf) + goto err; + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_nofua, submit_time); + + if (ret) + goto err; + } + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_fua, submit_time); + + if (ret) + goto err; + } + + u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_random, submit_time); + + if (ret) + goto err; + } + + u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); + u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); + u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); + + u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); + u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); + u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 12); + prt_printf(out, "This test must be run on an idle drive for accurate results\n"); + prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); + prt_printf(out, "fua support advertized: %s\n", bdev_fua(bdev) ? "yes" : "no"); + prt_newline(out); + prt_printf(out, "ns:\tlatency\rstddev\r\n"); + prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); + prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); + prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); + + bool read_cache = ns_nofua * 2 < ns_rand; + bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; + + if (!read_cache) + prt_str(out, "reads don't appear to be cached - safe\n"); + else if (!fua_cached) + prt_str(out, "fua reads don't appear to be cached - safe\n"); + else + prt_str(out, "fua reads appear to be cached - unsafe\n"); +err: + kfree(buf); + kfree(bio); + percpu_ref_put(&ca->io_ref); + bch_err_fn(c, ret); + return ret; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -823,6 +934,9 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + if (attr == &sysfs_read_fua_test) + return bch2_read_fua_test(out, ca); + int opt_id = bch2_opt_lookup(attr->name); if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); @@ -879,6 +993,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, + &sysfs_read_fua_test, + /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, diff --git a/libbcachefs/time_stats.c b/libbcachefs/time_stats.c index 3fe82757..a8382d87 100644 --- a/libbcachefs/time_stats.c +++ b/libbcachefs/time_stats.c @@ -10,6 +10,9 @@ #include "eytzinger.h" #include "time_stats.h" +/* disable automatic switching to percpu mode */ +#define TIME_STATS_NONPCPU ((struct time_stat_buffer *) 1) + static const struct time_unit time_units[] = { { "ns", 1 }, { "us", NSEC_PER_USEC }, @@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - if (!stats->buffer) { + if ((unsigned long) stats->buffer <= 1) { spin_lock_irqsave(&stats->lock, flags); time_stats_update_one(stats, start, end); - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + if (!stats->buffer && + mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, @@ -157,7 +161,8 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) unsigned offset = offsetof(struct bch2_time_stats, min_duration); memset((void *) stats + offset, 0, sizeof(*stats) - offset); - if (stats->buffer) { + if (stats->buffer && + stats->buffer != TIME_STATS_NONPCPU) { int cpu; for_each_possible_cpu(cpu) per_cpu_ptr(stats->buffer, cpu)->nr = 0; @@ -167,7 +172,10 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) void bch2_time_stats_exit(struct bch2_time_stats *stats) { - free_percpu(stats->buffer); + if (stats->buffer != TIME_STATS_NONPCPU) { + free_percpu(stats->buffer); + stats->buffer = NULL; + } } void bch2_time_stats_init(struct bch2_time_stats *stats) @@ -177,3 +185,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); } + +void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats) +{ + bch2_time_stats_init(stats); + stats->buffer = TIME_STATS_NONPCPU; +} diff --git a/libbcachefs/time_stats.h b/libbcachefs/time_stats.h index dc6493f7..eddb0985 100644 --- a/libbcachefs/time_stats.h +++ b/libbcachefs/time_stats.h @@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) void bch2_time_stats_reset(struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); +void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *); static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) { diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 519d00d6..8c07189a 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -339,6 +339,11 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +DEFINE_EVENT(bio, io_read_fail_and_poison, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /* ec.c */ TRACE_EVENT(stripe_create, diff --git a/libbcachefs/util.h b/libbcachefs/util.h index d41e133a..7d921fc9 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -431,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -508,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n"