diff --git a/.bcachefs_revision b/.bcachefs_revision index 9f202c51..7d7555ff 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -9736cbbc5cc39f6c666befdd787788b6ce6497f6 +46af7258b951a79a66511172ab8772ad2dfaa4e3 diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2384a5e3..e3c3a9b4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -10,6 +10,8 @@ #include <linux/types.h> #include <linux/bvec.h> #include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> struct bio_set; struct bio; @@ -63,6 +65,8 @@ struct block_device { struct gendisk * bd_disk; struct gendisk __bd_disk; int bd_fd; + + struct mutex bd_holder_lock; }; #define bdev_kobj(_bdev) (&((_bdev)->kobj)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b295bd9a..6964396e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -65,7 +65,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev); sector_t get_capacity(struct gendisk *disk); struct blk_holder_ops { - void (*mark_dead)(struct block_device *bdev); + void (*mark_dead)(struct block_device *bdev, bool surprise); + void (*sync)(struct block_device *bdev); + int (*freeze)(struct block_device *bdev); + int (*thaw)(struct block_device *bdev); }; static inline struct block_device *file_bdev(struct file *file) @@ -80,8 +83,12 @@ int lookup_bdev(const char *path, dev_t *); struct super_block { void *s_fs_info; + struct rw_semaphore s_umount; }; +static inline void evict_inodes(struct super_block *sb) {} +static inline int sync_filesystem(struct super_block *) { return 0; } + /* * File types * diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f395ce7f..b9d0ea22 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -9,6 +9,8 @@ struct dentry { struct inode *d_inode; }; +static inline void shrink_dcache_sb(struct super_block *) {} + #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } #define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index d2c3f59a..b432bb6e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -536,6 +536,7 @@ struct bch_dev { */ struct bch_member_cpu mi; atomic64_t errors[BCH_MEMBER_ERROR_NR]; + unsigned long write_errors_start; __uuid_t uuid; char name[BDEVNAME_SIZE]; @@ -1002,15 +1003,11 @@ struct bch_fs { wait_queue_head_t copygc_running_wq; /* STRIPES: */ - GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; spinlock_t ec_stripes_new_lock; - ec_stripes_heap ec_stripes_heap; - struct mutex ec_stripes_heap_lock; - /* ERASURE CODING */ struct list_head ec_stripe_head_list; struct mutex ec_stripe_head_lock; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 13cc0833..7a5b0d21 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -690,7 +690,8 @@ struct bch_sb_field_ext { x(cached_backpointers, BCH_VERSION(1, 21)) \ x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ - x(casefolding, BCH_VERSION(1, 24)) + x(casefolding, BCH_VERSION(1, 24)) \ + x(extent_flags, BCH_VERSION(1, 25)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -859,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); +LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -927,7 +929,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u BIT_ULL(BCH_FEATURE_new_siphash)| \ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_journal_no_flush)| \ + BIT_ULL(BCH_FEATURE_incompat_version_field)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index ca755e8d..1ec1f90e 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } - bch2_btree_lock_init(&b->c, 0); + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); __bch2_btree_node_to_freelist(bc, b); return b; @@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea } b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (!b) { + if (b) { + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); + } else { mutex_unlock(&bc->lock); bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); mutex_lock(&bc->lock); } - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 80a0094b..6638bb1f 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1187,7 +1187,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); - b->written += sectors; + b->written = min(b->written + sectors, btree_sectors(c)); if (blacklisted && !first) continue; @@ -1329,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work) bch_info(c, "retrying read"); ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); rb->have_ioref = ca != NULL; + rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1339,17 +1340,22 @@ static void btree_node_read_work(struct work_struct *work) } else { bio->bi_status = BLK_STS_REMOVED; } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); + + if (ca && bio->bi_status) + bch_err_dev_ratelimited(ca, + "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick); + bch2_mark_io_failure(&failed, &rb->pick, false); can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), @@ -1401,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref + ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); queue_work(c->btree_read_complete_wq, &rb->work); } @@ -2075,6 +2080,11 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; + unsigned commit_flags = + BCH_WATERMARK_interior_updates| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw; u64 start_time = wbio->start_time; int ret = 0; @@ -2083,38 +2093,24 @@ static void btree_node_write_work(struct work_struct *work) wbio->wbio.used_mempool, wbio->data); - bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, - bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_node_write_all_failed; - goto err; - } - - if (wbio->wbio.first_btree_write) { - if (wbio->wbio.failed.nr) { - - } - } else { + if (wbio->wbio.failed.nr) { + ret = bch2_trans_do(c, + bch2_btree_node_rewrite_key_get_iter(trans, b, + commit_flags)); + } else if (!wbio->wbio.first_btree_write) { ret = bch2_trans_do(c, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw, - !wbio->wbio.failed.nr)); - if (ret) - goto err; + commit_flags, true)); } -out: + + if (ret) { + set_btree_node_noevict(b); + bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, + "writing btree node: %s", bch2_err_str(ret)); + } + bio_put(&wbio->wbio.bio); btree_node_write_done(c, b, start_time); - return; -err: - set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); - goto out; } static void btree_node_write_endio(struct bio *bio) @@ -2126,16 +2122,17 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; - unsigned long flags; - if (wbio->have_ioref) - bch2_latency_acct(ca, wbio->submit_time, WRITE); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); - if (!ca || - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("btree")) { + if (ca && bio->bi_status) + bch_err_dev_ratelimited(ca, + "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)); + + if (bio->bi_status) { + unsigned long flags; spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 1821f40c..edce5943 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k } if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); ck->c.cached = true; goto lock; } diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 10b805a6..caef65ad 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -7,9 +7,10 @@ static struct lock_class_key bch2_btree_node_lock_key; void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags) + enum six_lock_init_flags flags, + gfp_t gfp) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); lockdep_set_notrack_class(&b->lock); } diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index b54ef48e..b33ab7af 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -13,7 +13,7 @@ #include "btree_iter.h" #include "six.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index a7f06dee..67816132 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, bn, PAGE_SIZE); + u64 submit_time = local_clock(); submit_bio_wait(bio); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status))) + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status)); return; + } if (le64_to_cpu(bn->magic) != bset_magic(c)) return; @@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_get(&ca->io_ref); + percpu_ref_put(&ca->io_ref); closure_put(w->cl); kfree(w); return 0; @@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - struct task_struct *t; - if (!w) { percpu_ref_put(&ca->io_ref); ret = -ENOMEM; goto err; } - percpu_ref_get(&ca->io_ref); - closure_get(&cl); w->cl = &cl; w->f = f; w->ca = ca; - t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); - closure_put(&cl); - f->ret = ret; - bch_err(c, "error starting kthread: %i", ret); + kfree(w); + bch_err_msg(c, ret, "starting kthread"); break; } + + closure_get(&cl); + percpu_ref_get(&ca->io_ref); + wake_up_process(t); } err: closure_sync(&cl); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index aac2947a..d3e0cf01 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2126,6 +2126,31 @@ err_free_update: goto out; } +static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b) +{ + bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* has node been freed? */ + if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + ret = -BCH_ERR_btree_node_dying; + goto err; + } + + BUG_ON(!btree_node_hashed(b)); + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, @@ -2191,7 +2216,29 @@ err: goto out; } -int bch2_btree_node_rewrite_key(struct btree_trans *trans, +static int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, + btree, k->k.p, + BTREE_MAX_DEPTH, level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); + ret = found + ? bch2_btree_node_rewrite(trans, &iter, b, flags) + : -ENOENT; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_node_rewrite_pos(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bpos pos, unsigned flags) { @@ -2211,6 +2258,19 @@ err: return ret; } +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, + struct btree *b, unsigned flags) +{ + struct btree_iter iter; + int ret = get_iter_to_node(trans, &iter, b); + if (ret) + return ret == -BCH_ERR_btree_node_dying ? 0 : ret; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + struct async_btree_rewrite { struct bch_fs *c; struct work_struct work; @@ -2220,57 +2280,14 @@ struct async_btree_rewrite { struct bkey_buf key; }; -static int async_btree_node_rewrite_trans(struct btree_trans *trans, - struct async_btree_rewrite *a) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, - a->btree_id, a->key.k->k.p, - BTREE_MAX_DEPTH, a->level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); - ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0) - : -ENOENT; - -#if 0 - /* Tracepoint... */ - if (!ret || ret == -ENOENT) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - if (!ret) { - prt_printf(&buf, "rewrite node:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - } else { - prt_printf(&buf, "node to rewrite not found:\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - prt_printf(&buf, "\n got: "); - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null)"); - } - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -#endif -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); + int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, + a->btree_id, a->level, a->key.k, 0)); if (ret != -ENOENT) bch_err_fn_ratelimited(c, ret); @@ -2514,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; - int ret; - - bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter); + int ret = get_iter_to_node(trans, &iter, b); if (ret) - goto out; - - /* has node been freed? */ - if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - goto out; - } - - BUG_ON(!btree_node_hashed(b)); + return ret == -BCH_ERR_btree_node_dying ? 0 : ret; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); -out: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index b5be250b..be71cd73 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -169,9 +169,12 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); -int bch2_btree_node_rewrite_key(struct btree_trans *, +int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, struct bpos, unsigned); +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, + struct btree *, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 7e484afe..522574bc 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -573,7 +573,6 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); - prt_newline(out); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) @@ -707,6 +706,18 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } +static bool can_write_extent(struct bch_fs *c, + struct bch_devs_list *devs_have, + unsigned target) +{ + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(*devs_have, i) + __clear_bit(*i, devs.d); + + return !bch2_is_zero(&devs, sizeof(devs)); +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, @@ -788,6 +799,20 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } + if (!can_write_extent(c, &m->op.devs_have, + m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) { + /* + * Check if we have rw devices not in devs_have: this can happen + * if we're trying to move data on a ro or failed device + * + * If we can't move it, we need to clear the rebalance_work bit, + * if applicable + * + * Also, copygc should skip ro/failed devices: + */ + return -BCH_ERR_data_update_done_no_rw_devs; + } + unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* diff --git a/libbcachefs/dirent_format.h b/libbcachefs/dirent_format.h index 2e766032..a46dbddd 100644 --- a/libbcachefs/dirent_format.h +++ b/libbcachefs/dirent_format.h @@ -44,9 +44,9 @@ struct bch_dirent { __u8 d_pad; __le16 d_name_len; __le16 d_cf_name_len; - __u8 d_names[0]; + __u8 d_names[]; } d_cf_name_block __packed; - __u8 d_name[0]; + __DECLARE_FLEX_ARRAY(__u8, d_name); } __packed; } __packed __aligned(8); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 1090cdb7..865cc53a 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -105,6 +105,7 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + u64 submit_time; struct bio bio; }; @@ -494,38 +495,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - if (flags & BTREE_TRIGGER_atomic) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - stripe_to_mem(m, new_s); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } - return 0; } @@ -748,14 +717,15 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "erasure coding %s error: %s", + bch2_account_io_completion(ca, bio_data_dir(bio), + ec_bio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); clear_bit(ec_bio->idx, ec_bio->buf->valid); + } int stale = dev_ptr_stale(ca, ptr); if (stale) { @@ -818,6 +788,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; @@ -939,26 +910,6 @@ err: static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { - ec_stripes_heap n, *h = &c->ec_stripes_heap; - - if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - - mutex_lock(&c->ec_stripes_heap_lock); - if (n.size > h->size) { - memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); - n.nr = h->nr; - swap(*h, n); - } - mutex_unlock(&c->ec_stripes_heap_lock); - - free_heap(&n); - } - - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -1031,155 +982,26 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) s->idx = 0; } -/* Heap of all existing stripes, ordered by blocks_nonempty */ - -static u64 stripe_idx_to_delete(struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - - lockdep_assert_held(&c->ec_stripes_heap_lock); - - if (h->nr && - h->data[0].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[0].idx)) - return h->data[0].idx; - - return 0; -} - -static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, - size_t i) -{ - struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - - genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -} - -static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - - return ((_l->blocks_nonempty > _r->blocks_nonempty) < - (_l->blocks_nonempty < _r->blocks_nonempty)); -} - -static inline void ec_stripes_heap_swap(void *l, void *r, void *h) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - ec_stripes_heap *_h = (ec_stripes_heap *)h; - size_t i = _l - _h->data; - size_t j = _r - _h->data; - - swap(*_l, *_r); - - ec_stripes_heap_set_backpointer(_h, i); - ec_stripes_heap_set_backpointer(_h, j); -} - -static const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, -}; - -static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes, idx); - - BUG_ON(m->heap_idx >= h->nr); - BUG_ON(h->data[m->heap_idx].idx != idx); -} - -void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_insert(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - - genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; - min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { - .idx = idx, - .blocks_nonempty = m->blocks_nonempty, - }), - &callbacks, - &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); - min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - - do_deletes = stripe_idx_to_delete(c) != 0; - mutex_unlock(&c->ec_stripes_heap_lock); - - if (do_deletes) - bch2_do_stripe_deletes(c); -} - /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_stripe s; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); - ret = -EINVAL; - goto err; - } - - s = bkey_s_c_to_stripe(k); - for (unsigned i = 0; i < s.v->nr_blocks; i++) - if (stripe_blockcount_get(s.v, i)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); + /* + * We expect write buffer races here + * Important: check stripe_is_open with stripe key locked: + */ + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1194,21 +1016,16 @@ static void ec_stripe_delete_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - while (1) { - mutex_lock(&c->ec_stripes_heap_lock); - u64 idx = stripe_idx_to_delete(c); - mutex_unlock(&c->ec_stripes_heap_lock); - - if (!idx) - break; - - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - ec_stripe_delete(trans, idx)); - bch_err_fn(c, ret); - if (ret) - break; - } - + bch2_trans_run(c, + bch2_btree_write_buffer_tryflush(trans) ?: + for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), + 0, lru_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -1557,6 +1374,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ret) goto err; err: + trace_stripe_create(c, s->idx, ret); + bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) @@ -1998,39 +1817,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return 0; } -static s64 get_existing_stripe(struct bch_fs *c, - struct ec_stripe_head *head) +static int __get_existing_stripe(struct btree_trans *trans, + struct ec_stripe_head *head, + struct ec_stripe_buf *stripe, + u64 idx) { - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t heap_idx; - u64 stripe_idx; - s64 ret = -1; + struct bch_fs *c = trans->c; - if (may_create_new_stripe(c)) - return -1; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), 0); + int ret = bkey_err(k); + if (ret) + goto err; - mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { - /* No blocks worth reusing, stripe will just be deleted: */ - if (!h->data[heap_idx].blocks_nonempty) - continue; + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) + goto out; - stripe_idx = h->data[heap_idx].idx; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) + goto out; - m = genradix_ptr(&c->stripes, stripe_idx); - - if (m->disk_label == head->disk_label && - m->algorithm == head->algo && - m->nr_redundant == head->redundancy && - m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant && - bch2_try_open_stripe(c, head->s, stripe_idx)) { - ret = stripe_idx; - break; - } + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && + s.v->nr_redundant == head->redundancy && + le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); + ret = 1; } - mutex_unlock(&c->ec_stripes_heap_lock); +out: + bch2_set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2082,24 +1902,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - s64 idx; - int ret; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; + if (may_create_new_stripe(c)) + return -1; - ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, s); - return ret; + struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), + 0, lru_k, ret) { + ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); + if (ret) + break; } + bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) + ret = -BCH_ERR_stripe_alloc_blocked; + if (ret == 1) + ret = 0; + if (ret) + return ret; return init_new_stripe_from_existing(c, s); } @@ -2397,46 +2226,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, ({ - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - - stripe_to_mem(m, bkey_s_c_to_stripe(k).v); - - bch2_stripes_heap_insert(c, m, k.k->p.offset); - 0; - }))); - bch_err_fn(c, ret); - return ret; -} - -void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->nr, 50); i++) { - m = genradix_ptr(&c->stripes, h->data[i].idx); - - prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, - h->data[i].blocks_nonempty, - m->nr_blocks - m->nr_redundant, - m->nr_redundant); - if (bch2_stripe_is_open(c, h->data[i].idx)) - prt_str(out, " open"); - prt_newline(out); - } - mutex_unlock(&c->ec_stripes_heap_lock); + return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -2507,15 +2297,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); - free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); - mutex_init(&c->ec_stripes_heap_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index cd1c837e..8f2228e5 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s) if (!s) return 0; - unsigned blocks_empty = 0, blocks_nonempty = 0; + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) { - blocks_empty += !stripe_blockcount_get(s, i); - blocks_nonempty += !!stripe_blockcount_get(s, i); - } + for (unsigned i = 0; i < nr_data; i++) + blocks_empty += !stripe_blockcount_get(s, i); /* Will be picked up by the stripe_delete worker */ - if (!blocks_nonempty) + if (blocks_empty == nr_data) return STRIPE_LRU_POS_EMPTY; if (!blocks_empty) @@ -260,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, enum bch_watermark, struct closure *); -void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); - void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); @@ -300,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 37558cc2..06144bfd 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -31,11 +31,4 @@ struct gc_stripe { struct bch_replicas_padded r; }; -struct ec_stripe_heap_entry { - size_t idx; - unsigned blocks_nonempty; -}; - -typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; - #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 89df9781..531fe575 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -119,6 +119,7 @@ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOENT, btree_node_dying) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ @@ -185,6 +186,7 @@ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ @@ -205,6 +207,7 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -269,12 +272,29 @@ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ + x(EIO, extent_poisened) \ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ x(EIO, device_offline) \ + x(EIO, EIO_fault_injected) \ + x(EIO, data_read) \ + x(BCH_ERR_data_read, data_read_retry) \ + x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ + x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \ + x(BCH_ERR_data_read, data_read_decompress_err) \ + x(BCH_ERR_data_read, data_read_decrypt_err) \ + x(BCH_ERR_data_read, data_read_ptr_stale_race) \ + x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ + x(BCH_ERR_data_read, data_read_no_encryption_key) \ + x(BCH_ERR_data_read, data_read_buffer_too_small) \ + x(BCH_ERR_data_read, data_read_key_overwritten) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 3f93a5a6..6d68c89a 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + bch_err(ca, - "too many IO errors, setting %s RO", + "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? "device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only(c); + + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { diff --git a/libbcachefs/error.h b/libbcachefs/error.h index b3cc69f2..7d3f0e2a 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -216,27 +216,37 @@ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); -#define bch2_dev_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif -#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) +static inline void bch2_account_io_success_fail(struct bch_dev *ca, + enum bch_member_error_type type, + bool success) +{ + if (likely(success)) { + if (type == BCH_MEMBER_ERROR_write && + ca->write_errors_start) + ca->write_errors_start = 0; + } else { + bch2_io_error(ca, type); + } +} + +static inline void bch2_account_io_completion(struct bch_dev *ca, + enum bch_member_error_type type, + u64 submit_time, bool success) +{ + if (unlikely(!ca)) + return; + + if (type != BCH_MEMBER_ERROR_checksum) + bch2_latency_acct(ca, submit_time, type); + + bch2_account_io_success_fail(ca, type, success); +} int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 78a51d96..f62ee96b 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -28,6 +28,13 @@ #include "trace.h" #include "util.h" +static const char * const bch2_extent_flags_strs[] = { +#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, + BCH_EXTENT_FLAGS() +#undef x + NULL, +}; + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, } void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) + struct extent_ptr_decoded *p, + bool csum_error) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); @@ -59,25 +67,28 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; + memset(f, 0, sizeof(*f)); + f->dev = p->ptr.dev; } + + if (p->do_ec_reconstruct) + f->failed_ec = true; + else if (!csum_error) + f->failed_io = true; + else + f->failed_csum_nr++; } -static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +static inline u64 dev_latency(struct bch_dev *ca) { - struct bch_dev *ca = bch2_dev_rcu(c, dev); return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } +static inline int dev_failed(struct bch_dev *ca) +{ + return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +} + /* * returns true if p1 is better than p2: */ @@ -85,9 +96,18 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, const struct extent_ptr_decoded p2) { - if (likely(!p1.idx && !p2.idx)) { - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); + if (likely(!p1.do_ec_reconstruct && + !p2.do_ec_reconstruct)) { + struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); + + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + + if (failed_delta) + return failed_delta < 0; + + u64 l1 = dev_latency(ca1); + u64 l2 = dev_latency(ca2); /* * Square the latencies, to bias more in favor of the faster @@ -103,9 +123,9 @@ static inline bool ptr_better(struct bch_fs *c, } if (bch2_force_reconstruct_read) - return p1.idx > p2.idx; + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - return p1.idx < p2.idx; + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; } /* @@ -114,19 +134,24 @@ static inline bool ptr_better(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick, - int dev) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick, + int dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_dev_io_failures *f; + unsigned csum_retry = 0; + bool have_csum_retries = false; int ret = 0; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned) + return -BCH_ERR_extent_poisened; +again: rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* @@ -154,20 +179,28 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + if (unlikely(failed) && + (f = bch2_dev_io_failures(failed, p.ptr.dev))) { + have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES; - if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) - p.idx++; + if (p.has_ec && + !f->failed_ec && + (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || + f->failed_csum_nr > csum_retry) + continue; + } - if (!p.idx && p.has_ec && bch2_force_reconstruct_read) - p.idx++; + if (!ca || !bch2_dev_is_online(ca)) { + if (p.has_ec) + p.do_ec_reconstruct = true; + else + continue; + } - if (p.idx > (unsigned) p.has_ec) - continue; + if (p.has_ec && bch2_force_reconstruct_read) + p.do_ec_reconstruct = true; if (ret > 0 && !ptr_better(c, p, *pick)) continue; @@ -177,6 +210,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, } rcu_read_unlock(); + if (unlikely(ret == -BCH_ERR_no_device_to_read_from && + have_csum_retries && + csum_retry < BCH_MAX_CSUM_RETRIES)) { + csum_retry++; + goto again; + } + return ret; } @@ -1002,7 +1042,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); + return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, @@ -1225,6 +1265,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; + case BCH_EXTENT_ENTRY_flags: + prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); + break; + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1386,6 +1430,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, #endif break; } + case BCH_EXTENT_ENTRY_flags: + bkey_fsck_err_on(entry != ptrs.start, + c, extent_flags_not_at_start, + "extent flags entry not at start"); + break; } } @@ -1452,6 +1501,28 @@ void bch2_ptr_swab(struct bkey_s k) } } +int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) +{ + int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); + if (ret) + return ret; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { + ptrs.start->flags.flags = flags; + } else { + struct bch_extent_flags f = { + .type = BIT(BCH_EXTENT_ENTRY_flags), + .flags = flags, + }; + __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1497,8 +1568,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: - break; case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_flags: break; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 8fae6b23..b4058502 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -320,8 +320,8 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).idx = 0; \ (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ @@ -401,7 +401,7 @@ out: \ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, bool); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); @@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, ptr1.unwritten == ptr2.unwritten && ptr1.offset == ptr2.offset && ptr1.dev == ptr2.dev && - ptr1.dev == ptr2.dev); + ptr1.gen == ptr2.gen); } void bch2_ptr_swab(struct bkey_s); @@ -753,4 +753,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } +static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) +{ + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) + return ptrs.start->flags.flags; + return 0; +} + +static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) +{ + return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); +} + +int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); + #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/libbcachefs/extents_format.h b/libbcachefs/extents_format.h index c198dfc3..74c0252c 100644 --- a/libbcachefs/extents_format.h +++ b/libbcachefs/extents_format.h @@ -79,8 +79,9 @@ x(crc64, 2) \ x(crc128, 3) \ x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 + x(rebalance, 5) \ + x(flags, 6) +#define BCH_EXTENT_ENTRY_MAX 7 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr { #endif }; +#define BCH_EXTENT_FLAGS() \ + x(poisoned, 0) + +enum bch_extent_flags_e { +#define x(n, v) BCH_EXTENT_FLAG_##n = v, + BCH_EXTENT_FLAGS() +#undef x +}; + +struct bch_extent_flags { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + flags:57; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 flags:57, + type:7; +#endif +}; + /* bch_extent_rebalance: */ #include "rebalance_format.h" diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index 43d6c341..f8b8e598 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -20,21 +20,23 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { - unsigned idx; bool has_ec; + unsigned do_ec_reconstruct; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; }; +#define BCH_MAX_CSUM_RETRIES 3 + struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - u8 idx; - u8 nr_failed; - u8 nr_retries; - } devs[BCH_REPLICAS_MAX]; + unsigned failed_csum_nr:4, + failed_io:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index ca70a3de..fbc3da59 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -268,16 +268,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); - struct bkey_s_c dirent_k = - bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - &dir_hash, dir, name, BTREE_ITER_intent); - ret = bkey_err(dirent_k); - if (ret) - goto err; - - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(dirent_k), &inum); - if (ret > 0) - ret = -ENOENT; + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_intent); if (ret) goto err; @@ -334,7 +326,6 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - dir_u->bi_size -= bkey_bytes(dirent_k.k); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 94bf34b9..717e7b94 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; + ret = 0; truncate_setsize(&inode->v, iattr->ia_size); diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 4465a2a8..5b47b94f 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -69,8 +69,9 @@ static int bch2_inode_flags_set(struct btree_trans *trans, if (ret < 0) return ret; - if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding)) - return -EOPNOTSUPP; + ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding); + if (ret) + return ret; bch2_check_set_feature(c, BCH_FEATURE_casefolding); #else @@ -243,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, int ret = 0; subvol_inum inum; - kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); if (!kname) return -ENOMEM; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 2c011a46..459ca825 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc) bch2_opts_apply(&c->opts, opts); - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; + /* + * need to initialise sb and set c->vfs_sb _before_ starting fs, + * for blk_holder_ops + */ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); @@ -2282,6 +2283,10 @@ got_sb: sb->s_shrink->seeks = 0; + ret = bch2_fs_start(c); + if (ret) + goto err_put_super; + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); bch_err_msg(c, ret, "mounting: error getting root inode"); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 9bf316e7..0e85131d 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1978,31 +1978,10 @@ fsck_err: return ret; } -static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - darray_for_each(w->inodes, i) - if (fsck_err_on(i->inode.bi_size != i->i_size, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_size: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { - i->inode.bi_size = i->i_size; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: - check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 821ff222..652dbc58 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -329,10 +329,17 @@ nopromote: static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return lockrestart_do(trans, + int ret = lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, out, (subvol_inum) { rbio->subvol, read_pos.inode }, read_pos.offset << 9)); + if (ret) + return ret; + + if (rbio->flags & BCH_READ_data_update) + prt_str(out, "(internal move) "); + + return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, @@ -341,10 +348,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, @@ -375,6 +378,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); + if (rbio->have_ioref) { + struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); + percpu_ref_put(&ca->io_ref); + } + if (rbio->split) { struct bch_read_bio *parent = rbio->parent; @@ -408,13 +416,90 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) +static struct bkey_s_c get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct btree_iter *iter) +{ + if (rbio->flags & BCH_READ_data_update) { + struct data_update *u = container_of(rbio, struct data_update, rbio); + + return bch2_bkey_get_iter(trans, iter, + u->btree_id, bkey_start_pos(&u->k.k->k), 0); + } else { + struct bpos pos = rbio->read_pos; + int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot); + if (ret) + return bkey_s_c_err(ret); + + return bch2_bkey_get_iter(trans, iter, + BTREE_ID_extents, pos, 0); + } +} + +static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bch_io_failures *failed) +{ + struct btree_iter iter = {}; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = get_rbio_extent(trans, rbio, &iter))); + + if (!ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) + bch2_mark_io_failure(failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_csum_err); + } + + bch2_trans_iter_exit(trans, &iter); +} + +static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, struct bch_io_failures *failed) +{ + u64 flags = bch2_bkey_extent_flags(k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + /* + * Make sure we actually attempt to read and got checksum failures from + * every replica + */ + + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) + continue; + + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev); + if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + return PTR_ERR_OR_ZERO(new) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +} + +static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) { struct data_update *u = container_of(rbio, struct data_update, rbio); - struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); @@ -429,7 +514,7 @@ retry: if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->hole = true; + rbio->ret = -BCH_ERR_data_read_key_overwritten; goto err; } @@ -441,14 +526,19 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (ret == READ_RETRY) + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; - if (ret) - rbio->bio.bi_status = BLK_STS_IOERR; + + if (ret) { + if (ret == -BCH_ERR_no_device_to_read_from && failed) + maybe_poison_extent(trans, &iter, k, failed); + + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + } BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - bch2_rbio_done(rbio); - bch2_trans_put(trans); + return ret; } static void bch2_rbio_retry(struct work_struct *work) @@ -463,16 +553,22 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + struct btree_trans *trans = bch2_trans_get(c); trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); + if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + mark_io_failure_if_current_extent_matches(trans, rbio, &failed); - if (!rbio->split) - rbio->bio.bi_status = 0; + if (!rbio->split) { + rbio->bio.bi_status = 0; + rbio->ret = 0; + } + + unsigned subvol = rbio->subvol; + struct bpos read_pos = rbio->read_pos; rbio = bch2_rbio_free(rbio); @@ -481,29 +577,55 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; - if (flags & BCH_READ_data_update) - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - else - __bch2_read(c, rbio, iter, inum, &failed, flags); + int ret = flags & BCH_READ_data_update + ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) + : __bch2_read(trans, rbio, iter, inum, &failed, flags); + + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; + } else { + struct printbuf buf = PRINTBUF; + + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, + (subvol_inum) { subvol, read_pos.inode }, + read_pos.offset << 9)); + if (rbio->flags & BCH_READ_data_update) + prt_str(&buf, "(internal move) "); + prt_str(&buf, "successful retry"); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + bch2_rbio_done(rbio); + bch2_trans_put(trans); } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) +static void bch2_rbio_error(struct bch_read_bio *rbio, + int ret, blk_status_t blk_error) { - rbio->retry = retry; - rbio->saw_error = true; + BUG_ON(ret >= 0); + + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_parent(rbio)->saw_error = true; if (rbio->flags & BCH_READ_in_retry) return; - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); - - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { bch2_rbio_punt(rbio, bch2_rbio_retry, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } else { + rbio = bch2_rbio_free(rbio); + + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_done(rbio); } } @@ -519,15 +641,13 @@ static void bch2_read_io_err(struct work_struct *work) bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_read); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -609,14 +729,12 @@ static void bch2_read_csum_err(struct work_struct *work) bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -636,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -656,16 +774,53 @@ static void bch2_read_decrypt_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); + +static void corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} + +static inline void maybe_corrupt_bio(struct bio *bio) +{ + if (bch2_read_corrupt_ratio && + !get_random_u32_below(bch2_read_corrupt_ratio)) + corrupt_bio(bio); +} +#else +static inline void maybe_corrupt_bio(struct bio *bio) +{ +} +#endif + /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct bio *src = &rbio->bio; struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; @@ -686,8 +841,26 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } + maybe_corrupt_bio(src); + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; + + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace, + BLK_STS_IOERR); + goto out; + } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) goto csum_err; /* @@ -760,17 +933,6 @@ out: memalloc_nofs_restore(nofs_flags); return; csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { - rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: @@ -790,10 +952,8 @@ static void bch2_read_endio(struct bio *bio) struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rbio->submit_time, !bio->bi_status); if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; @@ -808,9 +968,9 @@ static void bch2_read_endio(struct bio *bio) trace_and_count(c, io_read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } @@ -883,7 +1043,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; + int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, @@ -899,16 +1059,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); + ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ - if (!pick_ret) + if (!ret) goto hole; - if (unlikely(pick_ret < 0)) { + if (unlikely(ret < 0)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); @@ -924,6 +1084,7 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); + ret = -BCH_ERR_data_read_no_encryption_key; goto err; } @@ -940,7 +1101,7 @@ retry_pick: ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, false); percpu_ref_put(&ca->io_ref); goto retry_pick; } @@ -984,10 +1145,10 @@ retry_pick: */ struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - BUG(); if (ca) percpu_ref_put(&ca->io_ref); - goto hole; + rbio->ret = -BCH_ERR_data_read_buffer_too_small; + goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; @@ -1067,8 +1228,7 @@ retry_pick: rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; + rbio->ret = 0; rbio->context = 0; rbio->pick = pick; rbio->subvol = orig->subvol; @@ -1104,7 +1264,7 @@ retry_pick: trace_and_count(c, io_read_split, &orig->bio); } - if (!rbio->pick.idx) { + if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); @@ -1114,7 +1274,9 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, + -BCH_ERR_data_read_device_offline, + BLK_STS_IOERR); goto out; } @@ -1140,7 +1302,8 @@ retry_pick: } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err, + BLK_STS_IOERR); goto out; } @@ -1156,25 +1319,22 @@ out: rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); - ret = rbio->retry; + ret = rbio->ret; rbio = bch2_rbio_free(rbio); - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; + if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(failed, &pick, + ret == -BCH_ERR_data_read_csum_err); return ret; } err: if (flags & BCH_READ_in_retry) - return READ_ERR; + return ret; - orig->bio.bi_status = BLK_STS_IOERR; + orig->bio.bi_status = BLK_STS_IOERR; + orig->ret = ret; goto out_read_done; hole: @@ -1186,20 +1346,21 @@ hole: * to read no longer exists we have to signal that: */ if (flags & BCH_READ_data_update) - orig->hole = true; + orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_last_fragment) + if ((flags & BCH_READ_last_fragment) && + !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) +int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; @@ -1232,6 +1393,23 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; + if (unlikely(flags & BCH_READ_in_retry)) { + struct data_update *u = flags & BCH_READ_data_update + ? container_of(rbio, struct data_update, rbio) + : NULL; + + if (u && + !bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { + /* extent we wanted to read no longer exists: */ + ret = -BCH_ERR_data_read_key_overwritten; + goto err; + } + + if (!bkey_deleted(&sk.k->k) && + !bkey_and_val_eq(k, bkey_i_to_s_c(sk.k))) + failed->nr = 0; + } + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); unsigned sectors = k.k->size - offset_into_extent; @@ -1271,28 +1449,32 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - ret != READ_RETRY && - ret != READ_RETRY_AVOID) + !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } - bch2_trans_iter_exit(trans, &iter); + if (unlikely(ret)) { + if (ret == -BCH_ERR_no_device_to_read_from && failed) + maybe_poison_extent(trans, &iter, k, failed); - if (ret) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); + prt_printf(&buf, "read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + + if (!(flags & BCH_READ_in_retry)) + bch2_rbio_done(rbio); } - bch2_trans_put(trans); + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); + return ret; } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h index 73275da5..edcf50a4 100644 --- a/libbcachefs/io_read.h +++ b/libbcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "btree_iter.h" #include "reflink.h" struct bch_read_bio { @@ -40,13 +41,12 @@ struct bch_read_bio { split:1, have_ioref:1, narrow_crcs:1, - hole:1, saw_error:1, - retry:2, context:2; }; u16 _state; }; + s16 ret; struct extent_ptr_decoded pick; @@ -141,22 +141,21 @@ static inline void bch2_read_extent(struct btree_trans *trans, data_btree, k, offset_into_extent, NULL, flags, -1); } -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); +int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_retry_if_stale| - BCH_READ_may_promote| - BCH_READ_user_mapped); + bch2_trans_run(c, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped)); } static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, @@ -166,6 +165,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->c = orig->c; rbio->_state = 0; + rbio->ret = 0; rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; @@ -182,6 +182,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; + rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 738bdbfb..dbfcb28f 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -716,11 +716,15 @@ static void bch2_write_endio(struct bio *bio) ? bch2_dev_have_ref(c, wbio->dev) : NULL; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_inum_offset_ratelimited(ca, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); op->flags |= BCH_WRITE_io_error; } @@ -732,10 +736,8 @@ static void bch2_write_endio(struct bio *bio) set_bit(wbio->dev, op->devs_need_flush->d); } - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); - } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h index bf942566..62773053 100644 --- a/libbcachefs/io_write.h +++ b/libbcachefs/io_write.h @@ -11,12 +11,6 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 3d097de8..8d4f3bfa 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1096,8 +1096,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, /* allocate journal on a device: */ -static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) +static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; @@ -1225,26 +1225,20 @@ err_free: return ret; } -/* - * Allocate more journal space at runtime - not currently making use if it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) +static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, + unsigned nr, bool new_fs) { struct journal_device *ja = &ca->journal; - struct closure cl; int ret = 0; + struct closure cl; closure_init_stack(&cl); - down_write(&c->state_lock); - /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) - goto unlock; + return 0; - while (ja->nr < nr) { + while (!ret && ja->nr < nr) { struct disk_reservation disk_res = { 0, 0, 0 }; /* @@ -1257,25 +1251,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * filesystem-wide allocation will succeed, this is a device * specific allocation - we can hang here: */ + if (!new_fs) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) + break; + } - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; + ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + if (ret == -BCH_ERR_bucket_alloc_blocked || + ret == -BCH_ERR_open_buckets_empty) + ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); - - if (ret && ret != -BCH_ERR_bucket_alloc_blocked) - break; } - bch_err_fn(c, ret); -unlock: + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use if it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + down_write(&c->state_lock); + int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); up_write(&c->state_lock); + + bch_err_fn(c, ret); return ret; } @@ -1301,7 +1308,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); + ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 7d59ccc0..331c9d76 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1041,13 +1041,19 @@ reread: bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); + u64 submit_time = local_clock(); ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, - "journal read error: sector %llu", - offset) || - bch2_meta_read_fault("journal")) { + if (!ret && bch2_meta_read_fault("journal")) + ret = -BCH_ERR_EIO_fault_injected; + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + submit_time, !ret); + + if (ret) { + bch_err_dev_ratelimited(ca, + "journal read error: sector %llu", offset); /* * We don't error out of the recovery process * here, since the relevant journal entry may be @@ -1110,13 +1116,16 @@ reread: struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); - if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) { + bch_err_dev_ratelimited(ca, "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf)); saw_bad = true; + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, @@ -1655,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done) } bool completed = false; + bool do_discards = false; for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); @@ -1667,7 +1677,6 @@ static CLOSURE_CALLBACK(journal_write_done) j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; - bch2_do_discards(c); closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); } @@ -1718,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done) */ bch2_journal_do_writes(j); spin_unlock(&j->lock); + + if (do_discards) + bch2_do_discards(c); } static void journal_write_endio(struct bio *bio) @@ -1727,13 +1739,16 @@ static void journal_write_endio(struct bio *bio) struct journal *j = &ca->fs->journal; struct journal_buf *w = j->buf + jbio->buf_idx; - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + jbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("journal")) { - unsigned long flags; + bch2_blk_status_to_str(bio->bi_status)); + unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); @@ -1762,7 +1777,11 @@ static CLOSURE_CALLBACK(journal_write_submit) sectors); struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; + struct journal_bio *jbio = ja->bio[w->idx]; + struct bio *bio = &jbio->bio; + + jbio->submit_time = local_clock(); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1794,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* + * Wait for previous journal writes to comelete; they won't necessarily + * be flushed if they're still in flight + */ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index a0b17c6e..fd82f5d8 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -175,6 +175,7 @@ typedef DARRAY(u64) darray_u64; struct journal_bio { struct bch_dev *ca; unsigned buf_idx; + u64 submit_time; struct bio bio; }; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index e944f279..a3096e2a 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -125,8 +125,8 @@ static void move_write(struct moving_io *io) &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.bio.bi_status || - io->write.rbio.hole || + if (unlikely(io->write.rbio.ret || + io->write.rbio.bio.bi_status || io->write.data_opts.scrub)) { move_free(io); return; @@ -816,7 +816,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!bp.v->level) ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); else ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 4ecb721c..fa19fc44 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, struct move_bucket *b, u64 time) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a; - int ret; - if (bch2_bucket_is_open(trans->c, - b->k.bucket.inode, - b->k.bucket.offset)) + if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) return 0; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_cached); + int ret = bkey_err(k); if (ret) return ret; @@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (!ca) goto out; - a = bch2_alloc_to_v4(k, &_a); + if (ca->mi.state != BCH_MEMBER_STATE_rw || + !bch2_dev_is_online(ca)) + goto out_put; + + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; b->sectors = bch2_bucket_sectors_dirty(*a); u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); ret = lru_idx && lru_idx <= time; - +out_put: bch2_dev_put(ca); out: bch2_trans_iter_exit(trans, &iter); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 071a92ec..afb89d31 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -145,6 +145,11 @@ enum fsck_err_opts { OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ + x(write_error_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 300), \ + BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ + NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h index 41855796..e89b9c78 100644 --- a/libbcachefs/recovery_passes_types.h +++ b/libbcachefs/recovery_passes_types.h @@ -24,7 +24,7 @@ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ x(check_allocations, 5, PASS_FSCK) \ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 50118661..68172c6e 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -606,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 dst_done = 0; u32 dst_snapshot, src_snapshot; bool reflink_p_may_update_opts_field = - bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index 21130ead..acb5d845 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -91,9 +91,6 @@ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_directory_size_mismatch) \ x(cached_backpointers, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index cdafd877..67455beb 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -179,6 +179,7 @@ enum bch_fsck_flags { x(ptr_crc_redundant, 160, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ + x(extent_flags_not_at_start, 306, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ @@ -316,7 +317,7 @@ enum bch_fsck_flags { x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ x(dirent_cf_name_too_big, 304, 0) \ x(dirent_stray_data_after_cf_name, 305, 0) \ - x(MAX, 306, 0) + x(MAX, 307, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index b29b6c6c..38261638 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) return ret; } -static inline bool bch2_dev_is_readable(struct bch_dev *ca) +static inline bool bch2_dev_is_healthy(struct bch_dev *ca) { return bch2_dev_is_online(ca) && ca->mi.state != BCH_MEMBER_STATE_failed; @@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) { + might_sleep(); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); if (ca && !percpu_ref_tryget(&ca->io_ref)) diff --git a/libbcachefs/six.c b/libbcachefs/six.c index 7e7c66a1..7c403427 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock) EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags) + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp) { atomic_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); @@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name, * failure if they wish by checking lock->readers, but generally * will not want to treat it as an error. */ - lock->readers = alloc_percpu(unsigned); + lock->readers = alloc_percpu_gfp(unsigned, gfp); } #endif } diff --git a/libbcachefs/six.h b/libbcachefs/six.h index c142e06b..59b851cf 100644 --- a/libbcachefs/six.h +++ b/libbcachefs/six.h @@ -164,18 +164,19 @@ enum six_lock_init_flags { }; void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags); + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp); /** * six_lock_init - initialize a six lock * @lock: lock to initialize * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU */ -#define six_lock_init(lock, flags) \ +#define six_lock_init(lock, flags, gfp) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key, flags); \ + __six_lock_init((lock), #lock, &__key, flags, gfp); \ } while (0) /** diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 0edc8814..ee32d043 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -25,9 +25,6 @@ #include <linux/sort.h> #include <linux/string_choices.h> -static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -}; - struct bch2_metadata_version { u16 version; const char *name; @@ -69,14 +66,22 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta return v; } -void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed) + ? 0 + : -BCH_ERR_may_not_use_incompat_feature; + + if (!ret) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + return ret; } const char * const bch2_sb_fields[] = { @@ -366,7 +371,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; - u16 block_size; int ret; ret = bch2_sb_compatible(sb, out); @@ -385,8 +389,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_features; } - block_size = le16_to_cpu(sb->block_size); - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; @@ -452,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + + if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) + SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); } #ifdef __KERNEL__ @@ -743,7 +748,7 @@ retry: memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; - sb->holder = kmalloc(1, GFP_KERNEL); + sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); if (!sb->holder) return -ENOMEM; @@ -906,16 +911,16 @@ static void write_super_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; + bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); + /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "superblock %s error: %s", + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "superblock %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); ca->sb_write_error = 1; + } closure_put(&ca->fs->sb_write); percpu_ref_put(&ca->io_ref); @@ -1154,7 +1159,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -1; + ret = -BCH_ERR_erofs_sb_err; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); @@ -1211,11 +1216,12 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) bch2_sb_field_resize(&c->disk_sb, downgrade, 0); c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - if (incompat) + if (incompat) { + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); + } } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index f1ab4f94..167dd98f 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); +int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -static inline bool bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) +static inline int bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) { - if (unlikely(version > c->sb.version_incompat)) { - if (version > c->sb.version_incompat_allowed) - return false; - bch2_set_version_incompat(c, version); - } - return true; + return likely(version <= c->sb.version_incompat) + ? 0 + : bch2_set_version_incompat(c, version); } static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 10c281ad..cffad3b6 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c) } set_bit(BCH_FS_started, &c->flags); + wake_up(&c->ro_ref_wait); if (c->opts.read_only) { bch2_fs_read_only(c); @@ -1431,6 +1432,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); + /* + * Stash pointer to the filesystem for blk_holder_ops - note that once + * attached to a filesystem, we will always close the block device + * before tearing down the filesystem object. + */ + ca->disk_sb.holder->c = ca->fs; + ca->dev = ca->disk_sb.bdev->bd_dev; percpu_ref_reinit(&ca->io_ref); @@ -2016,6 +2024,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } +/* blk_holder_ops: */ + +static struct bch_fs *bdev_get_fs(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct bch_sb_handle_holder *holder = bdev->bd_holder; + struct bch_fs *c = holder->c; + + if (c && !bch2_ro_ref_tryget(c)) + c = NULL; + + mutex_unlock(&bdev->bd_holder_lock); + + if (c) + wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); + return c; +} + +/* returns with ref on ca->ref */ +static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) +{ + for_each_member_device(c, ca) + if (ca->disk_sb.bdev == bdev) + return ca; + return NULL; +} + +static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + } + + down_write(&c->state_lock); + struct bch_dev *ca = bdev_to_bch_dev(c, bdev); + if (!ca) + goto unlock; + + if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { + __bch2_dev_offline(c, ca); + } else { + if (sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + bch2_journal_flush(&c->journal); + bch2_fs_emergency_read_only(c); + } + + bch2_dev_put(ca); +unlock: + if (sb) + up_read(&sb->s_umount); + up_write(&c->state_lock); + bch2_ro_ref_put(c); +} + +static void bch2_fs_bdev_sync(struct block_device *bdev) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + bch2_ro_ref_put(c); +} + +const struct blk_holder_ops bch2_sb_handle_bdev_ops = { + .mark_dead = bch2_fs_bdev_mark_dead, + .sync = bch2_fs_bdev_sync, +}; + /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 04f8287e..23533bce 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; + #endif /* _BCACHEFS_SUPER_H */ diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 368a63d9..3a899f79 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -2,13 +2,19 @@ #ifndef _BCACHEFS_SUPER_TYPES_H #define _BCACHEFS_SUPER_TYPES_H +struct bch_fs; + +struct bch_sb_handle_holder { + struct bch_fs *c; +}; + struct bch_sb_handle { struct bch_sb *sb; struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; - void *holder; + struct bch_sb_handle_holder *holder; size_t buffer_size; blk_mode_t mode; unsigned have_layout:1; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index a9953181..2ed3f755 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -174,7 +174,6 @@ read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_reserve_cache); -read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); @@ -355,9 +354,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_reserve_cache) bch2_btree_reserve_cache_to_text(out, c); - if (attr == &sysfs_stripes_heap) - bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, NULL); @@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_reserve_cache, &sysfs_new_stripes, - &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, #ifdef BCH_WRITE_REF_DEBUG diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 5718988d..c8669a6b 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +/* ec.c */ + +TRACE_EVENT(stripe_create, + TP_PROTO(struct bch_fs *c, u64 idx, int ret), + TP_ARGS(c, idx, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, idx ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk("%d,%d idx %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->idx, + __entry->ret) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, diff --git a/linux/blkdev.c b/linux/blkdev.c index e496fc11..eb257d8b 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -208,6 +208,8 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, bdev->queue.backing_dev_info = bdev->bd_disk->bdi; bdev->bd_inode = &bdev->__bd_inode; + mutex_init(&bdev->bd_holder_lock); + struct file *file = calloc(sizeof(*file), 1); file->f_inode = bdev->bd_inode;