Mirror of https://github.com/koverstreet/bcachefs-tools.git
Update bcachefs sources to 46af7258b951 bcachefs: BCH_SB_FEATURES_ALL includes BCH_FEATURE_incompat_version_field
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 86cbeaf1c2
commit 6cbadc946d
Changed files:

.bcachefs_revision
include/linux/…
libbcachefs/: bcachefs.h, bcachefs_format.h, btree_cache.c, btree_io.c, btree_key_cache.c, btree_locking.c, btree_locking.h, btree_node_scan.c, btree_update_interior.c, btree_update_interior.h, data_update.c, dirent_format.h, ec.c, ec.h, ec_types.h, errcode.h, error.c, error.h, extents.c, extents.h, extents_format.h, extents_types.h, fs-common.c, fs-io.c, fs-ioctl.c, fs.c, fsck.c, io_read.c, io_read.h, io_write.c, io_write.h, journal.c, journal_io.c, journal_types.h, move.c, movinggc.c, opts.h, recovery_passes_types.h, reflink.c, sb-downgrade.c, sb-errors_format.h, sb-members.h, six.c, six.h, super-io.c, super-io.h, super.c, super.h, super_types.h, sysfs.c, trace.h
linux/…
.bcachefs_revision

@@ -1 +1 @@
-9736cbbc5cc39f6c666befdd787788b6ce6497f6
+46af7258b951a79a66511172ab8772ad2dfaa4e3
include/linux/blkdev.h

@@ -10,6 +10,8 @@
 #include <linux/types.h>
 #include <linux/bvec.h>
 #include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>

 struct bio_set;
 struct bio;
@@ -63,6 +65,8 @@ struct block_device {
 	struct gendisk *	bd_disk;
+	struct gendisk		__bd_disk;
 	int			bd_fd;

+	struct mutex		bd_holder_lock;
 };

 #define bdev_kobj(_bdev)	(&((_bdev)->kobj))
@@ -65,7 +65,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev);
 sector_t get_capacity(struct gendisk *disk);

 struct blk_holder_ops {
-	void (*mark_dead)(struct block_device *bdev);
+	void (*mark_dead)(struct block_device *bdev, bool surprise);
+	void (*sync)(struct block_device *bdev);
+	int (*freeze)(struct block_device *bdev);
+	int (*thaw)(struct block_device *bdev);
 };

 static inline struct block_device *file_bdev(struct file *file)
@@ -80,8 +83,12 @@ int lookup_bdev(const char *path, dev_t *);

 struct super_block {
 	void			*s_fs_info;
+	struct rw_semaphore	s_umount;
 };

+static inline void evict_inodes(struct super_block *sb) {}
+static inline int sync_filesystem(struct super_block *) { return 0; }
+
 /*
  * File types
  *
include/linux/dcache.h

@@ -9,6 +9,8 @@ struct dentry {
 	struct inode		*d_inode;
 };

+static inline void shrink_dcache_sb(struct super_block *) {}
+
 #define QSTR_INIT(n,l)	{ { { .len = l } }, .name = n }
+#define QSTR(n)		(struct qstr)QSTR_INIT(n, strlen(n))
libbcachefs/bcachefs.h

@@ -536,6 +536,7 @@ struct bch_dev {
 	 */
 	struct bch_member_cpu	mi;
 	atomic64_t		errors[BCH_MEMBER_ERROR_NR];
+	unsigned long		write_errors_start;

 	__uuid_t		uuid;
 	char			name[BDEVNAME_SIZE];
@@ -1002,15 +1003,11 @@ struct bch_fs {
 	wait_queue_head_t	copygc_running_wq;

 	/* STRIPES: */
-	GENRADIX(struct stripe) stripes;
 	GENRADIX(struct gc_stripe) gc_stripes;

 	struct hlist_head	ec_stripes_new[32];
 	spinlock_t		ec_stripes_new_lock;

-	ec_stripes_heap		ec_stripes_heap;
-	struct mutex		ec_stripes_heap_lock;
-
 	/* ERASURE CODING */
 	struct list_head	ec_stripe_head_list;
 	struct mutex		ec_stripe_head_lock;
libbcachefs/bcachefs_format.h

@@ -690,7 +690,8 @@ struct bch_sb_field_ext {
 	x(cached_backpointers,		BCH_VERSION(1, 21))	\
 	x(stripe_backpointers,		BCH_VERSION(1, 22))	\
 	x(stripe_lru,			BCH_VERSION(1, 23))	\
-	x(casefolding,			BCH_VERSION(1, 24))
+	x(casefolding,			BCH_VERSION(1, 24))	\
+	x(extent_flags,			BCH_VERSION(1, 25))

 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -859,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,	struct bch_sb, flags[5], 32, 48);
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
 					struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6],  0,  4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);

 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
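The new BCH_SB_WRITE_ERROR_TIMEOUT field occupies bits 4..14 of superblock flags word 6, i.e. a 10-bit timeout in seconds. A minimal standalone sketch of how such a bit-range field is extracted; the helper name is illustrative, not the actual LE64_BITMASK macro:

#include <stdint.h>
#include <stdio.h>

/* Extract bits [lo, hi) from a 64-bit superblock flags word. */
static uint64_t sb_field_get(uint64_t flags, unsigned lo, unsigned hi)
{
	return (flags >> lo) & (~0ULL >> (64 - (hi - lo)));
}

int main(void)
{
	uint64_t flags6 = 0;
	/* store a 30-second write error timeout in bits 4..14: */
	flags6 |= (30ULL & ((1ULL << 10) - 1)) << 4;
	printf("timeout = %llu seconds\n",
	       (unsigned long long) sb_field_get(flags6, 4, 14));
	return 0;
}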
@@ -927,7 +929,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
 	 BIT_ULL(BCH_FEATURE_new_siphash)|		\
 	 BIT_ULL(BCH_FEATURE_btree_ptr_v2)|		\
 	 BIT_ULL(BCH_FEATURE_new_varint)|		\
-	 BIT_ULL(BCH_FEATURE_journal_no_flush))
+	 BIT_ULL(BCH_FEATURE_journal_no_flush)|		\
+	 BIT_ULL(BCH_FEATURE_incompat_version_field))

 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
|
@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bch2_btree_lock_init(&b->c, 0);
|
||||
bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
|
||||
|
||||
__bch2_btree_node_to_freelist(bc, b);
|
||||
return b;
|
||||
@@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
 	}

 	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
-	if (!b) {
+	if (b) {
+		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
+	} else {
 		mutex_unlock(&bc->lock);
 		bch2_trans_unlock(trans);
 		b = __btree_node_mem_alloc(c, GFP_KERNEL);
 		if (!b)
 			goto err;
+		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
 		mutex_lock(&bc->lock);
 	}

-	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
-
 	BUG_ON(!six_trylock_intent(&b->c.lock));
 	BUG_ON(!six_trylock_write(&b->c.lock));
libbcachefs/btree_io.c

@@ -1187,7 +1187,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 			 le64_to_cpu(i->journal_seq),
 			 b->written, b->written + sectors, ptr_written);

-		b->written += sectors;
+		b->written = min(b->written + sectors, btree_sectors(c));

 		if (blacklisted && !first)
 			continue;
@@ -1329,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work)
 			bch_info(c, "retrying read");
 			ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
 			rb->have_ioref		= ca != NULL;
+			rb->start_time		= local_clock();
 			bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
 			bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
 			bio->bi_iter.bi_size	= btree_buf_bytes(b);
|
||||
} else {
|
||||
bio->bi_status = BLK_STS_REMOVED;
|
||||
}
|
||||
|
||||
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
|
||||
rb->start_time, !bio->bi_status);
|
||||
start:
|
||||
printbuf_reset(&buf);
|
||||
bch2_btree_pos_to_text(&buf, c, b);
|
||||
bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
|
||||
"btree read error %s for %s",
|
||||
bch2_blk_status_to_str(bio->bi_status), buf.buf);
|
||||
|
||||
if (ca && bio->bi_status)
|
||||
bch_err_dev_ratelimited(ca,
|
||||
"btree read error %s for %s",
|
||||
bch2_blk_status_to_str(bio->bi_status), buf.buf);
|
||||
if (rb->have_ioref)
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
rb->have_ioref = false;
|
||||
|
||||
bch2_mark_io_failure(&failed, &rb->pick);
|
||||
bch2_mark_io_failure(&failed, &rb->pick, false);
|
||||
|
||||
can_retry = bch2_bkey_pick_read_device(c,
|
||||
bkey_i_to_s_c(&b->key),
|
||||
@@ -1401,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio)
 	struct btree_read_bio *rb =
 		container_of(bio, struct btree_read_bio, bio);
 	struct bch_fs *c	= rb->c;
+	struct bch_dev *ca	= rb->have_ioref
+		? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;

-	if (rb->have_ioref) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
-		bch2_latency_acct(ca, rb->start_time, READ);
-	}
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+				   rb->start_time, !bio->bi_status);

 	queue_work(c->btree_read_complete_wq, &rb->work);
 }
@@ -2075,6 +2080,11 @@ static void btree_node_write_work(struct work_struct *work)
 		container_of(work, struct btree_write_bio, work);
 	struct bch_fs *c	= wbio->wbio.c;
 	struct btree *b		= wbio->wbio.bio.bi_private;
+	unsigned commit_flags =
+		BCH_WATERMARK_interior_updates|
+		BCH_TRANS_COMMIT_journal_reclaim|
+		BCH_TRANS_COMMIT_no_enospc|
+		BCH_TRANS_COMMIT_no_check_rw;
 	u64 start_time		= wbio->start_time;
 	int ret = 0;
@@ -2083,38 +2093,24 @@ static void btree_node_write_work(struct work_struct *work)
 			  wbio->wbio.used_mempool,
 			  wbio->data);

 	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
 		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

 	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
 		ret = -BCH_ERR_btree_node_write_all_failed;
 		goto err;
 	}

-	if (wbio->wbio.first_btree_write) {
-		if (wbio->wbio.failed.nr) {
-
-		}
-	} else {
+	if (wbio->wbio.failed.nr) {
+		ret = bch2_trans_do(c,
+			bch2_btree_node_rewrite_key_get_iter(trans, b,
+					commit_flags));
+	} else if (!wbio->wbio.first_btree_write) {
 		ret = bch2_trans_do(c,
 			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-					BCH_WATERMARK_interior_updates|
-					BCH_TRANS_COMMIT_journal_reclaim|
-					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_check_rw,
-					!wbio->wbio.failed.nr));
-		if (ret)
-			goto err;
+					commit_flags, true));
 	}
out:
+	if (ret) {
+		set_btree_node_noevict(b);
+		bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+				     "writing btree node: %s", bch2_err_str(ret));
+	}
+
 	bio_put(&wbio->wbio.bio);
 	btree_node_write_done(c, b, start_time);
 	return;
err:
-	set_btree_node_noevict(b);
-	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
-			     "writing btree node: %s", bch2_err_str(ret));
 	goto out;
 }

 static void btree_node_write_endio(struct bio *bio)
@@ -2126,16 +2122,17 @@ static void btree_node_write_endio(struct bio *bio)
 	struct bch_fs *c	= wbio->c;
 	struct btree *b		= wbio->bio.bi_private;
 	struct bch_dev *ca	= wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-	unsigned long flags;

-	if (wbio->have_ioref)
-		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+				   wbio->submit_time, !bio->bi_status);

-	if (!ca ||
-	    bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
-			       "btree write error: %s",
-			       bch2_blk_status_to_str(bio->bi_status)) ||
-	    bch2_meta_write_fault("btree")) {
+	if (ca && bio->bi_status)
+		bch_err_dev_ratelimited(ca,
+			   "btree write error: %s",
+			   bch2_blk_status_to_str(bio->bi_status));
+
+	if (bio->bi_status) {
+		unsigned long flags;
 		spin_lock_irqsave(&c->btree_write_error_lock, flags);
 		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
 		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
libbcachefs/btree_key_cache.c

@@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
 	}

 	if (ck) {
-		bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+		bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
 		ck->c.cached = true;
 		goto lock;
 	}
libbcachefs/btree_locking.c

@@ -7,9 +7,10 @@
 static struct lock_class_key bch2_btree_node_lock_key;

 void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
-			  enum six_lock_init_flags flags)
+			  enum six_lock_init_flags flags,
+			  gfp_t gfp)
 {
-	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
 	lockdep_set_notrack_class(&b->lock);
 }
libbcachefs/btree_locking.h

@@ -13,7 +13,7 @@
 #include "btree_iter.h"
 #include "six.h"

-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);

 void bch2_trans_unlock_noassert(struct btree_trans *);
 void bch2_trans_unlock_write(struct btree_trans *);
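Taken together with the btree_cache.c hunks above, the new gfp_t parameter lets callers initialize a node's lock without sleeping while a mutex is held, and fall back to a blocking allocation after dropping it. A toy, runnable model of that try-fast-then-retry pattern; all names are illustrative, not bcachefs API:

#include <stdio.h>
#include <stdlib.h>

/* Model of the GFP_NOWAIT-then-GFP_KERNEL pattern: try a non-sleeping
 * allocation while "locked"; on failure, drop the lock and retry with
 * a blocking allocation. */
static void *alloc_nowait(size_t n)   { return rand() % 2 ? malloc(n) : NULL; }
static void *alloc_blocking(size_t n) { return malloc(n); }

int main(void)
{
	/* "lock held": only non-sleeping allocations are allowed here */
	void *b = alloc_nowait(64);
	if (b) {
		printf("fast path: allocated without sleeping\n");
	} else {
		/* drop locks, then a blocking allocation is safe */
		b = alloc_blocking(64);
		printf("slow path: dropped lock, blocking allocation\n");
	}
	free(b);
	return 0;
}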
libbcachefs/btree_node_scan.c

@@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	bio->bi_iter.bi_sector	= offset;
 	bch2_bio_map(bio, bn, PAGE_SIZE);

+	u64 submit_time = local_clock();
 	submit_bio_wait(bio);
-	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
-			       "IO error in try_read_btree_node() at %llu: %s",
-			       offset, bch2_blk_status_to_str(bio->bi_status)))
+
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca,
+				"IO error in try_read_btree_node() at %llu: %s",
+				offset, bch2_blk_status_to_str(bio->bi_status));
 		return;
+	}

 	if (le64_to_cpu(bn->magic) != bset_magic(c))
 		return;
@@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
 err:
 	bio_put(bio);
 	free_page((unsigned long) buf);
-	percpu_ref_get(&ca->io_ref);
+	percpu_ref_put(&ca->io_ref);
 	closure_put(w->cl);
 	kfree(w);
 	return 0;
@@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
 			continue;

 		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
-		struct task_struct *t;
-
 		if (!w) {
 			percpu_ref_put(&ca->io_ref);
 			ret = -ENOMEM;
 			goto err;
 		}

-		percpu_ref_get(&ca->io_ref);
-		closure_get(&cl);
 		w->cl		= &cl;
 		w->f		= f;
 		w->ca		= ca;

-		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+		struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
 		ret = PTR_ERR_OR_ZERO(t);
 		if (ret) {
 			percpu_ref_put(&ca->io_ref);
-			closure_put(&cl);
-			f->ret = ret;
-			bch_err(c, "error starting kthread: %i", ret);
+			kfree(w);
+			bch_err_msg(c, ret, "starting kthread");
 			break;
 		}
+
+		closure_get(&cl);
+		percpu_ref_get(&ca->io_ref);
+		wake_up_process(t);
 	}
err:
 	closure_sync(&cl);
libbcachefs/btree_update_interior.c

@@ -2126,6 +2126,31 @@ err_free_update:
 	goto out;
 }

+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
+			    struct btree *b)
+{
+	bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
+				  BTREE_MAX_DEPTH, b->c.level,
+				  BTREE_ITER_intent);
+	int ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto err;
+
+	/* has node been freed? */
+	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
+		/* node has been freed: */
+		BUG_ON(!btree_node_dying(b));
+		ret = -BCH_ERR_btree_node_dying;
+		goto err;
+	}
+
+	BUG_ON(!btree_node_hashed(b));
+	return 0;
+err:
+	bch2_trans_iter_exit(trans, iter);
+	return ret;
+}
+
 int bch2_btree_node_rewrite(struct btree_trans *trans,
 			    struct btree_iter *iter,
 			    struct btree *b,
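The new get_iter_to_node() helper factors out the "find the iterator for a node we already hold a pointer to" dance, reporting a freed node as the new btree_node_dying errcode. Callers below translate that code back to success, since a node that went away needs no rewrite. A toy, runnable model of that error-translation idiom; the constant is illustrative, bcachefs uses -BCH_ERR_btree_node_dying:

#include <stdio.h>

#define ERR_BTREE_NODE_DYING 2048	/* illustrative value */

static int rewrite_get_iter(int iter_err)
{
	if (iter_err)
		return iter_err == -ERR_BTREE_NODE_DYING ? 0 : iter_err;
	return 0; /* ... do the rewrite ... */
}

int main(void)
{
	printf("%d\n", rewrite_get_iter(-ERR_BTREE_NODE_DYING)); /* 0: node freed, nothing to do */
	printf("%d\n", rewrite_get_iter(-5));                    /* -5: real error propagates */
	return 0;
}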
@@ -2191,7 +2216,29 @@ err:
 	goto out;
 }

-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
-				enum btree_id btree, unsigned level,
-				struct bpos pos, unsigned flags)
+static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+				enum btree_id btree, unsigned level,
+				struct bkey_i *k, unsigned flags)
+{
+	struct btree_iter iter;
+	bch2_trans_node_iter_init(trans, &iter,
+				  btree, k->k.p,
+				  BTREE_MAX_DEPTH, level, 0);
+	struct btree *b = bch2_btree_iter_peek_node(&iter);
+	int ret = PTR_ERR_OR_ZERO(b);
+	if (ret)
+		goto out;
+
+	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
+	ret = found
+		? bch2_btree_node_rewrite(trans, &iter, b, flags)
+		: -ENOENT;
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
+				enum btree_id btree, unsigned level,
+				struct bpos pos, unsigned flags)
 {
@@ -2211,6 +2258,19 @@ err:
 	return ret;
 }

+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
+					 struct btree *b, unsigned flags)
+{
+	struct btree_iter iter;
+	int ret = get_iter_to_node(trans, &iter, b);
+	if (ret)
+		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
+
+	ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 struct async_btree_rewrite {
 	struct bch_fs		*c;
 	struct work_struct	work;
@@ -2220,57 +2280,14 @@ struct async_btree_rewrite {
 	struct bkey_buf		key;
 };

-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
-					  struct async_btree_rewrite *a)
-{
-	struct btree_iter iter;
-	bch2_trans_node_iter_init(trans, &iter,
-				  a->btree_id, a->key.k->k.p,
-				  BTREE_MAX_DEPTH, a->level, 0);
-	struct btree *b = bch2_btree_iter_peek_node(&iter);
-	int ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		goto out;
-
-	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
-	ret = found
-		? bch2_btree_node_rewrite(trans, &iter, b, 0)
-		: -ENOENT;
-
-#if 0
-	/* Tracepoint... */
-	if (!ret || ret == -ENOENT) {
-		struct bch_fs *c = trans->c;
-		struct printbuf buf = PRINTBUF;
-
-		if (!ret) {
-			prt_printf(&buf, "rewrite node:\n  ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
-		} else {
-			prt_printf(&buf, "node to rewrite not found:\n  want: ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
-			prt_printf(&buf, "\n  got:  ");
-			if (b)
-				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-			else
-				prt_str(&buf, "(null)");
-		}
-		bch_info(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
-#endif
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
 static void async_btree_node_rewrite_work(struct work_struct *work)
 {
 	struct async_btree_rewrite *a =
 		container_of(work, struct async_btree_rewrite, work);
 	struct bch_fs *c = a->c;

-	int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+	int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
+					a->btree_id, a->level, a->key.k, 0));
 	if (ret != -ENOENT)
 		bch_err_fn_ratelimited(c, ret);
@@ -2514,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
 					unsigned commit_flags, bool skip_triggers)
 {
 	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
-				  BTREE_MAX_DEPTH, b->c.level,
-				  BTREE_ITER_intent);
-	ret = bch2_btree_iter_traverse(&iter);
+	int ret = get_iter_to_node(trans, &iter, b);
 	if (ret)
-		goto out;
-
-	/* has node been freed? */
-	if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
-		/* node has been freed: */
-		BUG_ON(!btree_node_dying(b));
-		goto out;
-	}
-
-	BUG_ON(!btree_node_hashed(b));
+		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;

 	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
 			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));

 	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
 					 commit_flags, skip_triggers);
-out:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
libbcachefs/btree_update_interior.h

@@ -169,9 +169,12 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,

 int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
 			    struct btree *, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
 				enum btree_id, unsigned,
 				struct bpos, unsigned);
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
+					 struct btree *, unsigned);
+
 void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);

 int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
libbcachefs/data_update.c

@@ -573,7 +573,6 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,

 	prt_str_indented(out, "extra replicas:\t");
 	prt_u64(out, data_opts->extra_replicas);
-	prt_newline(out);
 }

 void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
@@ -707,6 +706,18 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
 	return 0;
 }

+static bool can_write_extent(struct bch_fs *c,
+			     struct bch_devs_list *devs_have,
+			     unsigned target)
+{
+	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+	darray_for_each(*devs_have, i)
+		__clear_bit(*i, devs.d);
+
+	return !bch2_is_zero(&devs, sizeof(devs));
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
 			  struct btree_iter *iter,
 			  struct moving_context *ctxt,
|
||||
ptr_bit <<= 1;
|
||||
}
|
||||
|
||||
if (!can_write_extent(c, &m->op.devs_have,
|
||||
m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
|
||||
/*
|
||||
* Check if we have rw devices not in devs_have: this can happen
|
||||
* if we're trying to move data on a ro or failed device
|
||||
*
|
||||
* If we can't move it, we need to clear the rebalance_work bit,
|
||||
* if applicable
|
||||
*
|
||||
* Also, copygc should skip ro/failed devices:
|
||||
*/
|
||||
return -BCH_ERR_data_update_done_no_rw_devs;
|
||||
}
|
||||
|
||||
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
|
||||
|
||||
/*
|
||||
|
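can_write_extent() above starts from the mask of writeable devices in the target, clears every device that already holds the extent, and bails out if nothing is left. A toy, runnable model of that mask logic; the fixed-size mask and names are illustrative only:

#include <stdio.h>
#include <stdbool.h>

static bool can_write(unsigned long rw_devs, const int *devs_have, int nr_have)
{
	/* clear devices that already hold the extent */
	for (int i = 0; i < nr_have; i++)
		rw_devs &= ~(1UL << devs_have[i]);
	return rw_devs != 0;
}

int main(void)
{
	int have[] = { 0, 1 };
	/* devices 0 and 1 are rw, but both already hold the extent: */
	printf("%s\n", can_write(0x3, have, 2) ? "ok" : "no rw devs");
	/* device 2 is also rw, so the move can proceed: */
	printf("%s\n", can_write(0x7, have, 2) ? "ok" : "no rw devs");
	return 0;
}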
libbcachefs/dirent_format.h

@@ -44,9 +44,9 @@ struct bch_dirent {
 			__u8		d_pad;
 			__le16		d_name_len;
 			__le16		d_cf_name_len;
-			__u8		d_names[0];
+			__u8		d_names[];
 		} d_cf_name_block __packed;
-		__u8			d_name[0];
+		__DECLARE_FLEX_ARRAY(__u8, d_name);
 	} __packed;
 } __packed __aligned(8);
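The d_names[0] to d_names[] change replaces a GNU zero-length array with a C99 flexible array member, which compilers and bounds checkers understand; __DECLARE_FLEX_ARRAY is the kernel wrapper needed when one sits inside a union, and is not reproduced here. A minimal runnable demo of the construct:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dirent_name {
	unsigned short	len;
	char		name[];	/* flexible array member */
};

int main(void)
{
	const char *n = "example";
	struct dirent_name *d = malloc(sizeof(*d) + strlen(n) + 1);
	d->len = strlen(n);
	memcpy(d->name, n, d->len + 1);
	printf("%.*s\n", d->len, d->name);
	free(d);
	return 0;
}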
libbcachefs/ec.c
@@ -105,6 +105,7 @@ struct ec_bio {
 	struct bch_dev		*ca;
 	struct ec_stripe_buf	*buf;
 	size_t			idx;
+	u64			submit_time;
 	struct bio		bio;
 };
@@ -494,38 +495,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 		return ret;
 	}

-	if (flags & BTREE_TRIGGER_atomic) {
-		struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-		if (!m) {
-			struct printbuf buf1 = PRINTBUF;
-			struct printbuf buf2 = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf1, c, old);
-			bch2_bkey_val_to_text(&buf2, c, new);
-			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
-					    "old %s\n"
-					    "new %s", idx, buf1.buf, buf2.buf);
-			printbuf_exit(&buf2);
-			printbuf_exit(&buf1);
-			bch2_inconsistent_error(c);
-			return -1;
-		}
-
-		if (!new_s) {
-			bch2_stripes_heap_del(c, m, idx);
-
-			memset(m, 0, sizeof(*m));
-		} else {
-			stripe_to_mem(m, new_s);
-
-			if (!old_s)
-				bch2_stripes_heap_insert(c, m, idx);
-			else
-				bch2_stripes_heap_update(c, m, idx);
-		}
-	}
-
 	return 0;
 }
@@ -748,14 +717,15 @@ static void ec_block_endio(struct bio *bio)
 	struct bch_dev *ca = ec_bio->ca;
 	struct closure *cl = bio->bi_private;

-	if (bch2_dev_io_err_on(bio->bi_status, ca,
-			       bio_data_dir(bio)
-			       ? BCH_MEMBER_ERROR_write
-			       : BCH_MEMBER_ERROR_read,
-			       "erasure coding %s error: %s",
+	bch2_account_io_completion(ca, bio_data_dir(bio),
+				   ec_bio->submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
 			   str_write_read(bio_data_dir(bio)),
-			   bch2_blk_status_to_str(bio->bi_status)))
+			   bch2_blk_status_to_str(bio->bi_status));
 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+	}

 	int stale = dev_ptr_stale(ca, ptr);
 	if (stale) {
@@ -818,6 +788,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 		ec_bio->ca			= ca;
 		ec_bio->buf			= buf;
 		ec_bio->idx			= idx;
+		ec_bio->submit_time		= local_clock();

 		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
 		ec_bio->bio.bi_end_io		= ec_block_endio;
@@ -939,26 +910,6 @@ err:

 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
 {
-	ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
-	if (idx >= h->size) {
-		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
-			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
-		mutex_lock(&c->ec_stripes_heap_lock);
-		if (n.size > h->size) {
-			memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
-			n.nr = h->nr;
-			swap(*h, n);
-		}
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		free_heap(&n);
-	}
-
-	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
-		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
 	if (c->gc_pos.phase != GC_PHASE_not_running &&
 	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
 		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1031,155 +982,26 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
 	s->idx = 0;
 }

-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-
-	lockdep_assert_held(&c->ec_stripes_heap_lock);
-
-	if (h->nr &&
-	    h->data[0].blocks_nonempty == 0 &&
-	    !bch2_stripe_is_open(c, h->data[0].idx))
-		return h->data[0].idx;
-
-	return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
-						   size_t i)
-{
-	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
-	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
-	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
-	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
-	return ((_l->blocks_nonempty > _r->blocks_nonempty) <
-		(_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
-	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
-	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-	ec_stripes_heap *_h = (ec_stripes_heap *)h;
-	size_t i = _l - _h->data;
-	size_t j = _r - _h->data;
-
-	swap(*_l, *_r);
-
-	ec_stripes_heap_set_backpointer(_h, i);
-	ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
-	.less = ec_stripes_heap_cmp,
-	.swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-	BUG_ON(m->heap_idx >= h->nr);
-	BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
-			   struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
-	genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
-	min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
-			.idx = idx,
-			.blocks_nonempty = m->blocks_nonempty,
-		}),
-		&callbacks,
-		&c->ec_stripes_heap);
-
-	heap_verify_backpointer(c, idx);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	bool do_deletes;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
-	i = m->heap_idx;
-	min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
-	min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
-	heap_verify_backpointer(c, idx);
-
-	do_deletes = stripe_idx_to_delete(c) != 0;
-	mutex_unlock(&c->ec_stripes_heap_lock);
-
-	if (do_deletes)
-		bch2_do_stripe_deletes(c);
-}
-
 /* stripe deletion */

 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
 {
-	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_s_c_stripe s;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
-			       BTREE_ITER_intent);
-	ret = bkey_err(k);
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+				BTREE_ID_stripes, POS(0, idx),
+				BTREE_ITER_intent);
+	int ret = bkey_err(k);
 	if (ret)
 		goto err;

-	if (k.k->type != KEY_TYPE_stripe) {
-		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	s = bkey_s_c_to_stripe(k);
-	for (unsigned i = 0; i < s.v->nr_blocks; i++)
-		if (stripe_blockcount_get(s.v, i)) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -EINVAL;
-			goto err;
-		}
-
-	ret = bch2_btree_delete_at(trans, &iter, 0);
+	/*
+	 * We expect write buffer races here
+	 * Important: check stripe_is_open with stripe key locked:
+	 */
+	if (k.k->type == KEY_TYPE_stripe &&
+	    !bch2_stripe_is_open(trans->c, idx) &&
+	    stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
@@ -1194,21 +1016,16 @@ static void ec_stripe_delete_work(struct work_struct *work)
 	struct bch_fs *c =
 		container_of(work, struct bch_fs, ec_stripe_delete_work);

-	while (1) {
-		mutex_lock(&c->ec_stripes_heap_lock);
-		u64 idx = stripe_idx_to_delete(c);
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		if (!idx)
-			break;
-
-		int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-					ec_stripe_delete(trans, idx));
-		bch_err_fn(c, ret);
-		if (ret)
-			break;
-	}
-
+	bch2_trans_run(c,
+		bch2_btree_write_buffer_tryflush(trans) ?:
+		for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+				0, lru_k,
+				NULL, NULL,
+				BCH_TRANS_COMMIT_no_enospc, ({
+			ec_stripe_delete(trans, lru_k.k->p.offset);
+		})));
 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
|
||||
@ -1557,6 +1374,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
||||
if (ret)
|
||||
goto err;
|
||||
err:
|
||||
trace_stripe_create(c, s->idx, ret);
|
||||
|
||||
bch2_disk_reservation_put(c, &s->res);
|
||||
|
||||
for (i = 0; i < v->nr_blocks; i++)
|
||||
@@ -1998,39 +1817,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
 	return 0;
 }

-static s64 get_existing_stripe(struct bch_fs *c,
-			       struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+				 struct ec_stripe_head *head,
+				 struct ec_stripe_buf *stripe,
+				 u64 idx)
 {
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t heap_idx;
-	u64 stripe_idx;
-	s64 ret = -1;
+	struct bch_fs *c = trans->c;

-	if (may_create_new_stripe(c))
-		return -1;
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+				BTREE_ID_stripes, POS(0, idx), 0);
+	int ret = bkey_err(k);
+	if (ret)
+		goto err;

-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
-		/* No blocks worth reusing, stripe will just be deleted: */
-		if (!h->data[heap_idx].blocks_nonempty)
-			continue;
+	/* We expect write buffer races here */
+	if (k.k->type != KEY_TYPE_stripe)
+		goto out;

-		stripe_idx = h->data[heap_idx].idx;
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	if (stripe_lru_pos(s.v) <= 1)
+		goto out;

-		m = genradix_ptr(&c->stripes, stripe_idx);
-
-		if (m->disk_label	== head->disk_label &&
-		    m->algorithm	== head->algo &&
-		    m->nr_redundant	== head->redundancy &&
-		    m->sectors		== head->blocksize &&
-		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
-		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
-			ret = stripe_idx;
-			break;
-		}
+	if (s.v->disk_label		== head->disk_label &&
+	    s.v->algorithm		== head->algo &&
+	    s.v->nr_redundant		== head->redundancy &&
+	    le16_to_cpu(s.v->sectors)	== head->blocksize &&
+	    bch2_try_open_stripe(c, head->s, idx)) {
+		bkey_reassemble(&stripe->key, k);
+		ret = 1;
 	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+	bch2_set_btree_iter_dontneed(&iter);
+err:
+	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -2082,24 +1902,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 					struct ec_stripe_new *s)
 {
 	struct bch_fs *c = trans->c;
-	s64 idx;
-	int ret;

 	/*
 	 * If we can't allocate a new stripe, and there's no stripes with empty
 	 * blocks for us to reuse, that means we have to wait on copygc:
 	 */
-	idx = get_existing_stripe(c, h);
-	if (idx < 0)
-		return -BCH_ERR_stripe_alloc_blocked;
+	if (may_create_new_stripe(c))
+		return -1;

-	ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
-	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
-			     "reading stripe key: %s", bch2_err_str(ret));
-	if (ret) {
-		bch2_stripe_close(c, s);
-		return ret;
+	struct btree_iter lru_iter;
+	struct bkey_s_c lru_k;
+	int ret = 0;
+
+	for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+			0, lru_k, ret) {
+		ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+		if (ret)
+			break;
 	}
+	bch2_trans_iter_exit(trans, &lru_iter);
+
+	if (!ret)
+		ret = -BCH_ERR_stripe_alloc_blocked;
+	if (ret == 1)
+		ret = 0;
+	if (ret)
+		return ret;

 	return init_new_stripe_from_existing(c, s);
 }
@@ -2397,46 +2226,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)

 int bch2_stripes_read(struct bch_fs *c)
 {
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			if (k.k->type != KEY_TYPE_stripe)
-				continue;
-
-			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
-			if (ret)
-				break;
-
-			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
-			stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
-			bch2_stripes_heap_insert(c, m, k.k->p.offset);
-			0;
-		})));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
-		m = genradix_ptr(&c->stripes, h->data[i].idx);
-
-		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
-		       h->data[i].blocks_nonempty,
-		       m->nr_blocks - m->nr_redundant,
-		       m->nr_redundant);
-		if (bch2_stripe_is_open(c, h->data[i].idx))
-			prt_str(out, " open");
-		prt_newline(out);
-	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
+	return 0;
 }

 static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2507,15 +2297,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)

 	BUG_ON(!list_empty(&c->ec_stripe_new_list));

-	free_heap(&c->ec_stripes_heap);
-	genradix_free(&c->stripes);
 	bioset_exit(&c->ec_bioset);
 }

 void bch2_fs_ec_init_early(struct bch_fs *c)
 {
 	spin_lock_init(&c->ec_stripes_new_lock);
-	mutex_init(&c->ec_stripes_heap_lock);

 	INIT_LIST_HEAD(&c->ec_stripe_head_list);
 	mutex_init(&c->ec_stripe_head_lock);
libbcachefs/ec.h

@@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s)
 	if (!s)
 		return 0;

-	unsigned blocks_empty = 0, blocks_nonempty = 0;
+	unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;

-	for (unsigned i = 0; i < s->nr_blocks; i++) {
-		blocks_empty += !stripe_blockcount_get(s, i);
-		blocks_nonempty += !!stripe_blockcount_get(s, i);
-	}
+	for (unsigned i = 0; i < nr_data; i++)
+		blocks_empty += !stripe_blockcount_get(s, i);

 	/* Will be picked up by the stripe_delete worker */
-	if (!blocks_nonempty)
+	if (blocks_empty == nr_data)
 		return STRIPE_LRU_POS_EMPTY;

 	if (!blocks_empty)
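After this change only the nr_data leading blocks are counted, since parity blocks always carry data and would otherwise make a stripe look nonempty forever. A toy, runnable model of the visible logic; the constant mirrors the diff (empty stripes land at LRU position 1, where ec_stripe_delete_work picks them up), while the branch for partially-empty stripes is truncated in the hunk and is represented by a placeholder here:

#include <stdio.h>

#define STRIPE_LRU_POS_EMPTY 1

static unsigned long stripe_lru_pos(const unsigned *blockcount,
				    unsigned nr_blocks, unsigned nr_redundant)
{
	unsigned nr_data = nr_blocks - nr_redundant, blocks_empty = 0;

	for (unsigned i = 0; i < nr_data; i++)
		blocks_empty += !blockcount[i];

	if (blocks_empty == nr_data)
		return STRIPE_LRU_POS_EMPTY;	/* picked up by stripe delete */

	return 2;	/* placeholder: real code orders by fragmentation */
}

int main(void)
{
	unsigned counts[4] = { 0, 0, 5, 7 };	/* 2 data + 2 parity blocks */
	printf("%lu\n", stripe_lru_pos(counts, 4, 2));	/* 1: all data blocks empty */
	return 0;
}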
@@ -260,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
 			unsigned, unsigned, unsigned,
 			enum bch_watermark, struct closure *);

-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-
 void bch2_do_stripe_deletes(struct bch_fs *);
 void bch2_ec_do_stripe_creates(struct bch_fs *);
 void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
@@ -300,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *);

 int bch2_stripes_read(struct bch_fs *);

-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
 void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);

 void bch2_fs_ec_exit(struct bch_fs *);
libbcachefs/ec_types.h

@@ -31,11 +31,4 @@ struct gc_stripe {
 	struct bch_replicas_padded r;
 };

-struct ec_stripe_heap_entry {
-	size_t			idx;
-	unsigned		blocks_nonempty;
-};
-
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
-
 #endif /* _BCACHEFS_EC_TYPES_H */
libbcachefs/errcode.h

@@ -119,6 +119,7 @@
 	x(ENOENT,			ENOENT_dev_idx_not_found)		\
 	x(ENOENT,			ENOENT_inode_no_backpointer)		\
 	x(ENOENT,			ENOENT_no_snapshot_tree_subvol)		\
+	x(ENOENT,			btree_node_dying)			\
 	x(ENOTEMPTY,			ENOTEMPTY_dir_not_empty)		\
 	x(ENOTEMPTY,			ENOTEMPTY_subvol_not_empty)		\
 	x(EEXIST,			EEXIST_str_hash_set)			\
@@ -185,6 +186,7 @@
 	x(BCH_ERR_data_update_done,	data_update_done_no_writes_needed)	\
 	x(BCH_ERR_data_update_done,	data_update_done_no_snapshot)		\
 	x(BCH_ERR_data_update_done,	data_update_done_no_dev_refs)		\
+	x(BCH_ERR_data_update_done,	data_update_done_no_rw_devs)		\
 	x(EINVAL,			device_state_not_allowed)		\
 	x(EINVAL,			member_info_missing)			\
 	x(EINVAL,			mismatched_block_size)			\
@@ -205,6 +207,7 @@
 	x(EINVAL,			no_resize_with_buckets_nouse)		\
 	x(EINVAL,			inode_unpack_error)			\
 	x(EINVAL,			varint_decode_error)			\
+	x(EOPNOTSUPP,			may_not_use_incompat_feature)		\
 	x(EROFS,			erofs_trans_commit)			\
 	x(EROFS,			erofs_no_writes)			\
 	x(EROFS,			erofs_journal_err)			\
@@ -269,12 +272,29 @@
 	x(EIO,				mark_stripe)				\
 	x(EIO,				stripe_reconstruct)			\
 	x(EIO,				key_type_error)				\
+	x(EIO,				extent_poisened)			\
 	x(EIO,				no_device_to_read_from)			\
 	x(EIO,				missing_indirect_extent)		\
 	x(EIO,				invalidate_stripe_to_dev)		\
 	x(EIO,				no_encryption_key)			\
 	x(EIO,				insufficient_journal_devices)		\
 	x(EIO,				device_offline)				\
+	x(EIO,				EIO_fault_injected)			\
+	x(EIO,				data_read)				\
+	x(BCH_ERR_data_read,		data_read_retry)			\
+	x(BCH_ERR_data_read_retry,	data_read_retry_avoid)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_device_offline)		\
+	x(BCH_ERR_data_read_retry_avoid,data_read_io_err)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err)		\
+	x(BCH_ERR_data_read_retry_avoid,data_read_csum_err)			\
+	x(BCH_ERR_data_read_retry,	data_read_csum_err_maybe_userspace)	\
+	x(BCH_ERR_data_read,		data_read_decompress_err)		\
+	x(BCH_ERR_data_read,		data_read_decrypt_err)			\
+	x(BCH_ERR_data_read,		data_read_ptr_stale_race)		\
+	x(BCH_ERR_data_read_retry,	data_read_ptr_stale_retry)		\
+	x(BCH_ERR_data_read,		data_read_no_encryption_key)		\
+	x(BCH_ERR_data_read,		data_read_buffer_too_small)		\
+	x(BCH_ERR_data_read,		data_read_key_overwritten)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
libbcachefs/error.c

@@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
 {
 	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
 	struct bch_fs *c = ca->fs;
-	bool dev;

 	/* XXX: if it's reads or checksums that are failing, set it to failed */

 	down_write(&c->state_lock);
-	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
-				    BCH_FORCE_IF_DEGRADED);
-	if (dev
-	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
-				  BCH_FORCE_IF_DEGRADED)
-	    : bch2_fs_emergency_read_only(c))
+	unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+	if (write_errors_start &&
+	    time_after(jiffies,
+		       write_errors_start + c->opts.write_error_timeout * HZ)) {
+		if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+			goto out;
+
+		bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+						 BCH_FORCE_IF_DEGRADED);
 		bch_err(ca,
-			"too many IO errors, setting %s RO",
+			"writes erroring for %u seconds, setting %s ro",
+			c->opts.write_error_timeout,
 			dev ? "device" : "filesystem");
+		if (!dev)
+			bch2_fs_emergency_read_only(c);
+	}
+out:
 	up_write(&c->state_lock);
 }

 void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
 {
 	atomic64_inc(&ca->errors[type]);
-	//queue_work(system_long_wq, &ca->io_error_work);
+
+	if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+		ca->write_errors_start = jiffies;
+
+	queue_work(system_long_wq, &ca->io_error_work);
 }

 enum ask_yn {
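The reworked bch2_io_error_work() only degrades a device after writes have been failing continuously for write_error_timeout seconds: write_errors_start is stamped on the first failure and cleared by any successful write (see the error.h hunk below). A small runnable demo of the jiffies arithmetic; time_after() is modeled with signed subtraction so counter wraparound is handled, and HZ and the timeout are illustrative:

#include <stdio.h>
#include <stdbool.h>

#define HZ 100
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* wraparound-safe comparison */
}

int main(void)
{
	unsigned long timeout_sec = 30;
	unsigned long write_errors_start = 1000;	/* first failed write */
	unsigned long jiffies = write_errors_start + 31 * HZ;

	if (write_errors_start &&
	    time_after(jiffies, write_errors_start + timeout_sec * HZ))
		printf("writes erroring for %lu seconds, setting device ro\n",
		       timeout_sec);
	return 0;
}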
libbcachefs/error.h

@@ -216,27 +216,37 @@ void bch2_io_error_work(struct work_struct *);
 /* Does the error handling without logging a message */
 void bch2_io_error(struct bch_dev *, enum bch_member_error_type);

-#define bch2_dev_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_dev_ratelimited(ca, __VA_ARGS__);		\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif

-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_inum_offset_ratelimited(ca, __VA_ARGS__);	\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
+static inline void bch2_account_io_success_fail(struct bch_dev *ca,
+						enum bch_member_error_type type,
+						bool success)
+{
+	if (likely(success)) {
+		if (type == BCH_MEMBER_ERROR_write &&
+		    ca->write_errors_start)
+			ca->write_errors_start = 0;
+	} else {
+		bch2_io_error(ca, type);
+	}
+}
+
+static inline void bch2_account_io_completion(struct bch_dev *ca,
+					      enum bch_member_error_type type,
+					      u64 submit_time, bool success)
+{
+	if (unlikely(!ca))
+		return;
+
+	if (type != BCH_MEMBER_ERROR_checksum)
+		bch2_latency_acct(ca, submit_time, type);
+
+	bch2_account_io_success_fail(ca, type, success);
+}

 int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
libbcachefs/extents.c

@@ -28,6 +28,13 @@
 #include "trace.h"
 #include "util.h"

+static const char * const bch2_extent_flags_strs[] = {
+#define x(n, v)	[BCH_EXTENT_FLAG_##n] = #n,
+	BCH_EXTENT_FLAGS()
+#undef x
+	NULL,
+};
+
 static unsigned bch2_crc_field_size_max[] = {
 	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
 	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
 }

 void bch2_mark_io_failure(struct bch_io_failures *failed,
-			  struct extent_ptr_decoded *p)
+			  struct extent_ptr_decoded *p,
+			  bool csum_error)
 {
 	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);

@@ -59,25 +67,28 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
 		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));

 		f = &failed->devs[failed->nr++];
-		f->dev		= p->ptr.dev;
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else if (p->idx != f->idx) {
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else {
-		f->nr_failed++;
+		memset(f, 0, sizeof(*f));
+		f->dev = p->ptr.dev;
 	}
+
+	if (p->do_ec_reconstruct)
+		f->failed_ec = true;
+	else if (!csum_error)
+		f->failed_io = true;
+	else
+		f->failed_csum_nr++;
 }

-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+static inline u64 dev_latency(struct bch_dev *ca)
 {
-	struct bch_dev *ca = bch2_dev_rcu(c, dev);
 	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
 }

+static inline int dev_failed(struct bch_dev *ca)
+{
+	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+}
+
 /*
  * returns true if p1 is better than p2:
  */
@@ -85,9 +96,18 @@ static inline bool ptr_better(struct bch_fs *c,
 			      const struct extent_ptr_decoded p1,
 			      const struct extent_ptr_decoded p2)
 {
-	if (likely(!p1.idx && !p2.idx)) {
-		u64 l1 = dev_latency(c, p1.ptr.dev);
-		u64 l2 = dev_latency(c, p2.ptr.dev);
+	if (likely(!p1.do_ec_reconstruct &&
+		   !p2.do_ec_reconstruct)) {
+		struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
+		struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
+
+		int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+
+		if (failed_delta)
+			return failed_delta < 0;
+
+		u64 l1 = dev_latency(ca1);
+		u64 l2 = dev_latency(ca2);

 		/*
 		 * Square the latencies, to bias more in favor of the faster
@@ -103,9 +123,9 @@ static inline bool ptr_better(struct bch_fs *c,
 	}

 	if (bch2_force_reconstruct_read)
-		return p1.idx > p2.idx;
+		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;

-	return p1.idx < p2.idx;
+	return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
 }

 /*
@@ -114,19 +134,24 @@ static inline bool ptr_better(struct bch_fs *c,
  * other devices, it will still pick a pointer from avoid.
  */
 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-			      struct bch_io_failures *failed,
-			      struct extent_ptr_decoded *pick,
-			      int dev)
+			       struct bch_io_failures *failed,
+			       struct extent_ptr_decoded *pick,
+			       int dev)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
 	struct bch_dev_io_failures *f;
+	unsigned csum_retry = 0;
+	bool have_csum_retries = false;
 	int ret = 0;

 	if (k.k->type == KEY_TYPE_error)
 		return -BCH_ERR_key_type_error;

+	if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
+		return -BCH_ERR_extent_poisened;
+again:
 	rcu_read_lock();
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		/*
@@ -154,20 +179,28 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
 			continue;

-		f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
-		if (f)
-			p.idx = f->nr_failed < f->nr_retries
-				? f->idx
-				: f->idx + 1;
+		if (unlikely(failed) &&
+		    (f = bch2_dev_io_failures(failed, p.ptr.dev))) {
+			have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;

-		if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
-			p.idx++;
+			if (p.has_ec &&
+			    !f->failed_ec &&
+			    (f->failed_io || f->failed_csum_nr))
+				p.do_ec_reconstruct = true;
+			else if (f->failed_io ||
+				 f->failed_csum_nr > csum_retry)
+				continue;
+		}

-		if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
-			p.idx++;
+		if (!ca || !bch2_dev_is_online(ca)) {
+			if (p.has_ec)
+				p.do_ec_reconstruct = true;
+			else
+				continue;
+		}

-		if (p.idx > (unsigned) p.has_ec)
-			continue;
+		if (p.has_ec && bch2_force_reconstruct_read)
+			p.do_ec_reconstruct = true;

 		if (ret > 0 && !ptr_better(c, p, *pick))
 			continue;
@@ -177,6 +210,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 	}
 	rcu_read_unlock();

+	if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
+		     have_csum_retries &&
+		     csum_retry < BCH_MAX_CSUM_RETRIES)) {
+		csum_retry++;
+		goto again;
+	}
+
 	return ret;
 }
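With this change pick_read_device escalates rather than giving up: each device remembers whether it failed with an IO error or a checksum error, pointers whose csum has failed fewer than BCH_MAX_CSUM_RETRIES times remain retryable, and if nothing qualifies at the current level the whole scan repeats with csum_retry incremented. A toy, runnable model of that escalation loop; limits and data are illustrative:

#include <stdio.h>

#define MAX_CSUM_RETRIES 3

int main(void)
{
	unsigned failed_csum_nr[2] = { 2, 1 };	/* per-device csum failures */
	unsigned csum_retry = 0;
again:
	for (unsigned d = 0; d < 2; d++) {
		if (failed_csum_nr[d] > csum_retry)
			continue;	/* still distrusted at this level */
		printf("retrying read from device %u (level %u)\n", d, csum_retry);
		return 0;
	}
	if (++csum_retry < MAX_CSUM_RETRIES)
		goto again;	/* rescan with a higher tolerance */
	printf("no device to read from\n");
	return 0;
}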
@@ -1002,7 +1042,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,

 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);

-	return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+	return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
 }

 void bch2_extent_ptr_set_cached(struct bch_fs *c,
@@ -1225,6 +1265,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
 			break;

+		case BCH_EXTENT_ENTRY_flags:
+			prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
+			break;
+
 		default:
 			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
 			return;
@@ -1386,6 +1430,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
 #endif
 			break;
 		}
+		case BCH_EXTENT_ENTRY_flags:
+			bkey_fsck_err_on(entry != ptrs.start,
+					 c, extent_flags_not_at_start,
+					 "extent flags entry not at start");
+			break;
 		}
 	}
@@ -1452,6 +1501,28 @@ void bch2_ptr_swab(struct bkey_s k)
 	}
 }

+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
+{
+	int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
+	if (ret)
+		return ret;
+
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+	if (ptrs.start != ptrs.end &&
+	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
+		ptrs.start->flags.flags = flags;
+	} else {
+		struct bch_extent_flags f = {
+			.type	= BIT(BCH_EXTENT_ENTRY_flags),
+			.flags	= flags,
+		};
+		__extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
+	}
+
+	return 0;
+}
+
 /* Generic extent code: */

 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1497,8 +1568,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 				entry->crc128.offset += sub;
 				break;
 			case BCH_EXTENT_ENTRY_stripe_ptr:
-				break;
 			case BCH_EXTENT_ENTRY_rebalance:
+			case BCH_EXTENT_ENTRY_flags:
 				break;
 			}
libbcachefs/extents.h

@@ -320,8 +320,8 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 ({									\
 	__label__ out;							\
									\
-	(_ptr).idx	= 0;						\
 	(_ptr).has_ec	= false;					\
+	(_ptr).do_ec_reconstruct = false;				\
									\
 	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
 		switch (__extent_entry_type(_entry)) {			\
@@ -401,7 +401,7 @@ out:									\
 struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
						 unsigned);
 void bch2_mark_io_failure(struct bch_io_failures *,
-			  struct extent_ptr_decoded *);
+			  struct extent_ptr_decoded *, bool);
 int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
			       struct bch_io_failures *,
			       struct extent_ptr_decoded *, int);
@@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
		ptr1.unwritten	== ptr2.unwritten &&
		ptr1.offset	== ptr2.offset &&
		ptr1.dev	== ptr2.dev &&
-		ptr1.dev	== ptr2.dev);
+		ptr1.gen	== ptr2.gen);
 }

 void bch2_ptr_swab(struct bkey_s);
@@ -753,4 +753,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
 	k->size = new_size;
 }

+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
+{
+	if (ptrs.start != ptrs.end &&
+	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
+		return ptrs.start->flags.flags;
+	return 0;
+}
+
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
+{
+	return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
+}
+
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
+
 #endif /* _BCACHEFS_EXTENTS_H */
@@ -79,8 +79,9 @@
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
x(rebalance, 5)
#define BCH_EXTENT_ENTRY_MAX 6
x(rebalance, 5) \
x(flags, 6)
#define BCH_EXTENT_ENTRY_MAX 7

enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
#endif
};

#define BCH_EXTENT_FLAGS() \
x(poisoned, 0)

enum bch_extent_flags_e {
#define x(n, v) BCH_EXTENT_FLAG_##n = v,
BCH_EXTENT_FLAGS()
#undef x
};

struct bch_extent_flags {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:7,
flags:57;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 flags:57,
type:7;
#endif
};

/* bch_extent_rebalance: */
#include "rebalance_format.h"

|
||||
};
|
||||
|
||||
struct extent_ptr_decoded {
|
||||
unsigned idx;
|
||||
bool has_ec;
|
||||
unsigned do_ec_reconstruct;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bch_extent_ptr ptr;
|
||||
struct bch_extent_stripe_ptr ec;
|
||||
};
|
||||
|
||||
#define BCH_MAX_CSUM_RETRIES 3
|
||||
|
||||
struct bch_io_failures {
|
||||
u8 nr;
|
||||
struct bch_dev_io_failures {
|
||||
u8 dev;
|
||||
u8 idx;
|
||||
u8 nr_failed;
|
||||
u8 nr_retries;
|
||||
} devs[BCH_REPLICAS_MAX];
|
||||
unsigned failed_csum_nr:4,
|
||||
failed_io:1,
|
||||
failed_ec:1;
|
||||
} devs[BCH_REPLICAS_MAX + 1];
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
|
||||
|
@@ -268,16 +268,8 @@ int bch2_unlink_trans(struct btree_trans *trans,

dir_hash = bch2_hash_info_init(c, dir_u);

struct bkey_s_c dirent_k =
bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
&dir_hash, dir, name, BTREE_ITER_intent);
ret = bkey_err(dirent_k);
if (ret)
goto err;

ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(dirent_k), &inum);
if (ret > 0)
ret = -ENOENT;
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_intent);
if (ret)
goto err;

@@ -334,7 +326,6 @@ int bch2_unlink_trans(struct btree_trans *trans,

dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
dir_u->bi_size -= bkey_bytes(dirent_k.k);

ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,

@@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap,
ret = bch2_truncate_folio(inode, iattr->ia_size);
if (unlikely(ret < 0))
goto err;
ret = 0;

truncate_setsize(&inode->v, iattr->ia_size);

@@ -69,8 +69,9 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
if (ret < 0)
return ret;

if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding))
return -EOPNOTSUPP;
ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding);
if (ret)
return ret;

bch2_check_set_feature(c, BCH_FEATURE_casefolding);
#else
@@ -243,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
int ret = 0;
subvol_inum inum;

kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
if (!kname)
return -ENOMEM;

@@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)

bch2_opts_apply(&c->opts, opts);

ret = bch2_fs_start(c);
if (ret)
goto err_stop_fs;
/*
* need to initialise sb and set c->vfs_sb _before_ starting fs,
* for blk_holder_ops
*/

sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
@@ -2282,6 +2283,10 @@ got_sb:

sb->s_shrink->seeks = 0;

ret = bch2_fs_start(c);
if (ret)
goto err_put_super;

vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");

@@ -1978,31 +1978,10 @@ fsck_err:
return ret;
}

static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
int ret = 0;

darray_for_each(w->inodes, i)
if (fsck_err_on(i->inode.bi_size != i->i_size,
trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_size: got %llu, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) {
i->inode.bi_size = i->i_size;
ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
fsck_err:
bch_err_fn(c, ret);
return ret;
}

static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}

@@ -329,10 +329,17 @@ nopromote:
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_read_bio *rbio, struct bpos read_pos)
{
return lockrestart_do(trans,
int ret = lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { rbio->subvol, read_pos.inode },
read_pos.offset << 9));
if (ret)
return ret;

if (rbio->flags & BCH_READ_data_update)
prt_str(out, "(internal move) ");

return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@@ -341,10 +348,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3

enum rbio_context {
RBIO_CONTEXT_NULL,
RBIO_CONTEXT_HIGHPRI,
@@ -375,6 +378,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
BUG_ON(rbio->bounce && !rbio->split);

if (rbio->have_ioref) {
struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
percpu_ref_put(&ca->io_ref);
}

if (rbio->split) {
struct bch_read_bio *parent = rbio->parent;

@@ -408,13 +416,90 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}

static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct btree_iter *iter)
{
if (rbio->flags & BCH_READ_data_update) {
struct data_update *u = container_of(rbio, struct data_update, rbio);

return bch2_bkey_get_iter(trans, iter,
u->btree_id, bkey_start_pos(&u->k.k->k), 0);
} else {
struct bpos pos = rbio->read_pos;
int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
if (ret)
return bkey_s_c_err(ret);

return bch2_bkey_get_iter(trans, iter,
BTREE_ID_extents, pos, 0);
}
}

static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bch_io_failures *failed)
{
struct btree_iter iter = {};
struct bkey_s_c k;
int ret = lockrestart_do(trans,
bkey_err(k = get_rbio_extent(trans, rbio, &iter)));

if (!ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

bkey_for_each_ptr(ptrs, ptr)
if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
bch2_mark_io_failure(failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_csum_err);
}

bch2_trans_iter_exit(trans, &iter);
}

static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k, struct bch_io_failures *failed)
{
u64 flags = bch2_bkey_extent_flags(k);
if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return 0;

struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

/*
* Make sure we actually attempt to read and got checksum failures from
* every replica
*/

rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
continue;

struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();

struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
return PTR_ERR_OR_ZERO(new) ?:
bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
struct data_update *u = container_of(rbio, struct data_update, rbio);
struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);

@@ -429,7 +514,7 @@ retry:

if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
rbio->hole = true;
rbio->ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}

@@ -441,14 +526,19 @@ retry:
err:
bch2_trans_iter_exit(trans, &iter);

if (ret == READ_RETRY)
if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
goto retry;
if (ret)
rbio->bio.bi_status = BLK_STS_IOERR;

if (ret) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);

rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
}

BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
bch2_rbio_done(rbio);
bch2_trans_put(trans);
return ret;
}

static void bch2_rbio_retry(struct work_struct *work)
@@ -463,16 +553,22 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
struct btree_trans *trans = bch2_trans_get(c);

trace_io_read_retry(&rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
bvec_iter_sectors(rbio->bvec_iter));

if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
mark_io_failure_if_current_extent_matches(trans, rbio, &failed);

if (!rbio->split)
rbio->bio.bi_status = 0;
if (!rbio->split) {
rbio->bio.bi_status = 0;
rbio->ret = 0;
}

unsigned subvol = rbio->subvol;
struct bpos read_pos = rbio->read_pos;

rbio = bch2_rbio_free(rbio);

@@ -481,29 +577,55 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_last_fragment;
flags |= BCH_READ_must_clone;

if (flags & BCH_READ_data_update)
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
else
__bch2_read(c, rbio, iter, inum, &failed, flags);
int ret = flags & BCH_READ_data_update
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
: __bch2_read(trans, rbio, iter, inum, &failed, flags);

if (ret) {
rbio->ret = ret;
rbio->bio.bi_status = BLK_STS_IOERR;
} else {
struct printbuf buf = PRINTBUF;

lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf,
(subvol_inum) { subvol, read_pos.inode },
read_pos.offset << 9));
if (rbio->flags & BCH_READ_data_update)
prt_str(&buf, "(internal move) ");
prt_str(&buf, "successful retry");

bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}

bch2_rbio_done(rbio);
bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
blk_status_t error)
static void bch2_rbio_error(struct bch_read_bio *rbio,
int ret, blk_status_t blk_error)
{
rbio->retry = retry;
rbio->saw_error = true;
BUG_ON(ret >= 0);

rbio->ret = ret;
rbio->bio.bi_status = blk_error;

bch2_rbio_parent(rbio)->saw_error = true;

if (rbio->flags & BCH_READ_in_retry)
return;

if (retry == READ_ERR) {
rbio = bch2_rbio_free(rbio);

rbio->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
bch2_rbio_punt(rbio, bch2_rbio_retry,
RBIO_CONTEXT_UNBOUND, system_unbound_wq);
} else {
rbio = bch2_rbio_free(rbio);

rbio->ret = ret;
rbio->bio.bi_status = blk_error;

bch2_rbio_done(rbio);
}
}

@@ -519,15 +641,13 @@ static void bch2_read_io_err(struct work_struct *work)
bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

if (ca) {
bch2_io_error(ca, BCH_MEMBER_ERROR_read);
if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
} else {
else
bch_err_ratelimited(c, "%s", buf.buf);
}

printbuf_exit(&buf);
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -609,14 +729,12 @@ static void bch2_read_csum_err(struct work_struct *work)
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
if (ca) {
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
} else {
else
bch_err_ratelimited(c, "%s", buf.buf);
}

bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}

@@ -636,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);

bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}

@@ -656,16 +774,53 @@ static void bch2_read_decrypt_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);

bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");

static void corrupt_bio(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));

bio_for_each_segment(bv, bio, iter) {
unsigned u64s = bv.bv_len / sizeof(u64);

if (offset < u64s) {
u64 *segment = bvec_kmap_local(&bv);
segment[offset] = get_random_u64();
kunmap_local(segment);
return;
}
offset -= u64s;
}
}

static inline void maybe_corrupt_bio(struct bio *bio)
{
if (bch2_read_corrupt_ratio &&
!get_random_u32_below(bch2_read_corrupt_ratio))
corrupt_bio(bio);
}
#else
static inline void maybe_corrupt_bio(struct bio *bio)
{
}
#endif

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -686,8 +841,26 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}

maybe_corrupt_bio(src);

csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

/*
* Checksum error: if the bio wasn't bounced, we may have been
* reading into buffers owned by userspace (that userspace can
* scribble over) - retry the read, bouncing it this time:
*/
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
BLK_STS_IOERR);
goto out;
}

bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

if (!csum_good)
goto csum_err;

/*
@@ -760,17 +933,6 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
/*
* Checksum error: if the bio wasn't bounced, we may have been
* reading into buffers owned by userspace (that userspace can
* scribble over) - retry the read, bouncing it this time:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
goto out;
}

bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
@@ -790,10 +952,8 @@ static void bch2_read_endio(struct bio *bio)
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;

if (rbio->have_ioref) {
bch2_latency_acct(ca, rbio->submit_time, READ);
percpu_ref_put(&ca->io_ref);
}
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rbio->submit_time, !bio->bi_status);

if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
@@ -808,9 +968,9 @@ static void bch2_read_endio(struct bio *bio)
trace_and_count(c, io_read_reuse_race, &rbio->bio);

if (rbio->flags & BCH_READ_retry_if_stale)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
else
bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
return;
}

@@ -883,7 +1043,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_read_bio *rbio = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
int pick_ret;
int ret = 0;

if (bkey_extent_is_inline_data(k.k)) {
unsigned bytes = min_t(unsigned, iter.bi_size,
@@ -899,16 +1059,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto out_read_done;
}
retry_pick:
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

/* hole or reservation - just zero fill: */
if (!pick_ret)
if (!ret)
goto hole;

if (unlikely(pick_ret < 0)) {
if (unlikely(ret < 0)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);

bch_err_ratelimited(c, "%s", buf.buf);
@@ -924,6 +1084,7 @@ retry_pick:

bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = -BCH_ERR_data_read_no_encryption_key;
goto err;
}

@@ -940,7 +1101,7 @@ retry_pick:
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
bch2_mark_io_failure(failed, &pick, false);
percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@@ -984,10 +1145,10 @@ retry_pick:
*/
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
BUG();
if (ca)
percpu_ref_put(&ca->io_ref);
goto hole;
rbio->ret = -BCH_ERR_data_read_buffer_too_small;
goto out_read_done;
}

iter.bi_size = pick.crc.compressed_size << 9;
@@ -1067,8 +1228,7 @@ retry_pick:
rbio->flags = flags;
rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
rbio->hole = 0;
rbio->retry = 0;
rbio->ret = 0;
rbio->context = 0;
rbio->pick = pick;
rbio->subvol = orig->subvol;
@@ -1104,7 +1264,7 @@ retry_pick:
trace_and_count(c, io_read_split, &orig->bio);
}

if (!rbio->pick.idx) {
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1114,7 +1274,9 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);

bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
bch2_rbio_error(rbio,
-BCH_ERR_data_read_device_offline,
BLK_STS_IOERR);
goto out;
}

@@ -1140,7 +1302,8 @@ retry_pick:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
BLK_STS_IOERR);
goto out;
}

@@ -1156,25 +1319,22 @@ out:
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);

ret = rbio->retry;
ret = rbio->ret;
rbio = bch2_rbio_free(rbio);

if (ret == READ_RETRY_AVOID) {
bch2_mark_io_failure(failed, &pick);
ret = READ_RETRY;
}

if (!ret)
goto out_read_done;
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_csum_err);

return ret;
}

err:
if (flags & BCH_READ_in_retry)
return READ_ERR;
return ret;

orig->bio.bi_status = BLK_STS_IOERR;
orig->bio.bi_status = BLK_STS_IOERR;
orig->ret = ret;
goto out_read_done;

hole:
@@ -1186,20 +1346,21 @@ hole:
* to read no longer exists we have to signal that:
*/
if (flags & BCH_READ_data_update)
orig->hole = true;
orig->ret = -BCH_ERR_data_read_key_overwritten;

zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
if (flags & BCH_READ_last_fragment)
if ((flags & BCH_READ_last_fragment) &&
!(flags & BCH_READ_in_retry))
bch2_rbio_done(orig);
return 0;
}

void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
@@ -1232,6 +1393,23 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
if (ret)
goto err;

if (unlikely(flags & BCH_READ_in_retry)) {
struct data_update *u = flags & BCH_READ_data_update
? container_of(rbio, struct data_update, rbio)
: NULL;

if (u &&
!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}

if (!bkey_deleted(&sk.k->k) &&
!bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
failed->nr = 0;
}

s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
@@ -1271,28 +1449,32 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
ret != READ_RETRY &&
ret != READ_RETRY_AVOID)
!bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}

bch2_trans_iter_exit(trans, &iter);
if (unlikely(ret)) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);

if (ret) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
bvec_iter.bi_sector << 9));
prt_printf(&buf, "read error %i from btree lookup", ret);
prt_printf(&buf, "read error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);

rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;

if (!(flags & BCH_READ_in_retry))
bch2_rbio_done(rbio);
}

bch2_trans_put(trans);
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)

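The numeric READ_RETRY/READ_RETRY_AVOID/READ_ERR codes are gone: read failures are now ordinary bcachefs error codes, and callers classify them with bch2_err_matches() against a class such as BCH_ERR_data_read_retry. A hedged sketch of the caller-side test, condensed from bch2_rbio_error() above (the helper name is illustrative):

static bool read_error_is_transient(int ret)
{
	/* matches any member of the data_read_retry class, e.g.
	 * -BCH_ERR_data_read_io_err or -BCH_ERR_data_read_csum_err */
	return bch2_err_matches(ret, BCH_ERR_data_read_retry);
}
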
@@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H

#include "bkey_buf.h"
#include "btree_iter.h"
#include "reflink.h"

struct bch_read_bio {
@@ -40,13 +41,12 @@ struct bch_read_bio {
split:1,
have_ioref:1,
narrow_crcs:1,
hole:1,
saw_error:1,
retry:2,
context:2;
};
u16 _state;
};
s16 ret;

struct extent_ptr_decoded pick;

@@ -141,22 +141,21 @@ static inline void bch2_read_extent(struct btree_trans *trans,
data_btree, k, offset_into_extent, NULL, flags, -1);
}

void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
subvol_inum, struct bch_io_failures *, unsigned flags);
int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
subvol_inum, struct bch_io_failures *, unsigned flags);

static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
{
struct bch_io_failures failed = { .nr = 0 };

BUG_ON(rbio->_state);

rbio->subvol = inum.subvol;

__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped);
bch2_trans_run(c,
__bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped));
}

static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
@@ -166,6 +165,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,

rbio->c = orig->c;
rbio->_state = 0;
rbio->ret = 0;
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
@@ -182,6 +182,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
rbio->start_time = local_clock();
rbio->c = c;
rbio->_state = 0;
rbio->ret = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
return rbio;

@@ -716,11 +716,15 @@ static void bch2_write_endio(struct bio *bio)
? bch2_dev_have_ref(c, wbio->dev)
: NULL;

if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
wbio->submit_time, !bio->bi_status);

if (bio->bi_status) {
bch_err_inum_offset_ratelimited(ca,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
bch2_blk_status_to_str(bio->bi_status))) {
bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
op->flags |= BCH_WRITE_io_error;
}
@@ -732,10 +736,8 @@ static void bch2_write_endio(struct bio *bio)
set_bit(wbio->dev, op->devs_need_flush->d);
}

if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
}

if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);

@@ -11,12 +11,6 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void bch2_latency_acct(struct bch_dev *, u64, int);
#else
static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
#endif

void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);

@@ -1096,8 +1096,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,

/* allocate journal on a device: */

static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bool new_fs, struct closure *cl)
static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
bool new_fs, struct closure *cl)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@@ -1225,26 +1225,20 @@ err_free:
return ret;
}

/*
* Allocate more journal space at runtime - not currently making use of it, but
* the code works:
*/
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
unsigned nr, bool new_fs)
{
struct journal_device *ja = &ca->journal;
struct closure cl;
int ret = 0;

struct closure cl;
closure_init_stack(&cl);

down_write(&c->state_lock);

/* don't handle reducing nr of buckets yet: */
if (nr < ja->nr)
goto unlock;
return 0;

while (ja->nr < nr) {
while (!ret && ja->nr < nr) {
struct disk_reservation disk_res = { 0, 0, 0 };

/*
@@ -1257,25 +1251,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
* filesystem-wide allocation will succeed, this is a device
* specific allocation - we can hang here:
*/
if (!new_fs) {
ret = bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0);
if (ret)
break;
}

ret = bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0);
if (ret)
break;
ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);

ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
if (ret == -BCH_ERR_bucket_alloc_blocked ||
ret == -BCH_ERR_open_buckets_empty)
ret = 0; /* wait and retry */

bch2_disk_reservation_put(c, &disk_res);

closure_sync(&cl);

if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
break;
}

bch_err_fn(c, ret);
unlock:
return ret;
}

/*
* Allocate more journal space at runtime - not currently making use of it, but
* the code works:
*/
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
{
down_write(&c->state_lock);
int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
up_write(&c->state_lock);

bch_err_fn(c, ret);
return ret;
}

@@ -1301,7 +1308,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));

ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
err:
bch_err_fn(ca, ret);
return ret;

@@ -1041,13 +1041,19 @@ reread:
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);

u64 submit_time = local_clock();
ret = submit_bio_wait(bio);
kfree(bio);

if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
"journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal")) {
if (!ret && bch2_meta_read_fault("journal"))
ret = -BCH_ERR_EIO_fault_injected;

bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
submit_time, !ret);

if (ret) {
bch_err_dev_ratelimited(ca,
"journal read error: sector %llu", offset);
/*
* We don't error out of the recovery process
* here, since the relevant journal entry may be
@@ -1110,13 +1116,16 @@ reread:
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);

if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
"%s",
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
err.buf)))
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

if (!csum_good) {
bch_err_dev_ratelimited(ca, "%s",
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
err.buf));
saw_bad = true;
}

ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@@ -1655,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}

bool completed = false;
bool do_discards = false;

for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@@ -1667,7 +1677,6 @@ static CLOSURE_CALLBACK(journal_write_done)
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;

bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
@@ -1718,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);

if (do_discards)
bch2_do_discards(c);
}

static void journal_write_endio(struct bio *bio)
@@ -1727,13 +1739,16 @@ static void journal_write_endio(struct bio *bio)
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;

if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
jbio->submit_time, !bio->bi_status);

if (bio->bi_status) {
bch_err_dev_ratelimited(ca,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
unsigned long flags;
bch2_blk_status_to_str(bio->bi_status));

unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
@@ -1762,7 +1777,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
sectors);

struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
struct journal_bio *jbio = ja->bio[w->idx];
struct bio *bio = &jbio->bio;

jbio->submit_time = local_clock();

bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1794,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);

/*
* Wait for previous journal writes to complete; they won't necessarily
* be flushed if they're still in flight
*/
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {

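The journal read and write paths above show the pattern this commit applies across the tree: bch2_dev_io_err_on(), which mixed accounting and reporting in one macro, is split into bch2_account_io_completion() followed by explicit error handling, so success, failure, and latency are recorded on every completion. A minimal sketch of the resulting shape (illustrative only, not a function in the commit):

static void on_journal_read_complete(struct bch_dev *ca, u64 submit_time, int err)
{
	/* account success/failure and latency first ... */
	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !err);

	/* ... then report, without re-accounting */
	if (err)
		bch_err_dev_ratelimited(ca, "journal read error: %i", err);
}
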
@@ -175,6 +175,7 @@ typedef DARRAY(u64) darray_u64;
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
u64 submit_time;

struct bio bio;
};

@@ -125,8 +125,8 @@ static void move_write(struct moving_io *io)
&ctxt->stats->sectors_error_corrected);
}

if (unlikely(io->write.rbio.bio.bi_status ||
io->write.rbio.hole ||
if (unlikely(io->write.rbio.ret ||
io->write.rbio.bio.bi_status ||
io->write.data_opts.scrub)) {
move_free(io);
return;
@@ -816,7 +816,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (!bp.v->level)
ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
else if (!data_opts.scrub)
ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
else
ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);

@@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
struct move_bucket *b, u64 time)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a;
int ret;

if (bch2_bucket_is_open(trans->c,
b->k.bucket.inode,
b->k.bucket.offset))
if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
return 0;

k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
b->k.bucket, BTREE_ITER_cached);
ret = bkey_err(k);
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
b->k.bucket, BTREE_ITER_cached);
int ret = bkey_err(k);
if (ret)
return ret;

@@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (!ca)
goto out;

a = bch2_alloc_to_v4(k, &_a);
if (ca->mi.state != BCH_MEMBER_STATE_rw ||
!bch2_dev_is_online(ca))
goto out_put;

struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
b->sectors = bch2_bucket_sectors_dirty(*a);
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);

ret = lru_idx && lru_idx <= time;

out_put:
bch2_dev_put(ca);
out:
bch2_trans_iter_exit(trans, &iter);

@@ -145,6 +145,11 @@ enum fsck_err_opts {
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
x(write_error_timeout, u16, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, 300), \
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \

@@ -24,7 +24,7 @@
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(stripes_read, 1, 0) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
x(check_allocations, 5, PASS_FSCK) \

@@ -606,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c,
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
bool reflink_p_may_update_opts_field =
bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
!bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;

if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))

@@ -91,9 +91,6 @@
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(directory_size, \
BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
BCH_FSCK_ERR_directory_size_mismatch) \
x(cached_backpointers, \
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
BCH_FSCK_ERR_ptr_to_missing_backpointer) \

@@ -179,6 +179,7 @@ enum bch_fsck_flags {
x(ptr_crc_redundant, 160, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
x(extent_flags_not_at_start, 306, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
@@ -316,7 +317,7 @@ enum bch_fsck_flags {
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
x(MAX, 306, 0)
x(MAX, 307, 0)

enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

@@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
return ret;
}

static inline bool bch2_dev_is_readable(struct bch_dev *ca)
static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_failed;
@@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev

static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
{
might_sleep();

rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
if (ca && !percpu_ref_tryget(&ca->io_ref))

@@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock)
EXPORT_SYMBOL_GPL(six_lock_exit);

void __six_lock_init(struct six_lock *lock, const char *name,
struct lock_class_key *key, enum six_lock_init_flags flags)
struct lock_class_key *key, enum six_lock_init_flags flags,
gfp_t gfp)
{
atomic_set(&lock->state, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name,
* failure if they wish by checking lock->readers, but generally
* will not want to treat it as an error.
*/
lock->readers = alloc_percpu(unsigned);
lock->readers = alloc_percpu_gfp(unsigned, gfp);
}
#endif
}

@@ -164,18 +164,19 @@ enum six_lock_init_flags {
};

void __six_lock_init(struct six_lock *lock, const char *name,
struct lock_class_key *key, enum six_lock_init_flags flags);
struct lock_class_key *key, enum six_lock_init_flags flags,
gfp_t gfp);

/**
* six_lock_init - initialize a six lock
* @lock: lock to initialize
* @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
*/
#define six_lock_init(lock, flags) \
#define six_lock_init(lock, flags, gfp) \
do { \
static struct lock_class_key __key; \
\
__six_lock_init((lock), #lock, &__key, flags); \
__six_lock_init((lock), #lock, &__key, flags, gfp); \
} while (0)

/**

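six_lock_init() now takes an allocator mode for the optional per-cpu reader counts, so locks can be initialized from contexts where a plain GFP_KERNEL allocation would be unsafe. A sketch of a call site under that assumption (the function name is illustrative):

static void example_lock_init(struct six_lock *lock)
{
	/* per-cpu reader state, allocated without recursing into the fs: */
	six_lock_init(lock, SIX_LOCK_INIT_PCPU, GFP_NOFS);
}
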
@@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>

static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
};

struct bch2_metadata_version {
u16 version;
const char *name;
@@ -69,14 +66,22 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
return v;
}

void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
{
mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
version <= c->sb.version_incompat_allowed)
? 0
: -BCH_ERR_may_not_use_incompat_feature;

if (!ret) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}

return ret;
}

const char * const bch2_sb_fields[] = {
@@ -366,7 +371,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
int ret;

ret = bch2_sb_compatible(sb, out);
@@ -385,8 +389,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_features;
}

block_size = le16_to_cpu(sb->block_size);

if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
@@ -452,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,

if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);

if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
}

#ifdef __KERNEL__
@@ -743,7 +748,7 @@ retry:
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
sb->holder = kmalloc(1, GFP_KERNEL);
sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;

@@ -906,16 +911,16 @@ static void write_super_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;

bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);

/* XXX: return errors directly */

if (bch2_dev_io_err_on(bio->bi_status, ca,
bio_data_dir(bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"superblock %s error: %s",
if (bio->bi_status) {
bch_err_dev_ratelimited(ca, "superblock %s error: %s",
str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
bch2_blk_status_to_str(bio->bi_status));
ca->sb_write_error = 1;
}

closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
@@ -1154,7 +1159,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
ret = -1;
ret = -BCH_ERR_erofs_sb_err;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
@@ -1211,11 +1216,12 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);

c->disk_sb.sb->version = cpu_to_le16(new_version);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);

if (incompat)
if (incompat) {
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
}
}

static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,

@@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);

void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);

static inline bool bch2_request_incompat_feature(struct bch_fs *c,
enum bcachefs_metadata_version version)
static inline int bch2_request_incompat_feature(struct bch_fs *c,
enum bcachefs_metadata_version version)
{
if (unlikely(version > c->sb.version_incompat)) {
if (version > c->sb.version_incompat_allowed)
return false;
bch2_set_version_incompat(c, version);
}
return true;
return likely(version <= c->sb.version_incompat)
? 0
: bch2_set_version_incompat(c, version);
}

static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)

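bch2_request_incompat_feature() thus changes from returning a bool (usable or not) to returning 0 or an error such as -BCH_ERR_may_not_use_incompat_feature, which callers propagate directly. A minimal sketch of the updated calling convention (the function here is hypothetical):

static int write_key_with_extent_flags(struct bch_fs *c)
{
	int ret = bch2_request_incompat_feature(c,
			bcachefs_metadata_version_extent_flags);
	if (ret)
		return ret;	/* incompat version not allowed on this fs */

	/* safe to emit keys in the new format from this point on */
	return 0;
}
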
@@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c)
}

set_bit(BCH_FS_started, &c->flags);
wake_up(&c->ro_ref_wait);

if (c->opts.read_only) {
bch2_fs_read_only(c);
@@ -1431,6 +1432,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));

/*
* Stash pointer to the filesystem for blk_holder_ops - note that once
* attached to a filesystem, we will always close the block device
* before tearing down the filesystem object.
*/
ca->disk_sb.holder->c = ca->fs;

ca->dev = ca->disk_sb.bdev->bd_dev;

percpu_ref_reinit(&ca->io_ref);
@@ -2016,6 +2024,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}

/* blk_holder_ops: */

static struct bch_fs *bdev_get_fs(struct block_device *bdev)
__releases(&bdev->bd_holder_lock)
{
struct bch_sb_handle_holder *holder = bdev->bd_holder;
struct bch_fs *c = holder->c;

if (c && !bch2_ro_ref_tryget(c))
c = NULL;

mutex_unlock(&bdev->bd_holder_lock);

if (c)
wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
return c;
}

/* returns with ref on ca->ref */
static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
{
for_each_member_device(c, ca)
if (ca->disk_sb.bdev == bdev)
return ca;
return NULL;
}

static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
struct bch_fs *c = bdev_get_fs(bdev);
if (!c)
return;

struct super_block *sb = c->vfs_sb;
if (sb) {
/*
* Not necessary, c->ro_ref guards against the filesystem being
* unmounted - we only take this to avoid a warning in
* sync_filesystem:
*/
down_read(&sb->s_umount);
}

down_write(&c->state_lock);
struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
if (!ca)
goto unlock;

if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
__bch2_dev_offline(c, ca);
} else {
if (sb) {
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
evict_inodes(sb);
}

bch2_journal_flush(&c->journal);
bch2_fs_emergency_read_only(c);
}

bch2_dev_put(ca);
unlock:
if (sb)
up_read(&sb->s_umount);
up_write(&c->state_lock);
bch2_ro_ref_put(c);
}

static void bch2_fs_bdev_sync(struct block_device *bdev)
{
struct bch_fs *c = bdev_get_fs(bdev);
if (!c)
return;

struct super_block *sb = c->vfs_sb;
if (sb) {
/*
* Not necessary, c->ro_ref guards against the filesystem being
* unmounted - we only take this to avoid a warning in
* sync_filesystem:
*/
down_read(&sb->s_umount);
sync_filesystem(sb);
up_read(&sb->s_umount);
}

bch2_ro_ref_put(c);
}

const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
.mark_dead = bch2_fs_bdev_mark_dead,
.sync = bch2_fs_bdev_sync,
};

/* Filesystem open: */

static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)

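These ops take effect when member devices are opened: the holder pointer handed to the block layer is the bch_sb_handle_holder whose ->c field is filled in by __bch2_dev_attach_bdev() above. A sketch of the open call under that assumption (the real call site is in the superblock-open path; this wrapper is illustrative):

static struct file *example_open_member(const char *path,
					struct bch_sb_handle_holder *holder)
{
	/* the block layer will invoke bch2_sb_handle_bdev_ops on this bdev */
	return bdev_file_open_by_path(path, BLK_OPEN_READ|BLK_OPEN_WRITE,
				      holder, &bch2_sb_handle_bdev_ops);
}
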
@@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);

extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;

#endif /* _BCACHEFS_SUPER_H */

@@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H

struct bch_fs;

struct bch_sb_handle_holder {
struct bch_fs *c;
};

struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
void *holder;
struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;

@@ -174,7 +174,6 @@ read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_reserve_cache);
read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
read_attribute(nocow_lock_table);
@@ -355,9 +354,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_reserve_cache)
bch2_btree_reserve_cache_to_text(out, c);

if (attr == &sysfs_stripes_heap)
bch2_stripes_heap_to_text(out, c);

if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, NULL);

@@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_reserve_cache,
&sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
#ifdef BCH_WRITE_REF_DEBUG

@@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race,
TP_ARGS(bio)
);

/* ec.c */

TRACE_EVENT(stripe_create,
TP_PROTO(struct bch_fs *c, u64 idx, int ret),
TP_ARGS(c, idx, ret),

TP_STRUCT__entry(
__field(dev_t, dev )
__field(u64, idx )
__field(int, ret )
),

TP_fast_assign(
__entry->dev = c->dev;
__entry->idx = idx;
__entry->ret = ret;
),

TP_printk("%d,%d idx %llu ret %i",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->idx,
__entry->ret)
);

/* Journal */

DEFINE_EVENT(bch_fs, journal_full,

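The TRACE_EVENT above generates a trace_stripe_create() helper; the erasure-coding path would invoke it when a stripe create completes, roughly as sketched here (the wrapper is illustrative, not part of the commit):

static void note_stripe_create(struct bch_fs *c, u64 idx, int ret)
{
	trace_stripe_create(c, idx, ret);	/* emitted on stripe-create completion */
}
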
@@ -208,6 +208,8 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
bdev->bd_inode = &bdev->__bd_inode;

mutex_init(&bdev->bd_holder_lock);

struct file *file = calloc(sizeof(*file), 1);
file->f_inode = bdev->bd_inode;