Update bcachefs sources to 46af7258b951 bcachefs: BCH_SB_FEATURES_ALL includes BCH_FEATURE_incompat_version_field

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-03-10 14:22:35 -04:00
parent 86cbeaf1c2
commit 6cbadc946d
56 changed files with 1096 additions and 793 deletions

View File

@ -1 +1 @@
9736cbbc5cc39f6c666befdd787788b6ce6497f6
46af7258b951a79a66511172ab8772ad2dfaa4e3

View File

@ -10,6 +10,8 @@
#include <linux/types.h>
#include <linux/bvec.h>
#include <linux/kobject.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
struct bio_set;
struct bio;
@ -63,6 +65,8 @@ struct block_device {
struct gendisk * bd_disk;
struct gendisk __bd_disk;
int bd_fd;
struct mutex bd_holder_lock;
};
#define bdev_kobj(_bdev) (&((_bdev)->kobj))

View File

@ -65,7 +65,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev);
sector_t get_capacity(struct gendisk *disk);
struct blk_holder_ops {
void (*mark_dead)(struct block_device *bdev);
void (*mark_dead)(struct block_device *bdev, bool surprise);
void (*sync)(struct block_device *bdev);
int (*freeze)(struct block_device *bdev);
int (*thaw)(struct block_device *bdev);
};
static inline struct block_device *file_bdev(struct file *file)
@ -80,8 +83,12 @@ int lookup_bdev(const char *path, dev_t *);
struct super_block {
void *s_fs_info;
struct rw_semaphore s_umount;
};
static inline void evict_inodes(struct super_block *sb) {}
static inline int sync_filesystem(struct super_block *) { return 0; }
/*
* File types
*

View File

@ -9,6 +9,8 @@ struct dentry {
struct inode *d_inode;
};
static inline void shrink_dcache_sb(struct super_block *) {}
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n))

View File

@ -536,6 +536,7 @@ struct bch_dev {
*/
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
unsigned long write_errors_start;
__uuid_t uuid;
char name[BDEVNAME_SIZE];
@ -1002,15 +1003,11 @@ struct bch_fs {
wait_queue_head_t copygc_running_wq;
/* STRIPES: */
GENRADIX(struct stripe) stripes;
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
spinlock_t ec_stripes_new_lock;
ec_stripes_heap ec_stripes_heap;
struct mutex ec_stripes_heap_lock;
/* ERASURE CODING */
struct list_head ec_stripe_head_list;
struct mutex ec_stripe_head_lock;

View File

@ -690,7 +690,8 @@ struct bch_sb_field_ext {
x(cached_backpointers, BCH_VERSION(1, 21)) \
x(stripe_backpointers, BCH_VERSION(1, 22)) \
x(stripe_lru, BCH_VERSION(1, 23)) \
x(casefolding, BCH_VERSION(1, 24))
x(casefolding, BCH_VERSION(1, 24)) \
x(extent_flags, BCH_VERSION(1, 25))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -859,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@ -927,7 +929,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
BIT_ULL(BCH_FEATURE_new_siphash)| \
BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
BIT_ULL(BCH_FEATURE_new_varint)| \
BIT_ULL(BCH_FEATURE_journal_no_flush))
BIT_ULL(BCH_FEATURE_journal_no_flush)| \
BIT_ULL(BCH_FEATURE_incompat_version_field))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
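
For reference, the macro named in the commit title reads as follows after the change above; a sketch with the feature bits above the shown context elided, since only the final entry is new:

/*
 * Sketch (assumption): earlier feature bits elided; only the last
 * entry, BCH_FEATURE_incompat_version_field, is added by this commit.
 */
#define BCH_SB_FEATURES_ALL				\
	(/* ...earlier feature bits... */		\
	 BIT_ULL(BCH_FEATURE_new_siphash)|		\
	 BIT_ULL(BCH_FEATURE_btree_ptr_v2)|		\
	 BIT_ULL(BCH_FEATURE_new_varint)|		\
	 BIT_ULL(BCH_FEATURE_journal_no_flush)|		\
	 BIT_ULL(BCH_FEATURE_incompat_version_field))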

View File

@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return NULL;
}
bch2_btree_lock_init(&b->c, 0);
bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
__bch2_btree_node_to_freelist(bc, b);
return b;
@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
}
b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
if (!b) {
if (b) {
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
} else {
mutex_unlock(&bc->lock);
bch2_trans_unlock(trans);
b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
goto err;
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
mutex_lock(&bc->lock);
}
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));

View File

@ -1187,7 +1187,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
b->written += sectors;
b->written = min(b->written + sectors, btree_sectors(c));
if (blacklisted && !first)
continue;
@ -1329,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work)
bch_info(c, "retrying read");
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
rb->have_ioref = ca != NULL;
rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@ -1339,17 +1340,22 @@ static void btree_node_read_work(struct work_struct *work)
} else {
bio->bi_status = BLK_STS_REMOVED;
}
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rb->start_time, !bio->bi_status);
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (ca && bio->bi_status)
bch_err_dev_ratelimited(ca,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
bch2_mark_io_failure(&failed, &rb->pick);
bch2_mark_io_failure(&failed, &rb->pick, false);
can_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
@ -1401,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
struct bch_dev *ca = rb->have_ioref
? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
if (rb->have_ioref) {
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rb->start_time, !bio->bi_status);
queue_work(c->btree_read_complete_wq, &rb->work);
}
@ -2075,6 +2080,11 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
unsigned commit_flags =
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw;
u64 start_time = wbio->start_time;
int ret = 0;
@ -2083,38 +2093,24 @@ static void btree_node_write_work(struct work_struct *work)
wbio->wbio.used_mempool,
wbio->data);
bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
if (wbio->wbio.first_btree_write) {
if (wbio->wbio.failed.nr) {
}
} else {
if (wbio->wbio.failed.nr) {
ret = bch2_trans_do(c,
bch2_btree_node_rewrite_key_get_iter(trans, b,
commit_flags));
} else if (!wbio->wbio.first_btree_write) {
ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
commit_flags, true));
}
out:
if (ret) {
set_btree_node_noevict(b);
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
"writing btree node: %s", bch2_err_str(ret));
}
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
"writing btree node: %s", bch2_err_str(ret));
goto out;
}
static void btree_node_write_endio(struct bio *bio)
@ -2126,16 +2122,17 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
unsigned long flags;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
wbio->submit_time, !bio->bi_status);
if (!ca ||
bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
if (ca && bio->bi_status)
bch_err_dev_ratelimited(ca,
"btree write error: %s",
bch2_blk_status_to_str(bio->bi_status));
if (bio->bi_status) {
unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);

View File

@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
}
if (ck) {
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
ck->c.cached = true;
goto lock;
}

View File

@ -7,9 +7,10 @@
static struct lock_class_key bch2_btree_node_lock_key;
void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
enum six_lock_init_flags flags)
enum six_lock_init_flags flags,
gfp_t gfp)
{
__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
lockdep_set_notrack_class(&b->lock);
}

View File

@ -13,7 +13,7 @@
#include "btree_iter.h"
#include "six.h"
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
void bch2_trans_unlock_noassert(struct btree_trans *);
void bch2_trans_unlock_write(struct btree_trans *);

View File

@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, bn, PAGE_SIZE);
u64 submit_time = local_clock();
submit_bio_wait(bio);
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
"IO error in try_read_btree_node() at %llu: %s",
offset, bch2_blk_status_to_str(bio->bi_status)))
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
if (bio->bi_status) {
bch_err_dev_ratelimited(ca,
"IO error in try_read_btree_node() at %llu: %s",
offset, bch2_blk_status_to_str(bio->bi_status));
return;
}
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
percpu_ref_get(&ca->io_ref);
percpu_ref_put(&ca->io_ref);
closure_put(w->cl);
kfree(w);
return 0;
@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
struct task_struct *t;
if (!w) {
percpu_ref_put(&ca->io_ref);
ret = -ENOMEM;
goto err;
}
percpu_ref_get(&ca->io_ref);
closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;
t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
closure_put(&cl);
f->ret = ret;
bch_err(c, "error starting kthread: %i", ret);
kfree(w);
bch_err_msg(c, ret, "starting kthread");
break;
}
closure_get(&cl);
percpu_ref_get(&ca->io_ref);
wake_up_process(t);
}
err:
closure_sync(&cl);

View File

@ -2126,6 +2126,31 @@ err_free_update:
goto out;
}
static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
struct btree *b)
{
bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_intent);
int ret = bch2_btree_iter_traverse(iter);
if (ret)
goto err;
/* has node been freed? */
if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
ret = -BCH_ERR_btree_node_dying;
goto err;
}
BUG_ON(!btree_node_hashed(b));
return 0;
err:
bch2_trans_iter_exit(trans, iter);
return ret;
}
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
@ -2191,7 +2216,29 @@ err:
goto out;
}
int bch2_btree_node_rewrite_key(struct btree_trans *trans,
static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_i *k, unsigned flags)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter,
btree, k->k.p,
BTREE_MAX_DEPTH, level, 0);
struct btree *b = bch2_btree_iter_peek_node(&iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
ret = found
? bch2_btree_node_rewrite(trans, &iter, b, flags)
: -ENOENT;
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bpos pos, unsigned flags)
{
@ -2211,6 +2258,19 @@ err:
return ret;
}
int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
struct btree *b, unsigned flags)
{
struct btree_iter iter;
int ret = get_iter_to_node(trans, &iter, b);
if (ret)
return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct async_btree_rewrite {
struct bch_fs *c;
struct work_struct work;
@ -2220,57 +2280,14 @@ struct async_btree_rewrite {
struct bkey_buf key;
};
static int async_btree_node_rewrite_trans(struct btree_trans *trans,
struct async_btree_rewrite *a)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter,
a->btree_id, a->key.k->k.p,
BTREE_MAX_DEPTH, a->level, 0);
struct btree *b = bch2_btree_iter_peek_node(&iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
ret = found
? bch2_btree_node_rewrite(trans, &iter, b, 0)
: -ENOENT;
#if 0
/* Tracepoint... */
if (!ret || ret == -ENOENT) {
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
if (!ret) {
prt_printf(&buf, "rewrite node:\n ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
} else {
prt_printf(&buf, "node to rewrite not found:\n want: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
prt_printf(&buf, "\n got: ");
if (b)
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
else
prt_str(&buf, "(null)");
}
bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
}
#endif
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
a->btree_id, a->level, a->key.k, 0));
if (ret != -ENOENT)
bch_err_fn_ratelimited(c, ret);
@ -2514,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
unsigned commit_flags, bool skip_triggers)
{
struct btree_iter iter;
int ret;
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter);
int ret = get_iter_to_node(trans, &iter, b);
if (ret)
goto out;
/* has node been freed? */
if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
}
BUG_ON(!btree_node_hashed(b));
return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}

View File

@ -169,9 +169,12 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
struct btree *, unsigned);
int bch2_btree_node_rewrite_key(struct btree_trans *,
int bch2_btree_node_rewrite_pos(struct btree_trans *,
enum btree_id, unsigned,
struct bpos, unsigned);
int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
struct btree *, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,

View File

@ -573,7 +573,6 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
prt_newline(out);
}
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
@ -707,6 +706,18 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
return 0;
}
static bool can_write_extent(struct bch_fs *c,
struct bch_devs_list *devs_have,
unsigned target)
{
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
darray_for_each(*devs_have, i)
__clear_bit(*i, devs.d);
return !bch2_is_zero(&devs, sizeof(devs));
}
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
@ -788,6 +799,20 @@ int bch2_data_update_init(struct btree_trans *trans,
ptr_bit <<= 1;
}
if (!can_write_extent(c, &m->op.devs_have,
m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
/*
* Check if we have rw devices not in devs_have: this can happen
* if we're trying to move data on a ro or failed device
*
* If we can't move it, we need to clear the rebalance_work bit,
* if applicable
*
* Also, copygc should skip ro/failed devices:
*/
return -BCH_ERR_data_update_done_no_rw_devs;
}
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*

View File

@ -44,9 +44,9 @@ struct bch_dirent {
__u8 d_pad;
__le16 d_name_len;
__le16 d_cf_name_len;
__u8 d_names[0];
__u8 d_names[];
} d_cf_name_block __packed;
__u8 d_name[0];
__DECLARE_FLEX_ARRAY(__u8, d_name);
} __packed;
} __packed __aligned(8);

View File

@ -105,6 +105,7 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
u64 submit_time;
struct bio bio;
};
@ -494,38 +495,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
if (flags & BTREE_TRIGGER_atomic) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
bch2_bkey_val_to_text(&buf1, c, old);
bch2_bkey_val_to_text(&buf2, c, new);
bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
"old %s\n"
"new %s", idx, buf1.buf, buf2.buf);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
bch2_inconsistent_error(c);
return -1;
}
if (!new_s) {
bch2_stripes_heap_del(c, m, idx);
memset(m, 0, sizeof(*m));
} else {
stripe_to_mem(m, new_s);
if (!old_s)
bch2_stripes_heap_insert(c, m, idx);
else
bch2_stripes_heap_update(c, m, idx);
}
}
return 0;
}
@ -748,14 +717,15 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
if (bch2_dev_io_err_on(bio->bi_status, ca,
bio_data_dir(bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"erasure coding %s error: %s",
bch2_account_io_completion(ca, bio_data_dir(bio),
ec_bio->submit_time, !bio->bi_status);
if (bio->bi_status) {
bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
bch2_blk_status_to_str(bio->bi_status));
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
int stale = dev_ptr_stale(ca, ptr);
if (stale) {
@ -818,6 +788,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
@ -939,26 +910,6 @@ err:
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
ec_stripes_heap n, *h = &c->ec_stripes_heap;
if (idx >= h->size) {
if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
mutex_lock(&c->ec_stripes_heap_lock);
if (n.size > h->size) {
memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
n.nr = h->nr;
swap(*h, n);
}
mutex_unlock(&c->ec_stripes_heap_lock);
free_heap(&n);
}
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@ -1031,155 +982,26 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
s->idx = 0;
}
/* Heap of all existing stripes, ordered by blocks_nonempty */
static u64 stripe_idx_to_delete(struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
lockdep_assert_held(&c->ec_stripes_heap_lock);
if (h->nr &&
h->data[0].blocks_nonempty == 0 &&
!bch2_stripe_is_open(c, h->data[0].idx))
return h->data[0].idx;
return 0;
}
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
size_t i)
{
struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}
static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
{
struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
return ((_l->blocks_nonempty > _r->blocks_nonempty) <
(_l->blocks_nonempty < _r->blocks_nonempty));
}
static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
{
struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
ec_stripes_heap *_h = (ec_stripes_heap *)h;
size_t i = _l - _h->data;
size_t j = _r - _h->data;
swap(*_l, *_r);
ec_stripes_heap_set_backpointer(_h, i);
ec_stripes_heap_set_backpointer(_h, j);
}
static const struct min_heap_callbacks callbacks = {
.less = ec_stripes_heap_cmp,
.swp = ec_stripes_heap_swap,
};
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m = genradix_ptr(&c->stripes, idx);
BUG_ON(m->heap_idx >= h->nr);
BUG_ON(h->data[m->heap_idx].idx != idx);
}
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
mutex_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
mutex_lock(&c->ec_stripes_heap_lock);
BUG_ON(min_heap_full(&c->ec_stripes_heap));
genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
.idx = idx,
.blocks_nonempty = m->blocks_nonempty,
}),
&callbacks,
&c->ec_stripes_heap);
heap_verify_backpointer(c, idx);
mutex_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
bool do_deletes;
size_t i;
mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
i = m->heap_idx;
min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
heap_verify_backpointer(c, idx);
do_deletes = stripe_idx_to_delete(c) != 0;
mutex_unlock(&c->ec_stripes_heap_lock);
if (do_deletes)
bch2_do_stripe_deletes(c);
}
/* stripe deletion */
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_stripe s;
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
BTREE_ITER_intent);
ret = bkey_err(k);
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
BTREE_ID_stripes, POS(0, idx),
BTREE_ITER_intent);
int ret = bkey_err(k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_stripe) {
bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
ret = -EINVAL;
goto err;
}
s = bkey_s_c_to_stripe(k);
for (unsigned i = 0; i < s.v->nr_blocks; i++)
if (stripe_blockcount_get(s.v, i)) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
printbuf_exit(&buf);
ret = -EINVAL;
goto err;
}
ret = bch2_btree_delete_at(trans, &iter, 0);
/*
* We expect write buffer races here
* Important: check stripe_is_open with stripe key locked:
*/
if (k.k->type == KEY_TYPE_stripe &&
!bch2_stripe_is_open(trans->c, idx) &&
stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -1194,21 +1016,16 @@ static void ec_stripe_delete_work(struct work_struct *work)
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
while (1) {
mutex_lock(&c->ec_stripes_heap_lock);
u64 idx = stripe_idx_to_delete(c);
mutex_unlock(&c->ec_stripes_heap_lock);
if (!idx)
break;
int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
bch_err_fn(c, ret);
if (ret)
break;
}
bch2_trans_run(c,
bch2_btree_write_buffer_tryflush(trans) ?:
for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
0, lru_k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
ec_stripe_delete(trans, lru_k.k->p.offset);
})));
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
@ -1557,6 +1374,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ret)
goto err;
err:
trace_stripe_create(c, s->idx, ret);
bch2_disk_reservation_put(c, &s->res);
for (i = 0; i < v->nr_blocks; i++)
@ -1998,39 +1817,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
return 0;
}
static s64 get_existing_stripe(struct bch_fs *c,
struct ec_stripe_head *head)
static int __get_existing_stripe(struct btree_trans *trans,
struct ec_stripe_head *head,
struct ec_stripe_buf *stripe,
u64 idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
s64 ret = -1;
struct bch_fs *c = trans->c;
if (may_create_new_stripe(c))
return -1;
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
BTREE_ID_stripes, POS(0, idx), 0);
int ret = bkey_err(k);
if (ret)
goto err;
mutex_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
/* We expect write buffer races here */
if (k.k->type != KEY_TYPE_stripe)
goto out;
stripe_idx = h->data[heap_idx].idx;
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
if (stripe_lru_pos(s.v) <= 1)
goto out;
m = genradix_ptr(&c->stripes, stripe_idx);
if (m->disk_label == head->disk_label &&
m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
bch2_try_open_stripe(c, head->s, stripe_idx)) {
ret = stripe_idx;
break;
}
if (s.v->disk_label == head->disk_label &&
s.v->algorithm == head->algo &&
s.v->nr_redundant == head->redundancy &&
le16_to_cpu(s.v->sectors) == head->blocksize &&
bch2_try_open_stripe(c, head->s, idx)) {
bkey_reassemble(&stripe->key, k);
ret = 1;
}
mutex_unlock(&c->ec_stripes_heap_lock);
out:
bch2_set_btree_iter_dontneed(&iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@ -2082,24 +1902,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
s64 idx;
int ret;
/*
* If we can't allocate a new stripe, and there's no stripes with empty
* blocks for us to reuse, that means we have to wait on copygc:
*/
idx = get_existing_stripe(c, h);
if (idx < 0)
return -BCH_ERR_stripe_alloc_blocked;
if (may_create_new_stripe(c))
return -1;
ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
"reading stripe key: %s", bch2_err_str(ret));
if (ret) {
bch2_stripe_close(c, s);
return ret;
struct btree_iter lru_iter;
struct bkey_s_c lru_k;
int ret = 0;
for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
0, lru_k, ret) {
ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
if (ret)
break;
}
bch2_trans_iter_exit(trans, &lru_iter);
if (!ret)
ret = -BCH_ERR_stripe_alloc_blocked;
if (ret == 1)
ret = 0;
if (ret)
return ret;
return init_new_stripe_from_existing(c, s);
}
@ -2397,46 +2226,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
BTREE_ITER_prefetch, k, ({
if (k.k->type != KEY_TYPE_stripe)
continue;
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
if (ret)
break;
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
bch2_stripes_heap_insert(c, m, k.k->p.offset);
0;
})));
bch_err_fn(c, ret);
return ret;
}
void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m;
size_t i;
mutex_lock(&c->ec_stripes_heap_lock);
for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
m = genradix_ptr(&c->stripes, h->data[i].idx);
prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
h->data[i].blocks_nonempty,
m->nr_blocks - m->nr_redundant,
m->nr_redundant);
if (bch2_stripe_is_open(c, h->data[i].idx))
prt_str(out, " open");
prt_newline(out);
}
mutex_unlock(&c->ec_stripes_heap_lock);
return 0;
}
static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@ -2507,15 +2297,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
BUG_ON(!list_empty(&c->ec_stripe_new_list));
free_heap(&c->ec_stripes_heap);
genradix_free(&c->stripes);
bioset_exit(&c->ec_bioset);
}
void bch2_fs_ec_init_early(struct bch_fs *c)
{
spin_lock_init(&c->ec_stripes_new_lock);
mutex_init(&c->ec_stripes_heap_lock);
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);

View File

@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s)
if (!s)
return 0;
unsigned blocks_empty = 0, blocks_nonempty = 0;
unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
for (unsigned i = 0; i < s->nr_blocks; i++) {
blocks_empty += !stripe_blockcount_get(s, i);
blocks_nonempty += !!stripe_blockcount_get(s, i);
}
for (unsigned i = 0; i < nr_data; i++)
blocks_empty += !stripe_blockcount_get(s, i);
/* Will be picked up by the stripe_delete worker */
if (!blocks_nonempty)
if (blocks_empty == nr_data)
return STRIPE_LRU_POS_EMPTY;
if (!blocks_empty)
@ -260,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
unsigned, unsigned, unsigned,
enum bch_watermark, struct closure *);
void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
@ -300,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);

View File

@ -31,11 +31,4 @@ struct gc_stripe {
struct bch_replicas_padded r;
};
struct ec_stripe_heap_entry {
size_t idx;
unsigned blocks_nonempty;
};
typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
#endif /* _BCACHEFS_EC_TYPES_H */

View File

@ -119,6 +119,7 @@
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOENT, ENOENT_inode_no_backpointer) \
x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
x(ENOENT, btree_node_dying) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
@ -185,6 +186,7 @@
x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \
x(EINVAL, mismatched_block_size) \
@ -205,6 +207,7 @@
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
x(EINVAL, varint_decode_error) \
x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@ -269,12 +272,29 @@
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
x(EIO, extent_poisened) \
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
x(EIO, insufficient_journal_devices) \
x(EIO, device_offline) \
x(EIO, EIO_fault_injected) \
x(EIO, data_read) \
x(BCH_ERR_data_read, data_read_retry) \
x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
x(BCH_ERR_data_read_retry_avoid,data_read_device_offline) \
x(BCH_ERR_data_read_retry_avoid,data_read_io_err) \
x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err) \
x(BCH_ERR_data_read_retry_avoid,data_read_csum_err) \
x(BCH_ERR_data_read_retry, data_read_csum_err_maybe_userspace) \
x(BCH_ERR_data_read, data_read_decompress_err) \
x(BCH_ERR_data_read, data_read_decrypt_err) \
x(BCH_ERR_data_read, data_read_ptr_stale_race) \
x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
x(BCH_ERR_data_read, data_read_no_encryption_key) \
x(BCH_ERR_data_read, data_read_buffer_too_small) \
x(BCH_ERR_data_read, data_read_key_overwritten) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
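
The new read-path error codes form a tree: the first column of each x() entry names the parent class, and bch2_err_matches() accepts any ancestor. A minimal sketch of how callers in this commit consume the hierarchy (helper names here are hypothetical):

/*
 * Sketch (assumption): illustrative helpers only. Per the table above,
 * -BCH_ERR_data_read_io_err matches BCH_ERR_data_read_retry_avoid,
 * BCH_ERR_data_read_retry, BCH_ERR_data_read, and plain EIO.
 */
static bool read_error_wants_retry(int ret)
{
	return bch2_err_matches(ret, BCH_ERR_data_read_retry);
}

static bool read_error_wants_to_avoid_device(int ret)
{
	return bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid);
}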

View File

@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
bool dev;
/* XXX: if it's reads or checksums that are failing, set it to failed */
down_write(&c->state_lock);
dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED);
if (dev
? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED)
: bch2_fs_emergency_read_only(c))
unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
if (write_errors_start &&
time_after(jiffies,
write_errors_start + c->opts.write_error_timeout * HZ)) {
if (ca->mi.state >= BCH_MEMBER_STATE_ro)
goto out;
bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED);
bch_err(ca,
"too many IO errors, setting %s RO",
"writes erroring for %u seconds, setting %s ro",
c->opts.write_error_timeout,
dev ? "device" : "filesystem");
if (!dev)
bch2_fs_emergency_read_only(c);
}
out:
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
atomic64_inc(&ca->errors[type]);
//queue_work(system_long_wq, &ca->io_error_work);
if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
ca->write_errors_start = jiffies;
queue_work(system_long_wq, &ca->io_error_work);
}
enum ask_yn {

View File

@ -216,27 +216,37 @@ void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
#define bch2_dev_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_dev_ratelimited(ca, __VA_ARGS__); \
bch2_io_error(ca, _type); \
} \
_ret; \
})
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void bch2_latency_acct(struct bch_dev *, u64, int);
#else
static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
#endif
#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
bch2_io_error(ca, _type); \
} \
_ret; \
})
static inline void bch2_account_io_success_fail(struct bch_dev *ca,
enum bch_member_error_type type,
bool success)
{
if (likely(success)) {
if (type == BCH_MEMBER_ERROR_write &&
ca->write_errors_start)
ca->write_errors_start = 0;
} else {
bch2_io_error(ca, type);
}
}
static inline void bch2_account_io_completion(struct bch_dev *ca,
enum bch_member_error_type type,
u64 submit_time, bool success)
{
if (unlikely(!ca))
return;
if (type != BCH_MEMBER_ERROR_checksum)
bch2_latency_acct(ca, submit_time, type);
bch2_account_io_success_fail(ca, type, success);
}
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
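
The deleted bch2_dev_io_err_on()/bch2_dev_inum_io_err_on() macros combined logging with error accounting; callers now pair bch2_account_io_completion() with an explicit ratelimited message. A minimal sketch of the resulting completion-path shape, mirroring the call sites elsewhere in this commit (function and parameter names here are hypothetical):

/* Sketch (assumption): example only, not part of the diff. */
static void example_write_endio(struct bio *bio, struct bch_dev *ca,
				u64 submit_time)
{
	/* latency accounting, error counters and write_errors_start handling
	 * in one call; a NULL ca is tolerated by the helper: */
	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
				   submit_time, !bio->bi_status);

	/* message logging stays at the call site: */
	if (ca && bio->bi_status)
		bch_err_dev_ratelimited(ca, "write error: %s",
					bch2_blk_status_to_str(bio->bi_status));
}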

View File

@ -28,6 +28,13 @@
#include "trace.h"
#include "util.h"
static const char * const bch2_extent_flags_strs[] = {
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
BCH_EXTENT_FLAGS()
#undef x
NULL,
};
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
struct extent_ptr_decoded *p)
struct extent_ptr_decoded *p,
bool csum_error)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
@ -59,25 +67,28 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
f->dev = p->ptr.dev;
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else if (p->idx != f->idx) {
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else {
f->nr_failed++;
memset(f, 0, sizeof(*f));
f->dev = p->ptr.dev;
}
if (p->do_ec_reconstruct)
f->failed_ec = true;
else if (!csum_error)
f->failed_io = true;
else
f->failed_csum_nr++;
}
static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
static inline u64 dev_latency(struct bch_dev *ca)
{
struct bch_dev *ca = bch2_dev_rcu(c, dev);
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
static inline int dev_failed(struct bch_dev *ca)
{
return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
/*
* returns true if p1 is better than p2:
*/
@ -85,9 +96,18 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
{
if (likely(!p1.idx && !p2.idx)) {
u64 l1 = dev_latency(c, p1.ptr.dev);
u64 l2 = dev_latency(c, p2.ptr.dev);
if (likely(!p1.do_ec_reconstruct &&
!p2.do_ec_reconstruct)) {
struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
if (failed_delta)
return failed_delta < 0;
u64 l1 = dev_latency(ca1);
u64 l2 = dev_latency(ca2);
/*
* Square the latencies, to bias more in favor of the faster
@ -103,9 +123,9 @@ static inline bool ptr_better(struct bch_fs *c,
}
if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
return p1.idx < p2.idx;
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
}
/*
@ -114,19 +134,24 @@ static inline bool ptr_better(struct bch_fs *c,
* other devices, it will still pick a pointer from avoid.
*/
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick,
int dev)
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick,
int dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
unsigned csum_retry = 0;
bool have_csum_retries = false;
int ret = 0;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
return -BCH_ERR_extent_poisened;
again:
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
@ -154,20 +179,28 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
if (unlikely(failed) &&
(f = bch2_dev_io_failures(failed, p.ptr.dev))) {
have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
p.idx++;
if (p.has_ec &&
!f->failed_ec &&
(f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
f->failed_csum_nr > csum_retry)
continue;
}
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
if (!ca || !bch2_dev_is_online(ca)) {
if (p.has_ec)
p.do_ec_reconstruct = true;
else
continue;
}
if (p.idx > (unsigned) p.has_ec)
continue;
if (p.has_ec && bch2_force_reconstruct_read)
p.do_ec_reconstruct = true;
if (ret > 0 && !ptr_better(c, p, *pick))
continue;
@ -177,6 +210,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
}
rcu_read_unlock();
if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
have_csum_retries &&
csum_retry < BCH_MAX_CSUM_RETRIES)) {
csum_retry++;
goto again;
}
return ret;
}
@ -1002,7 +1042,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
void bch2_extent_ptr_set_cached(struct bch_fs *c,
@ -1225,6 +1265,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
break;
case BCH_EXTENT_ENTRY_flags:
prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
break;
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@ -1386,6 +1430,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
#endif
break;
}
case BCH_EXTENT_ENTRY_flags:
bkey_fsck_err_on(entry != ptrs.start,
c, extent_flags_not_at_start,
"extent flags entry not at start");
break;
}
}
@ -1452,6 +1501,28 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
{
int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
if (ret)
return ret;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
if (ptrs.start != ptrs.end &&
extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
ptrs.start->flags.flags = flags;
} else {
struct bch_extent_flags f = {
.type = BIT(BCH_EXTENT_ENTRY_flags),
.flags = flags,
};
__extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
}
return 0;
}
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@ -1497,8 +1568,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
entry->crc128.offset += sub;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
case BCH_EXTENT_ENTRY_rebalance:
case BCH_EXTENT_ENTRY_flags:
break;
}

View File

@ -320,8 +320,8 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
(_ptr).idx = 0; \
(_ptr).has_ec = false; \
(_ptr).do_ec_reconstruct = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
@ -401,7 +401,7 @@ out: \
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *);
struct extent_ptr_decoded *, bool);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *, int);
@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
ptr1.unwritten == ptr2.unwritten &&
ptr1.offset == ptr2.offset &&
ptr1.dev == ptr2.dev &&
ptr1.dev == ptr2.dev);
ptr1.gen == ptr2.gen);
}
void bch2_ptr_swab(struct bkey_s);
@ -753,4 +753,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
{
if (ptrs.start != ptrs.end &&
extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
return ptrs.start->flags.flags;
return 0;
}
static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
{
return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
}
int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
#endif /* _BCACHEFS_EXTENTS_H */
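
One consumer of the new flags entry is the poisoning path added in io_read.c further down: once every replica has exhausted its checksum retries, the extent is rewritten with the poisoned bit set and later reads fail fast with -BCH_ERR_extent_poisened. A one-line sketch of the check (helper name is hypothetical; the real read path does this inline in bch2_bkey_pick_read_device()):

/* Sketch (assumption): illustrative helper only. */
static bool extent_is_poisoned(struct bkey_s_c k)
{
	return bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned);
}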

View File

@ -79,8 +79,9 @@
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
x(rebalance, 5)
#define BCH_EXTENT_ENTRY_MAX 6
x(rebalance, 5) \
x(flags, 6)
#define BCH_EXTENT_ENTRY_MAX 7
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
#endif
};
#define BCH_EXTENT_FLAGS() \
x(poisoned, 0)
enum bch_extent_flags_e {
#define x(n, v) BCH_EXTENT_FLAG_##n = v,
BCH_EXTENT_FLAGS()
#undef x
};
struct bch_extent_flags {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:7,
flags:57;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 flags:57,
type:7;
#endif
};
/* bch_extent_rebalance: */
#include "rebalance_format.h"

View File

@ -20,21 +20,23 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
unsigned idx;
bool has_ec;
unsigned do_ec_reconstruct;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
};
#define BCH_MAX_CSUM_RETRIES 3
struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
u8 idx;
u8 nr_failed;
u8 nr_retries;
} devs[BCH_REPLICAS_MAX];
unsigned failed_csum_nr:4,
failed_io:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
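
bch_dev_io_failures now records what failed on each device (hard IO error, bounded checksum failures, failed EC reconstruct) rather than an index and retry count. A condensed sketch of how bch2_bkey_pick_read_device() treats one replica, restating the inline logic shown in extents.c above (helper name is hypothetical):

/* Sketch (assumption): simplified; the real code also decides whether to
 * fall back to erasure-coding reconstruction. */
static bool replica_still_usable(const struct bch_dev_io_failures *f,
				 unsigned csum_retry)
{
	if (!f)
		return true;			/* no failure recorded */
	if (f->failed_io)
		return false;			/* hard IO error: skip this device */
	return f->failed_csum_nr <= csum_retry;	/* allow bounded csum retries */
}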

View File

@ -268,16 +268,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
struct bkey_s_c dirent_k =
bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
&dir_hash, dir, name, BTREE_ITER_intent);
ret = bkey_err(dirent_k);
if (ret)
goto err;
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(dirent_k), &inum);
if (ret > 0)
ret = -ENOENT;
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_intent);
if (ret)
goto err;
@ -334,7 +326,6 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
dir_u->bi_size -= bkey_bytes(dirent_k.k);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,

View File

@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap,
ret = bch2_truncate_folio(inode, iattr->ia_size);
if (unlikely(ret < 0))
goto err;
ret = 0;
truncate_setsize(&inode->v, iattr->ia_size);

View File

@ -69,8 +69,9 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
if (ret < 0)
return ret;
if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding))
return -EOPNOTSUPP;
ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding);
if (ret)
return ret;
bch2_check_set_feature(c, BCH_FEATURE_casefolding);
#else
@ -243,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
int ret = 0;
subvol_inum inum;
kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
if (!kname)
return -ENOMEM;

View File

@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
bch2_opts_apply(&c->opts, opts);
ret = bch2_fs_start(c);
if (ret)
goto err_stop_fs;
/*
* need to initialise sb and set c->vfs_sb _before_ starting fs,
* for blk_holder_ops
*/
sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
@ -2282,6 +2283,10 @@ got_sb:
sb->s_shrink->seeks = 0;
ret = bch2_fs_start(c);
if (ret)
goto err_put_super;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");

View File

@ -1978,31 +1978,10 @@ fsck_err:
return ret;
}
static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
int ret = 0;
darray_for_each(w->inodes, i)
if (fsck_err_on(i->inode.bi_size != i->i_size,
trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_size: got %llu, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) {
i->inode.bi_size = i->i_size;
ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
fsck_err:
bch_err_fn(c, ret);
return ret;
}
static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}

View File

@ -329,10 +329,17 @@ nopromote:
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_read_bio *rbio, struct bpos read_pos)
{
return lockrestart_do(trans,
int ret = lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { rbio->subvol, read_pos.inode },
read_pos.offset << 9));
if (ret)
return ret;
if (rbio->flags & BCH_READ_data_update)
prt_str(out, "(internal move) ");
return 0;
}
static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@ -341,10 +348,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
enum rbio_context {
RBIO_CONTEXT_NULL,
RBIO_CONTEXT_HIGHPRI,
@ -375,6 +378,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
BUG_ON(rbio->bounce && !rbio->split);
if (rbio->have_ioref) {
struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
percpu_ref_put(&ca->io_ref);
}
if (rbio->split) {
struct bch_read_bio *parent = rbio->parent;
@ -408,13 +416,90 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct btree_iter *iter)
{
if (rbio->flags & BCH_READ_data_update) {
struct data_update *u = container_of(rbio, struct data_update, rbio);
return bch2_bkey_get_iter(trans, iter,
u->btree_id, bkey_start_pos(&u->k.k->k), 0);
} else {
struct bpos pos = rbio->read_pos;
int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
if (ret)
return bkey_s_c_err(ret);
return bch2_bkey_get_iter(trans, iter,
BTREE_ID_extents, pos, 0);
}
}
static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bch_io_failures *failed)
{
struct btree_iter iter = {};
struct bkey_s_c k;
int ret = lockrestart_do(trans,
bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
if (!ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr)
if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
bch2_mark_io_failure(failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_csum_err);
}
bch2_trans_iter_exit(trans, &iter);
}
static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k, struct bch_io_failures *failed)
{
u64 flags = bch2_bkey_extent_flags(k);
if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return 0;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
/*
* Make sure we actually attempt to read and got checksum failures from
* every replica
*/
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
continue;
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
return PTR_ERR_OR_ZERO(new) ?:
bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
}
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
struct data_update *u = container_of(rbio, struct data_update, rbio);
struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
@ -429,7 +514,7 @@ retry:
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
rbio->hole = true;
rbio->ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}
@ -441,14 +526,19 @@ retry:
err:
bch2_trans_iter_exit(trans, &iter);
if (ret == READ_RETRY)
if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
goto retry;
if (ret)
rbio->bio.bi_status = BLK_STS_IOERR;
if (ret) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
}
BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
bch2_rbio_done(rbio);
bch2_trans_put(trans);
return ret;
}
static void bch2_rbio_retry(struct work_struct *work)
@ -463,16 +553,22 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
struct btree_trans *trans = bch2_trans_get(c);
trace_io_read_retry(&rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
bvec_iter_sectors(rbio->bvec_iter));
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
if (!rbio->split)
rbio->bio.bi_status = 0;
if (!rbio->split) {
rbio->bio.bi_status = 0;
rbio->ret = 0;
}
unsigned subvol = rbio->subvol;
struct bpos read_pos = rbio->read_pos;
rbio = bch2_rbio_free(rbio);
@ -481,29 +577,55 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_last_fragment;
flags |= BCH_READ_must_clone;
if (flags & BCH_READ_data_update)
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
else
__bch2_read(c, rbio, iter, inum, &failed, flags);
int ret = flags & BCH_READ_data_update
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
: __bch2_read(trans, rbio, iter, inum, &failed, flags);
if (ret) {
rbio->ret = ret;
rbio->bio.bi_status = BLK_STS_IOERR;
} else {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf,
(subvol_inum) { subvol, read_pos.inode },
read_pos.offset << 9));
if (rbio->flags & BCH_READ_data_update)
prt_str(&buf, "(internal move) ");
prt_str(&buf, "successful retry");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
bch2_rbio_done(rbio);
bch2_trans_put(trans);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
blk_status_t error)
static void bch2_rbio_error(struct bch_read_bio *rbio,
int ret, blk_status_t blk_error)
{
rbio->retry = retry;
rbio->saw_error = true;
BUG_ON(ret >= 0);
rbio->ret = ret;
rbio->bio.bi_status = blk_error;
bch2_rbio_parent(rbio)->saw_error = true;
if (rbio->flags & BCH_READ_in_retry)
return;
if (retry == READ_ERR) {
rbio = bch2_rbio_free(rbio);
rbio->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
bch2_rbio_punt(rbio, bch2_rbio_retry,
RBIO_CONTEXT_UNBOUND, system_unbound_wq);
} else {
rbio = bch2_rbio_free(rbio);
rbio->ret = ret;
rbio->bio.bi_status = blk_error;
bch2_rbio_done(rbio);
}
}
@ -519,15 +641,13 @@ static void bch2_read_io_err(struct work_struct *work)
bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
if (ca) {
bch2_io_error(ca, BCH_MEMBER_ERROR_read);
if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
} else {
else
bch_err_ratelimited(c, "%s", buf.buf);
}
printbuf_exit(&buf);
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@ -609,14 +729,12 @@ static void bch2_read_csum_err(struct work_struct *work)
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
if (ca) {
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
} else {
else
bch_err_ratelimited(c, "%s", buf.buf);
}
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@ -636,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@ -656,16 +774,53 @@ static void bch2_read_decrypt_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
static void corrupt_bio(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
bio_for_each_segment(bv, bio, iter) {
unsigned u64s = bv.bv_len / sizeof(u64);
if (offset < u64s) {
u64 *segment = bvec_kmap_local(&bv);
segment[offset] = get_random_u64();
kunmap_local(segment);
return;
}
offset -= u64s;
}
}
static inline void maybe_corrupt_bio(struct bio *bio)
{
if (bch2_read_corrupt_ratio &&
!get_random_u32_below(bch2_read_corrupt_ratio))
corrupt_bio(bio);
}
#else
static inline void maybe_corrupt_bio(struct bio *bio)
{
}
#endif
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@ -686,8 +841,26 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}
maybe_corrupt_bio(src);
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
/*
* Checksum error: if the bio wasn't bounced, we may have been
* reading into buffers owned by userspace (that userspace can
* scribble over) - retry the read, bouncing it this time:
*/
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
BLK_STS_IOERR);
goto out;
}
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
if (!csum_good)
goto csum_err;
/*
@ -760,17 +933,6 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
/*
* Checksum error: if the bio wasn't bounced, we may have been
* reading into buffers owned by userspace (that userspace can
* scribble over) - retry the read, bouncing it this time:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
goto out;
}
bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
@ -790,10 +952,8 @@ static void bch2_read_endio(struct bio *bio)
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
if (rbio->have_ioref) {
bch2_latency_acct(ca, rbio->submit_time, READ);
percpu_ref_put(&ca->io_ref);
}
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rbio->submit_time, !bio->bi_status);
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
@ -808,9 +968,9 @@ static void bch2_read_endio(struct bio *bio)
trace_and_count(c, io_read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_retry_if_stale)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
else
bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
return;
}
@ -883,7 +1043,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_read_bio *rbio = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
int pick_ret;
int ret = 0;
if (bkey_extent_is_inline_data(k.k)) {
unsigned bytes = min_t(unsigned, iter.bi_size,
@ -899,16 +1059,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto out_read_done;
}
retry_pick:
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
/* hole or reservation - just zero fill: */
if (!pick_ret)
if (!ret)
goto hole;
if (unlikely(pick_ret < 0)) {
if (unlikely(ret < 0)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
@ -924,6 +1084,7 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = -BCH_ERR_data_read_no_encryption_key;
goto err;
}
@ -940,7 +1101,7 @@ retry_pick:
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
bch2_mark_io_failure(failed, &pick, false);
percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@ -984,10 +1145,10 @@ retry_pick:
*/
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
BUG();
if (ca)
percpu_ref_put(&ca->io_ref);
goto hole;
rbio->ret = -BCH_ERR_data_read_buffer_too_small;
goto out_read_done;
}
iter.bi_size = pick.crc.compressed_size << 9;
@ -1067,8 +1228,7 @@ retry_pick:
rbio->flags = flags;
rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
rbio->hole = 0;
rbio->retry = 0;
rbio->ret = 0;
rbio->context = 0;
rbio->pick = pick;
rbio->subvol = orig->subvol;
@ -1104,7 +1264,7 @@ retry_pick:
trace_and_count(c, io_read_split, &orig->bio);
}
if (!rbio->pick.idx) {
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@ -1114,7 +1274,9 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
bch2_rbio_error(rbio,
-BCH_ERR_data_read_device_offline,
BLK_STS_IOERR);
goto out;
}
@ -1140,7 +1302,8 @@ retry_pick:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
BLK_STS_IOERR);
goto out;
}
@ -1156,25 +1319,22 @@ out:
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
ret = rbio->ret;
rbio = bch2_rbio_free(rbio);
if (ret == READ_RETRY_AVOID) {
bch2_mark_io_failure(failed, &pick);
ret = READ_RETRY;
}
if (!ret)
goto out_read_done;
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_csum_err);
return ret;
}
err:
if (flags & BCH_READ_in_retry)
return READ_ERR;
return ret;
orig->bio.bi_status = BLK_STS_IOERR;
orig->bio.bi_status = BLK_STS_IOERR;
orig->ret = ret;
goto out_read_done;
hole:
@ -1186,20 +1346,21 @@ hole:
* to read no longer exists we have to signal that:
*/
if (flags & BCH_READ_data_update)
orig->hole = true;
orig->ret = -BCH_ERR_data_read_key_overwritten;
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
if (flags & BCH_READ_last_fragment)
if ((flags & BCH_READ_last_fragment) &&
!(flags & BCH_READ_in_retry))
bch2_rbio_done(orig);
return 0;
}
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
@ -1232,6 +1393,23 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
if (ret)
goto err;
if (unlikely(flags & BCH_READ_in_retry)) {
struct data_update *u = flags & BCH_READ_data_update
? container_of(rbio, struct data_update, rbio)
: NULL;
if (u &&
!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
ret = -BCH_ERR_data_read_key_overwritten;
goto err;
}
if (!bkey_deleted(&sk.k->k) &&
!bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
failed->nr = 0;
}
s64 offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
@ -1271,28 +1449,32 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
ret != READ_RETRY &&
ret != READ_RETRY_AVOID)
!bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}
bch2_trans_iter_exit(trans, &iter);
if (unlikely(ret)) {
if (ret == -BCH_ERR_no_device_to_read_from && failed)
maybe_poison_extent(trans, &iter, k, failed);
if (ret) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
bvec_iter.bi_sector << 9));
prt_printf(&buf, "read error %i from btree lookup", ret);
prt_printf(&buf, "read error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
if (!(flags & BCH_READ_in_retry))
bch2_rbio_done(rbio);
}
bch2_trans_put(trans);
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
void bch2_fs_io_read_exit(struct bch_fs *c)


@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H
#include "bkey_buf.h"
#include "btree_iter.h"
#include "reflink.h"
struct bch_read_bio {
@ -40,13 +41,12 @@ struct bch_read_bio {
split:1,
have_ioref:1,
narrow_crcs:1,
hole:1,
saw_error:1,
retry:2,
context:2;
};
u16 _state;
};
s16 ret;
struct extent_ptr_decoded pick;
@ -141,22 +141,21 @@ static inline void bch2_read_extent(struct btree_trans *trans,
data_btree, k, offset_into_extent, NULL, flags, -1);
}
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
subvol_inum, struct bch_io_failures *, unsigned flags);
int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
{
struct bch_io_failures failed = { .nr = 0 };
BUG_ON(rbio->_state);
rbio->subvol = inum.subvol;
__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped);
bch2_trans_run(c,
__bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped));
}
static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
@ -166,6 +165,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
rbio->c = orig->c;
rbio->_state = 0;
rbio->ret = 0;
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
@ -182,6 +182,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
rbio->start_time = local_clock();
rbio->c = c;
rbio->_state = 0;
rbio->ret = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
return rbio;


@ -716,11 +716,15 @@ static void bch2_write_endio(struct bio *bio)
? bch2_dev_have_ref(c, wbio->dev)
: NULL;
if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
wbio->submit_time, !bio->bi_status);
if (bio->bi_status) {
bch_err_inum_offset_ratelimited(ca,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
bch2_blk_status_to_str(bio->bi_status))) {
bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
op->flags |= BCH_WRITE_io_error;
}
@ -732,10 +736,8 @@ static void bch2_write_endio(struct bio *bio)
set_bit(wbio->dev, op->devs_need_flush->d);
}
if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
}
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);


@ -11,12 +11,6 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void bch2_latency_acct(struct bch_dev *, u64, int);
#else
static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
#endif
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);


@ -1096,8 +1096,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bool new_fs, struct closure *cl)
static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
bool new_fs, struct closure *cl)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@ -1225,26 +1225,20 @@ err_free:
return ret;
}
/*
* Allocate more journal space at runtime - not currently making use of it, but
* the code works:
*/
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
unsigned nr, bool new_fs)
{
struct journal_device *ja = &ca->journal;
struct closure cl;
int ret = 0;
struct closure cl;
closure_init_stack(&cl);
down_write(&c->state_lock);
/* don't handle reducing nr of buckets yet: */
if (nr < ja->nr)
goto unlock;
return 0;
while (ja->nr < nr) {
while (!ret && ja->nr < nr) {
struct disk_reservation disk_res = { 0, 0, 0 };
/*
@ -1257,25 +1251,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
* filesystem-wide allocation will succeed, this is a device
* specific allocation - we can hang here:
*/
if (!new_fs) {
ret = bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0);
if (ret)
break;
}
ret = bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0);
if (ret)
break;
ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
if (ret == -BCH_ERR_bucket_alloc_blocked ||
ret == -BCH_ERR_open_buckets_empty)
ret = 0; /* wait and retry */
bch2_disk_reservation_put(c, &disk_res);
closure_sync(&cl);
if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
break;
}
bch_err_fn(c, ret);
unlock:
return ret;
}
/*
* Allocate more journal space at runtime - not currently making use of it, but
* the code works:
*/
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
{
down_write(&c->state_lock);
int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
up_write(&c->state_lock);
bch_err_fn(c, ret);
return ret;
}
@ -1301,7 +1308,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
err:
bch_err_fn(ca, ret);
return ret;


@ -1041,13 +1041,19 @@ reread:
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
u64 submit_time = local_clock();
ret = submit_bio_wait(bio);
kfree(bio);
if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
"journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal")) {
if (!ret && bch2_meta_read_fault("journal"))
ret = -BCH_ERR_EIO_fault_injected;
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
submit_time, !ret);
if (ret) {
bch_err_dev_ratelimited(ca,
"journal read error: sector %llu", offset);
/*
* We don't error out of the recovery process
* here, since the relevant journal entry may be
@ -1110,13 +1116,16 @@ reread:
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
"%s",
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
err.buf)))
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
if (!csum_good) {
bch_err_dev_ratelimited(ca, "%s",
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
err.buf));
saw_bad = true;
}
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@ -1655,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}
bool completed = false;
bool do_discards = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@ -1667,7 +1677,6 @@ static CLOSURE_CALLBACK(journal_write_done)
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
@ -1718,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
if (do_discards)
bch2_do_discards(c);
}
static void journal_write_endio(struct bio *bio)
@ -1727,13 +1739,16 @@ static void journal_write_endio(struct bio *bio)
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
jbio->submit_time, !bio->bi_status);
if (bio->bi_status) {
bch_err_dev_ratelimited(ca,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
unsigned long flags;
bch2_blk_status_to_str(bio->bi_status));
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
@ -1762,7 +1777,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
sectors);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
struct journal_bio *jbio = ja->bio[w->idx];
struct bio *bio = &jbio->bio;
jbio->submit_time = local_clock();
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@ -1794,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
/*
* Wait for previous journal writes to complete; they won't necessarily
* be flushed if they're still in flight
*/
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {


@ -175,6 +175,7 @@ typedef DARRAY(u64) darray_u64;
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
u64 submit_time;
struct bio bio;
};


@ -125,8 +125,8 @@ static void move_write(struct moving_io *io)
&ctxt->stats->sectors_error_corrected);
}
if (unlikely(io->write.rbio.bio.bi_status ||
io->write.rbio.hole ||
if (unlikely(io->write.rbio.ret ||
io->write.rbio.bio.bi_status ||
io->write.data_opts.scrub)) {
move_free(io);
return;
@ -816,7 +816,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (!bp.v->level)
ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
else if (!data_opts.scrub)
ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
else
ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);


@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
struct move_bucket *b, u64 time)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a;
int ret;
if (bch2_bucket_is_open(trans->c,
b->k.bucket.inode,
b->k.bucket.offset))
if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
return 0;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
b->k.bucket, BTREE_ITER_cached);
ret = bkey_err(k);
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
b->k.bucket, BTREE_ITER_cached);
int ret = bkey_err(k);
if (ret)
return ret;
@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (!ca)
goto out;
a = bch2_alloc_to_v4(k, &_a);
if (ca->mi.state != BCH_MEMBER_STATE_rw ||
!bch2_dev_is_online(ca))
goto out_put;
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
b->sectors = bch2_bucket_sectors_dirty(*a);
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
ret = lru_idx && lru_idx <= time;
out_put:
bch2_dev_put(ca);
out:
bch2_trans_iter_exit(trans, &iter);


@ -145,6 +145,11 @@ enum fsck_err_opts {
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
x(write_error_timeout, u16, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, 300), \
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \


@ -24,7 +24,7 @@
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(stripes_read, 1, 0) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
x(check_allocations, 5, PASS_FSCK) \


@ -606,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c,
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
bool reflink_p_may_update_opts_field =
bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
!bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))


@ -91,9 +91,6 @@
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(directory_size, \
BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
BCH_FSCK_ERR_directory_size_mismatch) \
x(cached_backpointers, \
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
BCH_FSCK_ERR_ptr_to_missing_backpointer) \


@ -179,6 +179,7 @@ enum bch_fsck_flags {
x(ptr_crc_redundant, 160, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
x(extent_flags_not_at_start, 306, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
@ -316,7 +317,7 @@ enum bch_fsck_flags {
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
x(MAX, 306, 0)
x(MAX, 307, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,


@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
return ret;
}
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_failed;
@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
{
might_sleep();
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
if (ca && !percpu_ref_tryget(&ca->io_ref))


@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock)
EXPORT_SYMBOL_GPL(six_lock_exit);
void __six_lock_init(struct six_lock *lock, const char *name,
struct lock_class_key *key, enum six_lock_init_flags flags)
struct lock_class_key *key, enum six_lock_init_flags flags,
gfp_t gfp)
{
atomic_set(&lock->state, 0);
raw_spin_lock_init(&lock->wait_lock);
@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name,
* failure if they wish by checking lock->readers, but generally
* will not want to treat it as an error.
*/
lock->readers = alloc_percpu(unsigned);
lock->readers = alloc_percpu_gfp(unsigned, gfp);
}
#endif
}


@ -164,18 +164,19 @@ enum six_lock_init_flags {
};
void __six_lock_init(struct six_lock *lock, const char *name,
struct lock_class_key *key, enum six_lock_init_flags flags);
struct lock_class_key *key, enum six_lock_init_flags flags,
gfp_t gfp);
/**
* six_lock_init - initialize a six lock
* @lock: lock to initialize
* @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
*/
#define six_lock_init(lock, flags) \
#define six_lock_init(lock, flags, gfp) \
do { \
static struct lock_class_key __key; \
\
__six_lock_init((lock), #lock, &__key, flags); \
__six_lock_init((lock), #lock, &__key, flags, gfp); \
} while (0)
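The added gfp argument only changes the allocation context used for the per-cpu reader counters inside __six_lock_init(); a minimal hypothetical call site, with the lock field name and flags assumed rather than taken from this commit:

	six_lock_init(&b->lock, SIX_LOCK_INIT_PCPU, GFP_NOFS);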
/**


@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>
static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
};
struct bch2_metadata_version {
u16 version;
const char *name;
@ -69,14 +66,22 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
return v;
}
void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
{
mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
version <= c->sb.version_incompat_allowed)
? 0
: -BCH_ERR_may_not_use_incompat_feature;
if (!ret) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
return ret;
}
const char * const bch2_sb_fields[] = {
@ -366,7 +371,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
int ret;
ret = bch2_sb_compatible(sb, out);
@ -385,8 +389,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_features;
}
block_size = le16_to_cpu(sb->block_size);
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
@ -452,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
}
#ifdef __KERNEL__
@ -743,7 +748,7 @@ retry:
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
sb->holder = kmalloc(1, GFP_KERNEL);
sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;
@ -906,16 +911,16 @@ static void write_super_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
/* XXX: return errors directly */
if (bch2_dev_io_err_on(bio->bi_status, ca,
bio_data_dir(bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"superblock %s error: %s",
if (bio->bi_status) {
bch_err_dev_ratelimited(ca, "superblock %s error: %s",
str_write_read(bio_data_dir(bio)),
bch2_blk_status_to_str(bio->bi_status)))
bch2_blk_status_to_str(bio->bi_status));
ca->sb_write_error = 1;
}
closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
@ -1154,7 +1159,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
ret = -1;
ret = -BCH_ERR_erofs_sb_err;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
@ -1211,11 +1216,12 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
c->disk_sb.sb->version = cpu_to_le16(new_version);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
if (incompat)
if (incompat) {
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
}
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,


@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
static inline bool bch2_request_incompat_feature(struct bch_fs *c,
enum bcachefs_metadata_version version)
static inline int bch2_request_incompat_feature(struct bch_fs *c,
enum bcachefs_metadata_version version)
{
if (unlikely(version > c->sb.version_incompat)) {
if (version > c->sb.version_incompat_allowed)
return false;
bch2_set_version_incompat(c, version);
}
return true;
return likely(version <= c->sb.version_incompat)
? 0
: bch2_set_version_incompat(c, version);
}
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)


@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c)
}
set_bit(BCH_FS_started, &c->flags);
wake_up(&c->ro_ref_wait);
if (c->opts.read_only) {
bch2_fs_read_only(c);
@ -1431,6 +1432,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
/*
* Stash pointer to the filesystem for blk_holder_ops - note that once
* attached to a filesystem, we will always close the block device
* before tearing down the filesystem object.
*/
ca->disk_sb.holder->c = ca->fs;
ca->dev = ca->disk_sb.bdev->bd_dev;
percpu_ref_reinit(&ca->io_ref);
@ -2016,6 +2024,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
/* blk_holder_ops: */
static struct bch_fs *bdev_get_fs(struct block_device *bdev)
__releases(&bdev->bd_holder_lock)
{
struct bch_sb_handle_holder *holder = bdev->bd_holder;
struct bch_fs *c = holder->c;
if (c && !bch2_ro_ref_tryget(c))
c = NULL;
mutex_unlock(&bdev->bd_holder_lock);
if (c)
wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
return c;
}
/* returns with ref on ca->ref */
static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
{
for_each_member_device(c, ca)
if (ca->disk_sb.bdev == bdev)
return ca;
return NULL;
}
static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
struct bch_fs *c = bdev_get_fs(bdev);
if (!c)
return;
struct super_block *sb = c->vfs_sb;
if (sb) {
/*
* Not necessary, c->ro_ref guards against the filesystem being
* unmounted - we only take this to avoid a warning in
* sync_filesystem:
*/
down_read(&sb->s_umount);
}
down_write(&c->state_lock);
struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
if (!ca)
goto unlock;
if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
__bch2_dev_offline(c, ca);
} else {
if (sb) {
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
evict_inodes(sb);
}
bch2_journal_flush(&c->journal);
bch2_fs_emergency_read_only(c);
}
bch2_dev_put(ca);
unlock:
if (sb)
up_read(&sb->s_umount);
up_write(&c->state_lock);
bch2_ro_ref_put(c);
}
static void bch2_fs_bdev_sync(struct block_device *bdev)
{
struct bch_fs *c = bdev_get_fs(bdev);
if (!c)
return;
struct super_block *sb = c->vfs_sb;
if (sb) {
/*
* Not necessary, c->ro_ref guards against the filesystem being
* unmounted - we only take this to avoid a warning in
* sync_filesystem:
*/
down_read(&sb->s_umount);
sync_filesystem(sb);
up_read(&sb->s_umount);
}
bch2_ro_ref_put(c);
}
const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
.mark_dead = bch2_fs_bdev_mark_dead,
.sync = bch2_fs_bdev_sync,
};
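These callbacks only fire for block devices that were opened with this table as their holder ops; a hedged sketch of what the superblock open path is assumed to look like (the actual call site is not part of this hunk, and error handling is elided):

	sb->s_bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ,
						 sb->holder, &bch2_sb_handle_bdev_ops);
	if (IS_ERR(sb->s_bdev_file))
		return PTR_ERR(sb->s_bdev_file);
	sb->bdev = file_bdev(sb->s_bdev_file);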
/* Filesystem open: */
static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)


@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
#endif /* _BCACHEFS_SUPER_H */


@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
struct bch_fs;
struct bch_sb_handle_holder {
struct bch_fs *c;
};
struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
void *holder;
struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;


@ -174,7 +174,6 @@ read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_reserve_cache);
read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
read_attribute(nocow_lock_table);
@ -355,9 +354,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_reserve_cache)
bch2_btree_reserve_cache_to_text(out, c);
if (attr == &sysfs_stripes_heap)
bch2_stripes_heap_to_text(out, c);
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, NULL);
@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_reserve_cache,
&sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
#ifdef BCH_WRITE_REF_DEBUG


@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race,
TP_ARGS(bio)
);
/* ec.c */
TRACE_EVENT(stripe_create,
TP_PROTO(struct bch_fs *c, u64 idx, int ret),
TP_ARGS(c, idx, ret),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u64, idx )
__field(int, ret )
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->idx = idx;
__entry->ret = ret;
),
TP_printk("%d,%d idx %llu ret %i",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->idx,
__entry->ret)
);
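As with any TRACE_EVENT() definition, the generated hook is invoked as trace_<event>(); a hypothetical call in the stripe creation path (variable names assumed) would be:

	trace_stripe_create(c, s->idx, ret);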
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,


@ -208,6 +208,8 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
bdev->bd_inode = &bdev->__bd_inode;
mutex_init(&bdev->bd_holder_lock);
struct file *file = calloc(sizeof(*file), 1);
file->f_inode = bdev->bd_inode;