mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-03-27 00:00:04 +03:00)

commit 8361ae1a42 (parent 895fc9a101)

    Update bcachefs sources to 63bbe0ca4167 bcachefs: Scrub

Changed files:
  .bcachefs_revision
  c_src
  include/linux
  libbcachefs: alloc_background.c alloc_foreground.c alloc_foreground.h alloc_types.h
    backpointers.c bcachefs.h bcachefs_ioctl.h btree_cache.c btree_io.c btree_io.h
    btree_iter.c btree_key_cache.c btree_trans_commit.c btree_update_interior.c
    btree_update_interior.h buckets_waiting_for_journal.c buckets_waiting_for_journal.h
    chardev.c compress.c compress.h data_update.c data_update.h debug.c errcode.h
    extents.c extents.h fs-io-buffered.c fs-io-direct.c fsck.c inode.h io_read.c
    io_read.h io_write.c io_write.h io_write_types.h journal.c journal.h journal_io.c
    journal_reclaim.c journal_reclaim.h journal_types.h move.c move_types.h movinggc.c
    opts.h rebalance.c rebalance.h recovery.c sb-counters.c sb-counters.h
    sb-counters_format.h sb-errors_format.h sb-members.h str_hash.c subvolume.c
    super.c super.h sysfs.c trace.h util.h
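Summary of the sync, as reconstructed from the hunks below: the headline feature is scrub — a new BCH_DATA_OP_scrub data job that reads and verifies existing data and btree nodes on a chosen device, reporting corrected/uncorrected error counters through the data-job progress interface. Supporting changes visible in this diff include a `dev` parameter for bch2_bkey_pick_read_device() so reads can be pinned to one device, a btree node scrub/rewrite path in btree_io.c, a reworked bch2_data_update_init() error path with dedicated BCH_ERR_data_update_done_* codes, bch2_bucket_journal_seq_ready() replacing bch2_bucket_needs_journal_commit(), lower-cased BCH_WRITE_*/BCH_READ_* flag names, and a new BCH_IOCTL_QUERY_COUNTERS ioctl.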
.bcachefs_revision:

@@ -1 +1 @@
-78c6c8127e21fe2c8bf5c1d6a5e6832e28136f8f
+63bbe0ca416791095c994aba7bea388e947dd60a
c_src/cmd_migrate.c:

@@ -31,9 +31,6 @@
 #include "libbcachefs/replicas.h"
 #include "libbcachefs/super.h"

-/* XXX cut and pasted from fsck.c */
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
 static char *dev_t_to_path(dev_t dev)
 {
 	char link[PATH_MAX], *p;
c_src/posix_to_bcachefs.c:

@@ -1,5 +1,6 @@
 #include <dirent.h>
 #include <sys/xattr.h>
+#include <linux/dcache.h>
 #include <linux/xattr.h>

 #include "posix_to_bcachefs.h"

@@ -158,7 +159,7 @@ static void write_data(struct bch_fs *c,
 	op.nr_replicas	= 1;
 	op.subvol	= 1;
 	op.pos		= SPOS(dst_inode->bi_inum, dst_offset >> 9, U32_MAX);
-	op.flags |= BCH_WRITE_SYNC;
+	op.flags |= BCH_WRITE_sync;

 	int ret = bch2_disk_reservation_get(c, &op.res, len >> 9,
 					    c->opts.data_replicas, 0);

@@ -167,7 +168,7 @@ static void write_data(struct bch_fs *c,

 	closure_call(&op.cl, bch2_write, NULL, NULL);

-	BUG_ON(!(op.flags & BCH_WRITE_SUBMITTED));
+	BUG_ON(!(op.flags & BCH_WRITE_submitted));
 	dst_inode->bi_sectors += len >> 9;

 	if (op.error)
include/linux/dcache.h:

@@ -9,4 +9,7 @@ struct dentry {
 	struct inode *d_inode;
 };

+#define QSTR_INIT(n,l)	{ { { .len = l } }, .name = n }
+#define QSTR(n)		(struct qstr)QSTR_INIT(n, strlen(n))
+
 #endif /* __LINUX_DCACHE_H */
include/linux/kernel.h:

@@ -11,6 +11,7 @@
 #include <linux/bug.h>
 #include <linux/byteorder.h>
 #include <linux/compiler.h>
+#include <linux/dcache.h>
 #include <linux/math.h>
 #include <linux/minmax.h>

include/linux/sched.h:

@@ -97,6 +97,11 @@ struct task_struct {
 	struct signal_struct {
 		struct rw_semaphore exec_update_lock;
 	} *signal, _signal;
+
+	struct {
+		u64		sum_exec_runtime;
+		u64		exec_start;
+	} se;
 };

 extern __thread struct task_struct *current;
libbcachefs/alloc_background.c:

@@ -1803,7 +1803,6 @@ struct discard_buckets_state {
 	u64		open;
 	u64		need_journal_commit;
 	u64		discarded;
-	u64		need_journal_commit_this_dev;
 };

 static int bch2_discard_one_bucket(struct btree_trans *trans,

@@ -1827,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 		goto out;
 	}

-	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-			c->journal.flushed_seq_ondisk,
-			pos.inode, pos.offset)) {
-		s->need_journal_commit++;
-		s->need_journal_commit_this_dev++;
+	u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+						      pos.inode, pos.offset);
+	if (seq_ready > c->journal.flushed_seq_ondisk) {
+		if (seq_ready > c->journal.flushing_seq)
+			s->need_journal_commit++;
 		goto out;
 	}

@@ -1865,23 +1864,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 		discard_locked = true;
 	}

-	if (!bkey_eq(*discard_pos_done, iter.pos) &&
-	    ca->mi.discard && !c->opts.nochanges) {
-		/*
-		 * This works without any other locks because this is the only
-		 * thread that removes items from the need_discard tree
-		 */
-		bch2_trans_unlock_long(trans);
-		blkdev_issue_discard(ca->disk_sb.bdev,
-				     k.k->p.offset * ca->mi.bucket_size,
-				     ca->mi.bucket_size,
-				     GFP_KERNEL);
+	if (!bkey_eq(*discard_pos_done, iter.pos)) {
+		s->discarded++;
 		*discard_pos_done = iter.pos;

-		ret = bch2_trans_relock_notrace(trans);
-		if (ret)
-			goto out;
+		if (ca->mi.discard && !c->opts.nochanges) {
+			/*
+			 * This works without any other locks because this is the only
+			 * thread that removes items from the need_discard tree
+			 */
+			bch2_trans_unlock_long(trans);
+			blkdev_issue_discard(ca->disk_sb.bdev,
+					     k.k->p.offset * ca->mi.bucket_size,
+					     ca->mi.bucket_size,
+					     GFP_KERNEL);
+			ret = bch2_trans_relock_notrace(trans);
+			if (ret)
+				goto out;
+		}
 	}

 	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);

@@ -1897,7 +1897,10 @@ commit:
 	if (ret)
 		goto out;

-	count_event(c, bucket_discard);
+	if (!fastpath)
+		count_event(c, bucket_discard);
+	else
+		count_event(c, bucket_discard_fast);
 out:
 fsck_err:
 	if (discard_locked)

@@ -1929,6 +1932,9 @@ static void bch2_do_discards_work(struct work_struct *work)
 			   POS(ca->dev_idx, U64_MAX), 0, k,
 			bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));

+	if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
+		bch2_journal_flush_async(&c->journal, NULL);
+
 	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
 			      bch2_err_str(ret));

@@ -2024,7 +2030,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
 			break;
 	}

-	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+	trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));

 	bch2_trans_put(trans);
 	percpu_ref_put(&ca->io_ref);
libbcachefs/alloc_foreground.c:

@@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
 	closure_wake_up(&c->freelist_wait);
 }

-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
-{
-	switch (watermark) {
-	case BCH_WATERMARK_interior_updates:
-		return 0;
-	case BCH_WATERMARK_reclaim:
-		return OPEN_BUCKETS_COUNT / 6;
-	case BCH_WATERMARK_btree:
-	case BCH_WATERMARK_btree_copygc:
-		return OPEN_BUCKETS_COUNT / 4;
-	case BCH_WATERMARK_copygc:
-		return OPEN_BUCKETS_COUNT / 3;
-	default:
-		return OPEN_BUCKETS_COUNT / 2;
-	}
-}
-
 static inline bool may_alloc_bucket(struct bch_fs *c,
 				    struct bpos bucket,
 				    struct bucket_alloc_state *s)

@@ -205,8 +188,12 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
 		return false;
 	}

-	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-			c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) {
+	u64 journal_seq_ready =
+		bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+					      bucket.inode, bucket.offset);
+	if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
+		if (journal_seq_ready > c->journal.flushing_seq)
+			s->need_journal_commit++;
 		s->skipped_need_journal_commit++;
 		return false;
 	}

@@ -235,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *

 	spin_lock(&c->freelist_lock);

-	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);

@@ -570,7 +557,7 @@ alloc:
 			? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
 			: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);

-	if (s.skipped_need_journal_commit * 2 > avail)
+	if (s.need_journal_commit * 2 > avail)
 		bch2_journal_flush_async(&c->journal, NULL);

 	if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {

@@ -724,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,

 		struct bch_dev_usage usage;
 		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
-							cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+							cl, flags & BCH_WRITE_alloc_nowait, &usage);
 		if (!IS_ERR(ob))
 			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
 		bch2_dev_put(ca);

@@ -1332,7 +1319,7 @@ retry:
 	if (wp->data_type != BCH_DATA_user)
 		have_cache = true;

-	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+	if (target && !(flags & BCH_WRITE_only_specified_devs)) {
 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,

@@ -1422,7 +1409,7 @@ err:
 	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
 		ret = -BCH_ERR_bucket_alloc_blocked;

-	if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
+	if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
 		ret = -BCH_ERR_bucket_alloc_blocked;

libbcachefs/alloc_foreground.h:

@@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
 	return bch2_dev_have_ref(c, ob->dev);
 }

+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
+{
+	switch (watermark) {
+	case BCH_WATERMARK_interior_updates:
+		return 0;
+	case BCH_WATERMARK_reclaim:
+		return OPEN_BUCKETS_COUNT / 6;
+	case BCH_WATERMARK_btree:
+	case BCH_WATERMARK_btree_copygc:
+		return OPEN_BUCKETS_COUNT / 4;
+	case BCH_WATERMARK_copygc:
+		return OPEN_BUCKETS_COUNT / 3;
+	default:
+		return OPEN_BUCKETS_COUNT / 2;
+	}
+}
+
 struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
 				      enum bch_watermark, enum bch_data_type,
 				      struct closure *);
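Note: open_buckets_reserved() moves out of alloc_foreground.c (removed above) into the header as bch2_open_buckets_reserved(), with its logic unchanged, so other compilation units — notably the new can_allocate_without_blocking() in data_update.c below — can call it.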
libbcachefs/alloc_types.h:

@@ -18,6 +18,7 @@ struct bucket_alloc_state {
 	u64	buckets_seen;
 	u64	skipped_open;
 	u64	skipped_need_journal_commit;
+	u64	need_journal_commit;
 	u64	skipped_nocow;
 	u64	skipped_nouse;
 	u64	skipped_mi_btree_bitmap;

@@ -89,6 +90,7 @@ struct dev_stripe_state {
 	x(stopped)			\
 	x(waiting_io)			\
 	x(waiting_work)			\
+	x(runnable)			\
 	x(running)

 enum write_point_state {

@@ -124,6 +126,7 @@ struct write_point {
 		enum write_point_state	state;
 		u64			last_state_change;
 		u64			time[WRITE_POINT_STATE_NR];
+		u64			last_runtime;
 	} __aligned(SMP_CACHE_BYTES);
 };
libbcachefs/backpointers.c:

@@ -244,27 +244,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
 	if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
 		return bkey_s_c_null;

-	if (likely(!bp.v->level)) {
-		bch2_trans_node_iter_init(trans, iter,
-					  bp.v->btree_id,
-					  bp.v->pos,
-					  0, 0,
-					  iter_flags);
-		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-		if (bkey_err(k)) {
-			bch2_trans_iter_exit(trans, iter);
-			return k;
-		}
+	bch2_trans_node_iter_init(trans, iter,
+				  bp.v->btree_id,
+				  bp.v->pos,
+				  0,
+				  bp.v->level,
+				  iter_flags);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	if (bkey_err(k)) {
+		bch2_trans_iter_exit(trans, iter);
+		return k;
+	}

-		if (k.k &&
-		    extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
-			return k;
+	if (k.k &&
+	    extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
+		return k;

-		bch2_trans_iter_exit(trans, iter);
+	bch2_trans_iter_exit(trans, iter);
+
+	if (!bp.v->level) {
 		int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
 		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 	} else {
 		struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
 		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
 			return bkey_s_c_null;
 		if (IS_ERR_OR_NULL(b))
 			return ((struct bkey_s_c) { .k = ERR_CAST(b) });
libbcachefs/bcachefs.h:

@@ -687,7 +687,8 @@ struct btree_trans_buf {
 	x(gc_gens)						\
 	x(snapshot_delete_pagecache)				\
 	x(sysfs)						\
-	x(btree_write_buffer)
+	x(btree_write_buffer)					\
+	x(btree_node_scrub)

 enum bch_write_ref {
 #define x(n) BCH_WRITE_REF_##n,
libbcachefs/bcachefs_ioctl.h:

@@ -87,6 +87,7 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_FSCK_OFFLINE	_IOW(0xbc, 19,  struct bch_ioctl_fsck_offline)
 #define BCH_IOCTL_FSCK_ONLINE	_IOW(0xbc, 20,  struct bch_ioctl_fsck_online)
 #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21,  struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21,  struct bch_ioctl_query_counters)

 /* ioctl below act on a particular file, not the filesystem as a whole: */

@@ -213,6 +214,10 @@ struct bch_ioctl_data {
 	struct bpos	end_pos;

 	union {
+	struct {
+		__u32	dev;
+		__u32	data_types;
+	} scrub;
 	struct {
 		__u32	dev;
 		__u32	pad;

@@ -237,11 +242,19 @@ struct bch_ioctl_data_progress {

 	__u64		sectors_done;
 	__u64		sectors_total;
+	__u64		sectors_error_corrected;
+	__u64		sectors_error_uncorrected;
 } __packed __aligned(8);

+enum bch_ioctl_data_event_ret {
+	BCH_IOCTL_DATA_EVENT_RET_done		= 1,
+	BCH_IOCTL_DATA_EVENT_RET_device_offline	= 2,
+};
+
 struct bch_ioctl_data_event {
 	__u8		type;
-	__u8		pad[7];
+	__u8		ret;
+	__u8		pad[6];
 	union {
 	struct bch_ioctl_data_progress p;
 	__u64		pad2[15];

@@ -443,4 +456,13 @@ struct bch_ioctl_query_accounting {
 	struct bkey_i_accounting accounting[];
 };

+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT	(1 << 0)
+
+struct bch_ioctl_query_counters {
+	__u16		nr;
+	__u16		flags;
+	__u32		pad;
+	__u64		d[];
+};
+
 #endif /* _BCACHEFS_IOCTL_H */
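Note: the new scrub fields piggyback on the existing BCH_IOCTL_DATA job interface. A rough userspace sketch follows — illustrative only; BCH_DATA_OP_scrub and the scrub/ret/error-counter fields are from this commit, while the convention that the ioctl returns a file descriptor whose reads yield struct bch_ioctl_data_event is the pre-existing data-job behavior:

	/* start a scrub of all data types on one device, then poll progress */
	struct bch_ioctl_data d = {
		.op		  = BCH_DATA_OP_scrub,
		.scrub.dev	  = dev_idx,
		.scrub.data_types = ~0u,
	};
	int progress_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &d);

	struct bch_ioctl_data_event e;
	while (read(progress_fd, &e, sizeof(e)) == sizeof(e) && !e.ret)
		;	/* e.p.sectors_done / e.p.sectors_total, plus the new
			 * sectors_error_corrected/uncorrected counters */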
libbcachefs/btree_cache.c:

@@ -24,7 +24,10 @@ do {						\
 } while (0)

 const char * const bch2_btree_node_flags[] = {
-#define x(f)	#f,
+	"typebit",
+	"typebit",
+	"typebit",
+#define x(f)	[BTREE_NODE_##f] = #f,
 	BTREE_FLAGS()
 #undef x
 	NULL
libbcachefs/btree_io.c:

@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0

 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "bkey_methods.h"
 #include "bkey_sort.h"
 #include "btree_cache.h"

@@ -1352,7 +1353,7 @@ start:

 		can_retry = bch2_bkey_pick_read_device(c,
 					bkey_i_to_s_c(&b->key),
-					&failed, &rb->pick) > 0;
+					&failed, &rb->pick, -1) > 0;

 		if (!bio->bi_status &&
 		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {

@@ -1697,7 +1698,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
 		return;

 	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
-					 NULL, &pick);
+					 NULL, &pick, -1);

 	if (ret <= 0) {
 		struct printbuf buf = PRINTBUF;

@@ -1811,6 +1812,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
 }

+struct btree_node_scrub {
+	struct bch_fs		*c;
+	struct bch_dev		*ca;
+	void			*buf;
+	bool			used_mempool;
+	unsigned		written;
+
+	enum btree_id		btree;
+	unsigned		level;
+	struct bkey_buf		key;
+	__le64			seq;
+
+	struct work_struct	work;
+	struct bio		bio;
+};
+
+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
+				   struct printbuf *err)
+{
+	unsigned written = 0;
+
+	if (le64_to_cpu(data->magic) != bset_magic(c)) {
+		prt_printf(err, "bad magic: want %llx, got %llx",
+			   bset_magic(c), le64_to_cpu(data->magic));
+		return false;
+	}
+
+	while (written < (ptr_written ?: btree_sectors(c))) {
+		struct btree_node_entry *bne;
+		struct bset *i;
+		bool first = !written;
+
+		if (first) {
+			bne = NULL;
+			i = &data->keys;
+		} else {
+			bne = (void *) data + (written << 9);
+			i = &bne->keys;
+
+			if (!ptr_written && i->seq != data->keys.seq)
+				break;
+		}
+
+		struct nonce nonce = btree_nonce(i, written << 9);
+		bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
+
+		if (first) {
+			if (good_csum_type) {
+				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
+				if (bch2_crc_cmp(data->csum, csum)) {
+					bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
+					return false;
+				}
+			}
+
+			written += vstruct_sectors(data, c->block_bits);
+		} else {
+			if (good_csum_type) {
+				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+				if (bch2_crc_cmp(bne->csum, csum)) {
+					bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
+					return false;
+				}
+			}
+
+			written += vstruct_sectors(bne, c->block_bits);
+		}
+	}
+
+	return true;
+}
+
+static void btree_node_scrub_work(struct work_struct *work)
+{
+	struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
+	struct bch_fs *c = scrub->c;
+	struct printbuf err = PRINTBUF;
+
+	__bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
+				 bkey_i_to_s_c(scrub->key.k));
+	prt_newline(&err);
+
+	if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
+		struct btree_trans *trans = bch2_trans_get(c);
+
+		struct btree_iter iter;
+		bch2_trans_node_iter_init(trans, &iter, scrub->btree,
+					  scrub->key.k->k.p, 0, scrub->level - 1, 0);
+
+		struct btree *b;
+		int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)));
+		if (ret)
+			goto err;
+
+		if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
+			bch_err(c, "error validating btree node during scrub on %s at btree %s",
+				scrub->ca->name, err.buf);
+
+			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+		}
+err:
+		bch2_trans_iter_exit(trans, &iter);
+		bch2_trans_begin(trans);
+		bch2_trans_put(trans);
+	}
+
+	printbuf_exit(&err);
+	bch2_bkey_buf_exit(&scrub->key, c);;
+	btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
+	percpu_ref_put(&scrub->ca->io_ref);
+	kfree(scrub);
+	bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+}
+
+static void btree_node_scrub_endio(struct bio *bio)
+{
+	struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
+
+	queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
+}
+
+int bch2_btree_node_scrub(struct btree_trans *trans,
+			  enum btree_id btree, unsigned level,
+			  struct bkey_s_c k, unsigned dev)
+{
+	if (k.k->type != KEY_TYPE_btree_ptr_v2)
+		return 0;
+
+	struct bch_fs *c = trans->c;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub))
+		return -BCH_ERR_erofs_no_writes;
+
+	struct extent_ptr_decoded pick;
+	int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
+	if (ret <= 0)
+		goto err;
+
+	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+	if (!ca) {
+		ret = -BCH_ERR_device_offline;
+		goto err;
+	}
+
+	bool used_mempool = false;
+	void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
+
+	unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
+
+	struct btree_node_scrub *scrub =
+		kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
+	if (!scrub) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	scrub->c		= c;
+	scrub->ca		= ca;
+	scrub->buf		= buf;
+	scrub->used_mempool	= used_mempool;
+	scrub->written		= btree_ptr_sectors_written(k);
+
+	scrub->btree		= btree;
+	scrub->level		= level;
+	bch2_bkey_buf_init(&scrub->key);
+	bch2_bkey_buf_reassemble(&scrub->key, c, k);
+	scrub->seq		= bkey_s_c_to_btree_ptr_v2(k).v->seq;
+
+	INIT_WORK(&scrub->work, btree_node_scrub_work);
+
+	bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
+	bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
+	scrub->bio.bi_iter.bi_sector	= pick.ptr.offset;
+	scrub->bio.bi_end_io		= btree_node_scrub_endio;
+	submit_bio(&scrub->bio);
+	return 0;
+err_free:
+	btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
+	percpu_ref_put(&ca->io_ref);
+err:
+	bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+	return ret;
+}
+
 static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 				      struct btree_write *w)
 {
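Note on the flow added above: bch2_btree_node_scrub() pins the read to the requested device via the new bch2_bkey_pick_read_device(..., dev) argument, bounce-allocates a node-sized buffer, and submits the read; the endio handler defers to btree_node_scrub_work(), which re-verifies the node's magic and per-bset checksums with btree_node_scrub_check() and, if verification fails while the node is still current (the btree_ptr_v2 seq still matches), repairs it by rewriting via bch2_btree_node_rewrite().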
libbcachefs/btree_io.h:

@@ -132,6 +132,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
 int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 			 const struct bkey_i *, unsigned);

+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
+			  struct bkey_s_c, unsigned);
+
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);

 enum btree_write_flags {
libbcachefs/btree_iter.c:

@@ -2239,8 +2239,6 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
 	if (unlikely(ret))
 		return bkey_s_c_err(ret);

-	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-
 	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
 	if (!k.k)
 		return k;

@@ -2251,6 +2249,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos

 	iter->k = u;
 	k.k = &iter->k;
+	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
 	return k;
 }
libbcachefs/btree_key_cache.c:

@@ -291,8 +291,10 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
 				  struct btree_path *ck_path,
 				  unsigned flags)
 {
-	if (flags & BTREE_ITER_cached_nofill)
+	if (flags & BTREE_ITER_cached_nofill) {
+		ck_path->l[0].b = NULL;
 		return 0;
+	}

 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;

@@ -746,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 			rcu_read_unlock();
 			mutex_lock(&bc->table.mutex);
 			mutex_unlock(&bc->table.mutex);
-			rcu_read_lock();
 			continue;
 		}
 		for (i = 0; i < tbl->size; i++)
libbcachefs/btree_trans_commit.c:

@@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
 				       unsigned flags)
 {
 	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
-				    trans->journal_u64s, flags);
+				    trans->journal_u64s, flags, trans);
 }

 #define JSET_ENTRY_LOG_U64s		4
libbcachefs/btree_update_interior.c:

@@ -2189,6 +2189,26 @@ err:
 	goto out;
 }

+int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+				enum btree_id btree, unsigned level,
+				struct bpos pos, unsigned flags)
+{
+	BUG_ON(!level);
+
+	/* Traverse one depth lower to get a pointer to the node itself: */
+	struct btree_iter iter;
+	bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
+	struct btree *b = bch2_btree_iter_peek_node(&iter);
+	int ret = PTR_ERR_OR_ZERO(b);
+	if (ret)
+		goto err;
+
+	ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 struct async_btree_rewrite {
 	struct bch_fs		*c;
 	struct work_struct	work;
libbcachefs/btree_update_interior.h:

@@ -169,7 +169,11 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,

 int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
 			    struct btree *, unsigned);
+int bch2_btree_node_rewrite_key(struct btree_trans *,
+				enum btree_id, unsigned,
+				struct bpos, unsigned);
 void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+
 int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
 			       struct btree *, struct bkey_i *,
 			       unsigned, bool);
libbcachefs/buckets_waiting_for_journal.c:

@@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_
 	memset(t->d, 0, sizeof(t->d[0]) << t->bits);
 }

-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
-				      u64 flushed_seq,
-				      unsigned dev, u64 bucket)
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
+				  unsigned dev, u64 bucket)
 {
 	struct buckets_waiting_for_journal_table *t;
 	u64 dev_bucket = (u64) dev << 56 | bucket;
-	bool ret = false;
-	unsigned i;
+	u64 ret = 0;

 	mutex_lock(&b->lock);
 	t = b->t;

-	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+	for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
 		struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);

 		if (h->dev_bucket == dev_bucket) {
-			ret = h->journal_seq > flushed_seq;
+			ret = h->journal_seq;
 			break;
 		}
 	}
libbcachefs/buckets_waiting_for_journal.h:

@@ -4,8 +4,8 @@

 #include "buckets_waiting_for_journal_types.h"

-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
-				      u64, unsigned, u64);
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
+				  unsigned, u64);
 int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
 					 u64, unsigned, u64, u64);

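Note: the old predicate returned a bool against a caller-supplied flushed sequence; the new helper returns the journal sequence the bucket is waiting on (0 if none), letting callers distinguish "flush already in flight" from "flush not yet started". The caller pattern, restating the allocator and discard-path hunks above rather than introducing anything new:

	u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
						      bucket.inode, bucket.offset);
	if (seq_ready > c->journal.flushed_seq_ondisk) {
		/* not yet durable on disk: skip this bucket for now */
		if (seq_ready > c->journal.flushing_seq)
			/* flush not even started: request a journal flush */
			s->need_journal_commit++;
	}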
libbcachefs/chardev.c:

@@ -11,6 +11,7 @@
 #include "move.h"
 #include "recovery_passes.h"
 #include "replicas.h"
+#include "sb-counters.h"
 #include "super-io.h"
 #include "thread_with_file.h"

@@ -312,7 +313,10 @@ static int bch2_data_thread(void *arg)
 	struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);

 	ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
 	ctx->stats.data_type = U8_MAX;
+	if (ctx->thr.ret == -BCH_ERR_device_offline)
+		ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
+	else
+		ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
 	return 0;
 }

@@ -331,14 +335,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
 	struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
 	struct bch_fs *c = ctx->c;
 	struct bch_ioctl_data_event e = {
-		.type			= BCH_DATA_EVENT_PROGRESS,
-		.p.data_type		= ctx->stats.data_type,
-		.p.btree_id		= ctx->stats.pos.btree,
-		.p.pos			= ctx->stats.pos.pos,
-		.p.sectors_done		= atomic64_read(&ctx->stats.sectors_seen),
-		.p.sectors_total	= bch2_fs_usage_read_short(c).used,
+		.type				= BCH_DATA_EVENT_PROGRESS,
+		.ret				= ctx->stats.ret,
+		.p.data_type			= ctx->stats.data_type,
+		.p.btree_id			= ctx->stats.pos.btree,
+		.p.pos				= ctx->stats.pos.pos,
+		.p.sectors_done			= atomic64_read(&ctx->stats.sectors_seen),
+		.p.sectors_error_corrected	= atomic64_read(&ctx->stats.sectors_error_corrected),
+		.p.sectors_error_uncorrected	= atomic64_read(&ctx->stats.sectors_error_uncorrected),
 	};

+	if (ctx->arg.op == BCH_DATA_OP_scrub) {
+		struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
+		if (ca) {
+			struct bch_dev_usage u;
+			bch2_dev_usage_read_fast(ca, &u);
+			for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
+				if (ctx->arg.scrub.data_types & BIT(i))
+					e.p.sectors_total += u.d[i].sectors;
+			bch2_dev_put(ca);
+		}
+	} else {
+		e.p.sectors_total = bch2_fs_usage_read_short(c).used;
+	}
+
 	if (len < sizeof(e))
 		return -EINVAL;

@@ -710,6 +730,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
 		BCH_IOCTL(fsck_online,		struct bch_ioctl_fsck_online);
 	case BCH_IOCTL_QUERY_ACCOUNTING:
 		return bch2_ioctl_query_accounting(c, arg);
+	case BCH_IOCTL_QUERY_COUNTERS:
+		return bch2_ioctl_query_counters(c, arg);
 	default:
 		return -ENOTTY;
 	}
libbcachefs/compress.c:

@@ -4,6 +4,7 @@
 #include "compress.h"
 #include "error.h"
 #include "extents.h"
+#include "io_write.h"
 #include "opts.h"
 #include "super-io.h"

@@ -254,11 +255,14 @@ err:
 	goto out;
 }

-int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
-				struct bch_extent_crc_unpacked *crc)
+int bch2_bio_uncompress_inplace(struct bch_write_op *op,
+				struct bio *bio)
 {
+	struct bch_fs *c = op->c;
+	struct bch_extent_crc_unpacked *crc = &op->crc;
 	struct bbuf data = { NULL };
 	size_t dst_len = crc->uncompressed_size << 9;
+	int ret = 0;

 	/* bio must own its pages: */
 	BUG_ON(!bio->bi_vcnt);

@@ -266,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,

 	if (crc->uncompressed_size << 9	> c->opts.encoded_extent_max ||
 	    crc->compressed_size << 9	> c->opts.encoded_extent_max) {
-		bch_err(c, "error rewriting existing data: extent too big");
+		struct printbuf buf = PRINTBUF;
+		bch2_write_op_error(&buf, op);
+		prt_printf(&buf, "error rewriting existing data: extent too big");
+		bch_err_ratelimited(c, "%s", buf.buf);
+		printbuf_exit(&buf);
 		return -EIO;
 	}

 	data = __bounce_alloc(c, dst_len, WRITE);

 	if (__bio_uncompress(c, bio, data.b, *crc)) {
-		if (!c->opts.no_data_io)
-			bch_err(c, "error rewriting existing data: decompression error");
-		bio_unmap_or_unbounce(c, data);
-		return -EIO;
+		if (!c->opts.no_data_io) {
+			struct printbuf buf = PRINTBUF;
+			bch2_write_op_error(&buf, op);
+			prt_printf(&buf, "error rewriting existing data: decompression error");
+			bch_err_ratelimited(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+		}
+		ret = -EIO;
+		goto err;
 	}

 	/*

@@ -293,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
 	crc->uncompressed_size	= crc->live_size;
 	crc->offset		= 0;
 	crc->csum		= (struct bch_csum) { 0, 0 };
-
+err:
 	bio_unmap_or_unbounce(c, data);
-	return 0;
+	return ret;
 }

 int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
|
@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
|
||||
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
|
||||
}
|
||||
|
||||
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
|
||||
struct bch_extent_crc_unpacked *);
|
||||
struct bch_write_op;
|
||||
int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
|
||||
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
|
||||
struct bvec_iter, struct bch_extent_crc_unpacked);
|
||||
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
|
||||
|
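Note: bch2_bio_uncompress_inplace() now takes the write op rather than a (fs, crc) pair, so decompression failures during a rewrite can be reported with full write-op context via bch2_write_op_error() and rate-limited, as the compress.c hunks above show.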
libbcachefs/data_update.c:

@@ -20,6 +20,8 @@
 #include "subvolume.h"
 #include "trace.h"

+#include <linux/ioprio.h>
+
 static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

@@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

 	bkey_for_each_ptr(ptrs, ptr) {
-		if (!bch2_dev_tryget(c, ptr->dev)) {
+		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
 			bkey_for_each_ptr(ptrs, ptr2) {
 				if (ptr2 == ptr)
 					break;

@@ -91,15 +93,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
 	return true;
 }

-static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+static noinline void trace_move_extent_finish2(struct data_update *u,
+					       struct bkey_i *new,
+					       struct bkey_i *insert)
 {
-	if (trace_move_extent_finish_enabled()) {
-		struct printbuf buf = PRINTBUF;
+	struct bch_fs *c = u->op.c;
+	struct printbuf buf = PRINTBUF;

-		bch2_bkey_val_to_text(&buf, c, k);
-		trace_move_extent_finish(c, buf.buf);
-		printbuf_exit(&buf);
-	}
+	prt_newline(&buf);
+
+	bch2_data_update_to_text(&buf, u);
+	prt_newline(&buf);
+
+	prt_str_indented(&buf, "new replicas:\t");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+	prt_newline(&buf);
+
+	prt_str_indented(&buf, "insert:\t");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+	prt_newline(&buf);
+
+	trace_move_extent_finish(c, buf.buf);
+	printbuf_exit(&buf);
 }

 static void trace_move_extent_fail2(struct data_update *m,

@@ -372,7 +387,8 @@ restart_drop_extra_replicas:
 		bch2_btree_iter_set_pos(&iter, next_pos);

 		this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
-		trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+		if (trace_move_extent_finish_enabled())
+			trace_move_extent_finish2(m, &new->k_i, insert);
 	}
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))

@@ -412,14 +428,17 @@ int bch2_data_update_index_update(struct bch_write_op *op)
 	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
 }

-void bch2_data_update_read_done(struct data_update *m,
-				struct bch_extent_crc_unpacked crc)
+void bch2_data_update_read_done(struct data_update *m)
 {
+	m->read_done = true;
+
 	/* write bio must own pages: */
 	BUG_ON(!m->op.wbio.bio.bi_vcnt);

-	m->op.crc = crc;
-	m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+	m->op.crc = m->rbio.pick.crc;
+	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
+
+	this_cpu_add(m->op.c->counters[BCH_COUNTER_move_extent_write], m->k.k->k.size);

 	closure_call(&m->op.cl, bch2_write, NULL, NULL);
 }

@@ -429,31 +448,34 @@ void bch2_data_update_exit(struct data_update *update)
 	struct bch_fs *c = update->op.c;
 	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);

+	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+	kfree(update->bvecs);
+	update->bvecs = NULL;
+
 	if (c->opts.nocow_enabled)
 		bkey_nocow_unlock(c, k);
 	bkey_put_dev_refs(c, k);
-	bch2_bkey_buf_exit(&update->k, c);
 	bch2_disk_reservation_put(c, &update->op.res);
-	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+	bch2_bkey_buf_exit(&update->k, c);
 }

-static void bch2_update_unwritten_extent(struct btree_trans *trans,
-					 struct data_update *update)
+static int bch2_update_unwritten_extent(struct btree_trans *trans,
+					struct data_update *update)
 {
 	struct bch_fs *c = update->op.c;
-	struct bio *bio = &update->op.wbio.bio;
 	struct bkey_i_extent *e;
 	struct write_point *wp;
 	struct closure cl;
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	int ret;
+	int ret = 0;

 	closure_init_stack(&cl);
 	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);

-	while (bio_sectors(bio)) {
-		unsigned sectors = bio_sectors(bio);
+	while (bpos_lt(update->op.pos, update->k.k->k.p)) {
+		unsigned sectors = update->k.k->k.p.offset -
+			update->op.pos.offset;

 		bch2_trans_begin(trans);

@@ -489,7 +511,7 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans,
 		bch_err_fn_ratelimited(c, ret);

 		if (ret)
-			return;
+			break;

 		sectors = min(sectors, wp->sectors_free);

@@ -499,7 +521,6 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans,
 		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
 		bch2_alloc_sectors_done(c, wp);

-		bio_advance(bio, sectors << 9);
 		update->op.pos.offset += sectors;

 		extent_for_each_ptr(extent_i_to_s(e), ptr)

@@ -518,6 +539,8 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans,
 		bch2_trans_unlock(trans);
 		closure_sync(&cl);
 	}
+
+	return ret;
 }

 void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,

@@ -527,37 +550,47 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
 	if (!out->nr_tabstops)
 		printbuf_tabstop_push(out, 20);

-	prt_printf(out, "rewrite ptrs:\t");
+	prt_str_indented(out, "rewrite ptrs:\t");
 	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
 	prt_newline(out);

-	prt_printf(out, "kill ptrs:\t");
+	prt_str_indented(out, "kill ptrs:\t");
 	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
 	prt_newline(out);

-	prt_printf(out, "target:\t");
+	prt_str_indented(out, "target:\t");
 	bch2_target_to_text(out, c, data_opts->target);
 	prt_newline(out);

-	prt_printf(out, "compression:\t");
+	prt_str_indented(out, "compression:\t");
 	bch2_compression_opt_to_text(out, io_opts->background_compression);
 	prt_newline(out);

-	prt_printf(out, "opts.replicas:\t");
+	prt_str_indented(out, "opts.replicas:\t");
 	prt_u64(out, io_opts->data_replicas);
 	prt_newline(out);

-	prt_printf(out, "extra replicas:\t");
+	prt_str_indented(out, "extra replicas:\t");
 	prt_u64(out, data_opts->extra_replicas);
 	prt_newline(out);
 }

 void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
 {
 	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
 	prt_newline(out);

 	prt_str_indented(out, "old key:\t");
 	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
 }
+
+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
+{
+	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+	prt_printf(out, "read_done:\t%u\n", m->read_done);
+	bch2_write_op_to_text(out, &m->op);
+	printbuf_indent_sub(out, 2);
+}

@@ -605,6 +638,40 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }

+static bool can_allocate_without_blocking(struct bch_fs *c,
+					  struct data_update *m)
+{
+	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
+		return false;
+
+	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
+		? m->op.target
+		: 0;
+	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+	darray_for_each(m->op.devs_have, i)
+		__clear_bit(*i, devs.d);
+
+	rcu_read_lock();
+	unsigned nr_replicas = 0, i;
+	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+		struct bch_dev *ca = bch2_dev_rcu(c, i);
+
+		struct bch_dev_usage usage;
+		bch2_dev_usage_read_fast(ca, &usage);
+
+		if (!dev_buckets_free(ca, usage, m->op.watermark))
+			continue;
+
+		nr_replicas += ca->mi.durability;
+		if (nr_replicas >= m->op.nr_replicas)
+			break;
+	}
+	rcu_read_unlock();
+
+	return nr_replicas >= m->op.nr_replicas;
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
 			  struct btree_iter *iter,
 			  struct moving_context *ctxt,

@@ -628,16 +695,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	 * snapshots table - just skip it, we can move it later.
 	 */
 	if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
-		return -BCH_ERR_data_update_done;
-
-	if (!bkey_get_dev_refs(c, k))
-		return -BCH_ERR_data_update_done;
-
-	if (c->opts.nocow_enabled &&
-	    !bkey_nocow_lock(c, ctxt, k)) {
-		bkey_put_dev_refs(c, k);
-		return -BCH_ERR_nocow_lock_blocked;
-	}
+		return -BCH_ERR_data_update_done_no_snapshot;

 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);

@@ -652,10 +710,10 @@ int bch2_data_update_init(struct btree_trans *trans,
 	m->op.target	= data_opts.target;
 	m->op.write_point = wp;
 	m->op.nr_replicas = 0;
-	m->op.flags	|= BCH_WRITE_PAGES_STABLE|
-		BCH_WRITE_PAGES_OWNED|
-		BCH_WRITE_DATA_ENCODED|
-		BCH_WRITE_MOVE|
+	m->op.flags	|= BCH_WRITE_pages_stable|
+		BCH_WRITE_pages_owned|
+		BCH_WRITE_data_encoded|
+		BCH_WRITE_move|
 		m->data_opts.write_flags;
 	m->op.compression_opt	= io_opts.background_compression;
 	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;

@@ -729,7 +787,15 @@ int bch2_data_update_init(struct btree_trans *trans,
 		/* if iter == NULL, it's just a promote */
 		if (iter)
 			ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
-		goto out;
+		if (!ret)
+			ret = -BCH_ERR_data_update_done_no_writes_needed;
+		goto out_bkey_buf_exit;
+	}
+
+	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
+	    !can_allocate_without_blocking(c, m)) {
+		ret = -BCH_ERR_data_update_done_would_block;
+		goto out_bkey_buf_exit;
 	}

 	if (reserve_sectors) {

@@ -738,18 +804,63 @@ int bch2_data_update_init(struct btree_trans *trans,
 			? 0
 			: BCH_DISK_RESERVATION_NOFAIL);
 		if (ret)
-			goto out;
+			goto out_bkey_buf_exit;
+	}
+
+	if (!bkey_get_dev_refs(c, k)) {
+		ret = -BCH_ERR_data_update_done_no_dev_refs;
+		goto out_put_disk_res;
+	}
+
+	if (c->opts.nocow_enabled &&
+	    !bkey_nocow_lock(c, ctxt, k)) {
+		ret = -BCH_ERR_nocow_lock_blocked;
+		goto out_put_dev_refs;
 	}

 	if (bkey_extent_is_unwritten(k)) {
-		bch2_update_unwritten_extent(trans, m);
-		goto out;
+		ret = bch2_update_unwritten_extent(trans, m) ?:
+			-BCH_ERR_data_update_done_unwritten;
+		goto out_nocow_unlock;
 	}

+	/* write path might have to decompress data: */
+	unsigned buf_bytes = 0;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+	m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+	if (!m->bvecs)
+		goto enomem;
+
+	bio_init(&m->rbio.bio,		NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+	bio_init(&m->op.wbio.bio,	NULL, m->bvecs, nr_vecs, 0);
+
+	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL))
+		goto enomem;
+
+	rbio_init(&m->rbio.bio, c, io_opts, NULL);
+	m->rbio.bio.bi_iter.bi_size	= buf_bytes;
+	m->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
+	m->op.wbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+
 	return 0;
-out:
-	bch2_data_update_exit(m);
-	return ret ?: -BCH_ERR_data_update_done;
+enomem:
+	ret = -ENOMEM;
+	kfree(m->bvecs);
+	m->bvecs = NULL;
+out_nocow_unlock:
+	if (c->opts.nocow_enabled)
+		bkey_nocow_unlock(c, k);
+out_put_dev_refs:
+	bkey_put_dev_refs(c, k);
+out_put_disk_res:
+	bch2_disk_reservation_put(c, &m->op.res);
+out_bkey_buf_exit:
+	bch2_bkey_buf_exit(&m->k, c);
+	return ret;
 }

 void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
libbcachefs/data_update.h:

@@ -4,6 +4,7 @@
 #define _BCACHEFS_DATA_UPDATE_H

 #include "bkey_buf.h"
+#include "io_read.h"
 #include "io_write_types.h"

 struct moving_context;

@@ -15,6 +16,9 @@ struct data_update_opts {
 	u8		extra_replicas;
 	unsigned	btree_insert_flags;
 	unsigned	write_flags;
+
+	int		read_dev;
+	bool		scrub;
 };

 void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,

@@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,

 struct data_update {
 	/* extent being updated: */
+	bool			read_done;
 	enum btree_id		btree_id;
 	struct bkey_buf		k;
 	struct data_update_opts	data_opts;
 	struct moving_context	*ctxt;
 	struct bch_move_stats	*stats;
+
+	struct bch_read_bio	rbio;
 	struct bch_write_op	op;
+	struct bio_vec		*bvecs;
 };

 void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);

 int bch2_data_update_index_update(struct bch_write_op *);

-void bch2_data_update_read_done(struct data_update *,
-				struct bch_extent_crc_unpacked);
+void bch2_data_update_read_done(struct data_update *);

 int bch2_extent_drop_ptrs(struct btree_trans *,
 			  struct btree_iter *,
libbcachefs/debug.c:

@@ -7,6 +7,7 @@
  */

 #include "bcachefs.h"
+#include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "btree_cache.h"
 #include "btree_io.h"

@@ -20,6 +21,7 @@
 #include "extents.h"
 #include "fsck.h"
 #include "inode.h"
 #include "journal_reclaim.h"
+#include "super.h"

 #include <linux/console.h>

@@ -189,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 	unsigned offset = 0;
 	int ret;

-	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
 		prt_printf(out, "error getting device to read from: invalid device\n");
 		return;
 	}

@@ -843,8 +845,11 @@ restart:
 	seqmutex_unlock(&c->btree_trans_lock);
 }

-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
-					size_t size, loff_t *ppos)
+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
+
+static ssize_t bch2_simple_print(struct file *file, char __user *buf,
+				 size_t size, loff_t *ppos,
+				 fs_to_text_fn fn)
 {
 	struct dump_iter *i = file->private_data;
 	struct bch_fs *c = i->c;

@@ -855,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 	i->ret	= 0;

 	if (!i->iter) {
-		btree_deadlock_to_text(&i->buf, c);
+		fn(&i->buf, c);
 		i->iter++;
 	}

@@ -868,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 	return ret ?: i->ret;
 }

+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+					size_t size, loff_t *ppos)
+{
+	return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
+}
+
 static const struct file_operations btree_deadlock_ops = {
 	.owner		= THIS_MODULE,
 	.open		= bch2_dump_open,

@@ -875,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = {
 	.read		= bch2_btree_deadlock_read,
 };

+static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
+				      size_t size, loff_t *ppos)
+{
+	return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
+}
+
+static const struct file_operations write_points_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_write_points_read,
+};
+
 void bch2_fs_debug_exit(struct bch_fs *c)
 {
 	if (!IS_ERR_OR_NULL(c->fs_debug_dir))

@@ -926,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
 	debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
 			    c->btree_debug, &btree_deadlock_ops);

+	debugfs_create_file("write_points", 0400, c->fs_debug_dir,
+			    c->btree_debug, &write_points_ops);
+
 	c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
 	if (IS_ERR_OR_NULL(c->btree_debug_dir))
 		return;
libbcachefs/errcode.h:

@@ -180,6 +180,11 @@
 	x(EINVAL,		not_in_recovery)			\
 	x(EINVAL,		cannot_rewind_recovery)			\
 	x(0,			data_update_done)			\
	x(BCH_ERR_data_update_done, data_update_done_would_block)	\
	x(BCH_ERR_data_update_done, data_update_done_unwritten)	\
	x(BCH_ERR_data_update_done, data_update_done_no_writes_needed)	\
	x(BCH_ERR_data_update_done, data_update_done_no_snapshot)	\
	x(BCH_ERR_data_update_done, data_update_done_no_dev_refs)	\
 	x(EINVAL,		device_state_not_allowed)		\
 	x(EINVAL,		member_info_missing)			\
 	x(EINVAL,		mismatched_block_size)			\

@@ -269,6 +274,7 @@
 	x(EIO,			invalidate_stripe_to_dev)		\
 	x(EIO,			no_encryption_key)			\
 	x(EIO,			insufficient_journal_devices)		\
+	x(EIO,			device_offline)				\
 	x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable)	\
 	x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry)	\
 	x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry)	\
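Note: the errcode table is hierarchical — `x(BCH_ERR_data_update_done, ...)` makes each new code a child of data_update_done. That lets bch2_data_update_init() report why it finished without writing anything, while existing callers that test `bch2_err_matches(ret, BCH_ERR_data_update_done)` keep working unchanged.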
libbcachefs/extents.c:

@@ -114,8 +114,9 @@ static inline bool ptr_better(struct bch_fs *c,
  * other devices, it will still pick a pointer from avoid.
  */
 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-			       struct bch_io_failures *failed,
-			       struct extent_ptr_decoded *pick)
+			       struct bch_io_failures *failed,
+			       struct extent_ptr_decoded *pick,
+			       int dev)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;

@@ -137,6 +138,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 			break;
 		}

+		/* Are we being asked to read from a specific device? */
+		if (dev >= 0 && p.ptr.dev != dev)
+			continue;
+
 		/*
 		 * If there are any dirty pointers it's an error if we can't
 		 * read:
libbcachefs/extents.h:

@@ -404,7 +404,7 @@ void bch2_mark_io_failure(struct bch_io_failures *,
 			  struct extent_ptr_decoded *);
 int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
 			       struct bch_io_failures *,
-			       struct extent_ptr_decoded *);
+			       struct extent_ptr_decoded *, int);

 /* KEY_TYPE_btree_ptr: */

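Note: the new trailing `int dev` argument selects a read device. Passing -1 preserves the old "pick the best pointer" behavior (all pre-existing call sites in this diff are updated to pass -1), while a non-negative device index makes the function skip every pointer not on that device — which is how scrub forces reads from the member being checked.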
libbcachefs/fs-io-buffered.c:

@@ -149,12 +149,10 @@ static void bchfs_read(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_buf sk;
-	int flags = BCH_READ_RETRY_IF_STALE|
-		BCH_READ_MAY_PROMOTE;
+	int flags = BCH_READ_retry_if_stale|
+		BCH_READ_may_promote;
 	int ret = 0;

-	rbio->c = c;
-	rbio->start_time = local_clock();
 	rbio->subvol = inum.subvol;

 	bch2_bkey_buf_init(&sk);

@@ -211,14 +209,14 @@ static void bchfs_read(struct btree_trans *trans,
 		swap(rbio->bio.bi_iter.bi_size, bytes);

 		if (rbio->bio.bi_iter.bi_size == bytes)
-			flags |= BCH_READ_LAST_FRAGMENT;
+			flags |= BCH_READ_last_fragment;

 		bch2_bio_page_state_set(&rbio->bio, k);

 		bch2_read_extent(trans, rbio, iter.pos,
 				 data_btree, k, offset_into_extent, flags);

-		if (flags & BCH_READ_LAST_FRAGMENT)
+		if (flags & BCH_READ_last_fragment)
 			break;

 		swap(rbio->bio.bi_iter.bi_size, bytes);

@@ -280,12 +278,13 @@ void bch2_readahead(struct readahead_control *ractl)
 		struct bch_read_bio *rbio =
 			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
 						   GFP_KERNEL, &c->bio_read),
-				  opts);
+				  c,
+				  opts,
+				  bch2_readpages_end_io);

 		readpage_iter_advance(&readpages_iter);

 		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-		rbio->bio.bi_end_io = bch2_readpages_end_io;
 		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

 		bchfs_read(trans, rbio, inode_inum(inode),

@@ -323,10 +322,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

 	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
-			 opts);
+			 c,
+			 opts,
+			 bch2_read_single_folio_end_io);
 	rbio->bio.bi_private = &done;
-	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
 	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
 	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
 	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

@@ -420,7 +419,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
 		}
 	}

-	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+	if (io->op.flags & BCH_WRITE_wrote_data_inline) {
 		bio_for_each_folio_all(fi, bio) {
 			struct bch_folio *s;

@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct blk_plug plug;
loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
bool split = false;
size_t shorten;
ssize_t ret;

@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
GFP_KERNEL,
&c->dio_read_bioset);

bio->bi_end_io = bch2_direct_IO_read_endio;

dio = container_of(bio, struct dio_read, rbio.bio);
closure_init(&dio->cl, NULL);

@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)

goto start;
while (iter->count) {
split = true;

bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
REQ_OP_READ,
GFP_KERNEL,
&c->bio_read);
bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
bio->bi_opf = REQ_OP_READ|REQ_SYNC;
bio->bi_iter.bi_sector = offset >> 9;
@ -160,7 +160,15 @@ start:
if (iter->count)
closure_get(&dio->cl);

bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
struct bch_read_bio *rbio =
rbio_init(bio,
c,
opts,
split
? bch2_direct_IO_read_split_endio
: bch2_direct_IO_read_endio);

bch2_read(c, rbio, inode_inum(inode));
}

blk_finish_plug(&plug);
@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.devs_need_flush = &inode->ei_devs_need_flush;

if (sync)
dio->op.flags |= BCH_WRITE_SYNC;
dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
dio->op.flags |= BCH_WRITE_sync;
dio->op.flags |= BCH_WRITE_check_enospc;

ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
bio_sectors(bio), true);
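The direct-IO read hunks keep one flag for the whole request and pick the completion callback per bio from it: the first bio keeps the whole-request completion, anything allocated inside the loop gets the per-fragment one. A small illustrative model (all names invented):

#include <stdbool.h>
#include <stdio.h>

static void whole_done(void) { puts("single-bio completion"); }
static void split_done(void) { puts("per-fragment completion"); }

int main(void)
{
	/* Models the loop above: the first bio may cover the whole request;
	 * only once the loop body runs again is the request a split. */
	bool split = false;
	int fragments = 3;	/* illustrative */

	for (int i = 0; i < fragments; i++) {
		if (i > 0)
			split = true;

		void (*end_io)(void) = split ? split_done : whole_done;
		end_io();
	}
	return 0;
}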
@ -450,7 +450,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
return ret;

struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
struct qstr name = (struct qstr) QSTR(name_buf);
struct qstr name = QSTR(name_buf);

inode->bi_dir = lostfound.bi_inum;
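The cast disappears from the call site because QSTR() now expands to a struct qstr value itself. A minimal sketch of a compound-literal-returning macro of that shape (illustrative type and names, not the kernel's qstr):

#include <stdio.h>
#include <string.h>

struct name_ref {
	size_t len;
	const char *name;
};

/* The macro yields a struct value via a C99 compound literal, so callers
 * can write `struct name_ref n = NAME_REF(s);` without any cast. */
#define NAME_REF(s) ((struct name_ref){ .len = strlen(s), .name = (s) })

int main(void)
{
	struct name_ref n = NAME_REF("lost+found");

	printf("%zu %s\n", n.len, n.name);
	return 0;
}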
@ -285,12 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);

#include "rebalance.h"

static inline struct bch_extent_rebalance
bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
{
struct bch_io_opts io_opts;
bch2_inode_opts_get(&io_opts, c, inode);
return io_opts_to_rebalance_opts(&io_opts);
return io_opts_to_rebalance_opts(c, &io_opts);
}

int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
@ -80,6 +80,7 @@ struct promote_op {
struct rhash_head hash;
struct bpos pos;

struct work_struct work;
struct data_update write;
struct bio_vec bi_inline_vecs[]; /* must be last */
};
@ -96,6 +97,26 @@ static inline bool have_io_error(struct bch_io_failures *failed)
return failed && failed->nr;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig,
unsigned dev,
unsigned flags)
{
if (!(flags & BCH_READ_data_update))
return false;

struct data_update *u = container_of(orig, struct data_update, rbio);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
if (ptr->dev == dev &&
u->data_opts.rewrite_ptrs & BIT(i))
return true;
i++;
}

return false;
}
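The new ptr_being_rewritten() relies on rewrite_ptrs being a bitmask with one bit per extent pointer, in iteration order. A standalone model of that bit-per-element convention (an array stands in for the extent pointer list):

#include <stdbool.h>
#include <stdio.h>

#define BIT(n) (1u << (n))

/* devs[] models an extent's pointer list; rewrite_ptrs has one bit per
 * pointer, in list order, marking pointers being rewritten. */
static bool ptr_being_rewritten(const unsigned *devs, unsigned nr,
				unsigned rewrite_ptrs, unsigned dev)
{
	for (unsigned i = 0; i < nr; i++)
		if (devs[i] == dev && (rewrite_ptrs & BIT(i)))
			return true;
	return false;
}

int main(void)
{
	unsigned devs[] = { 0, 2, 5 };

	/* bit 1 set: the pointer on device 2 is being rewritten */
	printf("%d\n", ptr_being_rewritten(devs, 3, BIT(1), 2)); /* 1 */
	printf("%d\n", ptr_being_rewritten(devs, 3, BIT(1), 5)); /* 0 */
	return 0;
}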
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
@ -105,7 +126,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
if (!have_io_error(failed)) {
BUG_ON(!opts.promote_target);

if (!(flags & BCH_READ_MAY_PROMOTE))
if (!(flags & BCH_READ_may_promote))
return -BCH_ERR_nopromote_may_not;

if (bch2_bkey_has_target(c, k, opts.promote_target))
@ -125,163 +146,138 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
static noinline void promote_free(struct bch_read_bio *rbio)
{
int ret;
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
struct bch_fs *c = rbio->c;

int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);

bch2_data_update_exit(&op->write);

ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
struct promote_op *op =
container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;
struct promote_op *op = container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.rbio.c;

bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
promote_free(c, op);
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
promote_free(&op->write.rbio);
}

static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
static void promote_start_work(struct work_struct *work)
{
struct bio *bio = &op->write.op.wbio.bio;
struct promote_op *op = container_of(work, struct promote_op, work);

bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

trace_and_count(op->write.op.c, read_promote, &rbio->bio);

/* we now own pages: */
BUG_ON(!rbio->bounce);
BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

bch2_data_update_read_done(&op->write, rbio->pick.crc);
INIT_WORK(&op->work, promote_start_work);
queue_work(rbio->c->write_ref_wq, &op->work);
}
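promote_start() no longer copies bounce pages synchronously; it queues a work item embedded in promote_op, and the handler recovers the op with container_of(). A compilable model of that embedded-work pattern:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work { void (*fn)(struct work *); };

/* Models promote_op: the work item is embedded, so the deferred handler
 * can recover the whole operation from the work pointer alone. */
struct op {
	int id;
	struct work work;
};

static void start_work(struct work *w)
{
	struct op *op = container_of(w, struct op, work);

	printf("deferred start of op %d\n", op->id);
}

int main(void)
{
	struct op op = { .id = 42, .work = { .fn = start_work } };

	/* what queue_work()'s worker thread would eventually do: */
	op.work.fn(&op.work);
	return 0;
}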
static struct promote_op *__promote_alloc(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_s_c k,
struct bpos pos,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
unsigned sectors,
struct bch_read_bio **rbio,
struct bch_io_failures *failed)
static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_s_c k,
struct bpos pos,
struct extent_ptr_decoded *pick,
unsigned sectors,
unsigned flags,
struct bch_read_bio *orig,
struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
struct bio *bio;
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;

struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

if (!have_io_error(failed)) {
update_opts.target = orig->opts.promote_target;
update_opts.extra_replicas = 1;
update_opts.write_flags |= BCH_WRITE_cached;
update_opts.write_flags |= BCH_WRITE_only_specified_devs;
} else {
update_opts.target = orig->opts.foreground_target;

struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) &&
!ptr_being_rewritten(orig, ptr->dev, flags))
update_opts.rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}

if (!update_opts.rewrite_ptrs)
return NULL;
}

if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);

op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
goto err_put;
}

op->start_time = local_clock();
op->pos = pos;

/*
* We don't use the mempool here because extents that aren't
* checksummed or compressed can be too big for the mempool:
*/
*rbio = kzalloc(sizeof(struct bch_read_bio) +
sizeof(struct bio_vec) * pages,
GFP_KERNEL);
if (!*rbio) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
}

rbio_init(&(*rbio)->bio, opts);
bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
}

(*rbio)->bounce = true;
(*rbio)->split = true;
(*rbio)->kmalloc = true;

if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params)) {
ret = -BCH_ERR_nopromote_in_flight;
goto err;
}

bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

struct data_update_opts update_opts = {};

if (!have_io_error(failed)) {
update_opts.target = opts.promote_target;
update_opts.extra_replicas = 1;
update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
} else {
update_opts.target = opts.foreground_target;

struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev))
update_opts.rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
}

ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
orig->opts,
update_opts,
btree_id, k);
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
if (ret) {
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params));
goto err;
}
if (ret)
goto err_remove_hash;

rbio_init_fragment(&op->write.rbio.bio, orig);
op->write.rbio.bounce = true;
op->write.rbio.promote = true;
op->write.op.end_io = promote_done;

return op;
return &op->write.rbio;
err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params));
err:
if (*rbio)
bio_free_pages(&(*rbio)->bio);
kfree(*rbio);
*rbio = NULL;
bio_free_pages(&op->write.op.wbio.bio);
/* We may have added to the rhashtable and thus need rcu freeing: */
kfree_rcu(op, rcu);
err_put:
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
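__promote_alloc()'s rewritten error path uses the classic cascading-goto unwind: err_remove_hash, err, and err_put each undo one step and fall through to the next, so teardown mirrors setup in reverse. A small model of the idiom (malloc/free stand in for the real resources):

#include <stdio.h>
#include <stdlib.h>

static int setup_op(int fail_at)
{
	int ret = -1;
	char *op, *entry;

	op = malloc(16);		/* step 1 */
	if (!op)
		goto err;

	entry = malloc(16);		/* step 2 */
	if (!entry)
		goto err_free_op;

	if (fail_at) {			/* e.g. data-update init failing */
		ret = -fail_at;
		goto err_remove_entry;
	}

	free(entry);
	free(op);
	return 0;

err_remove_entry:			/* undo step 2 */
	free(entry);
err_free_op:				/* undo step 1 */
	free(op);
err:
	return ret;
}

int main(void)
{
	printf("%d %d\n", setup_op(0), setup_op(5));	/* 0 -5 */
	return 0;
}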
noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
struct bvec_iter iter,
struct bkey_s_c k,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
unsigned flags,
struct bch_read_bio **rbio,
struct bch_read_bio *orig,
bool *bounce,
bool *read_full,
struct bch_io_failures *failed)
@ -301,18 +297,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
struct bpos pos = promote_full
? bkey_start_pos(k.k)
: POS(k.k->p.inode, iter.bi_sector);
struct promote_op *promote;
int ret;

ret = should_promote(c, k, pos, opts, flags, failed);
ret = should_promote(c, k, pos, orig->opts, flags, failed);
if (ret)
goto nopromote;

promote = __promote_alloc(trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, opts, sectors, rbio, failed);
struct bch_read_bio *promote =
__promote_alloc(trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, sectors, flags, orig, failed);
if (!promote)
return NULL;

ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
@ -375,20 +374,20 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
BUG_ON(rbio->bounce && !rbio->split);

if (rbio->promote)
promote_free(rbio->c, rbio->promote);
rbio->promote = NULL;

if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

if (rbio->split) {
struct bch_read_bio *parent = rbio->parent;

if (rbio->kmalloc)
kfree(rbio);
else
if (unlikely(rbio->promote)) {
if (!rbio->bio.bi_status)
promote_start(rbio);
else
promote_free(rbio);
} else {
if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

bio_put(&rbio->bio);
}

rbio = parent;
}
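bch2_rbio_free() now owns the promote handoff: when a split fragment that doubles as a promote buffer is released, a clean read launches the promote and an errored read abandons it. A sketch of that dispose-or-launch dispatch (invented names, not the bcachefs types):

#include <stdbool.h>
#include <stdio.h>

struct fragment {
	bool promote;
	int status;
};

static void promote_start(struct fragment *f) { (void)f; puts("promote started"); }
static void promote_free(struct fragment *f)  { (void)f; puts("promote abandoned"); }
static void plain_free(struct fragment *f)    { (void)f; puts("fragment freed"); }

/* Release-time dispatch: the fragment's fate depends on whether it backs a
 * pending promote and whether the read that filled it succeeded. */
static void fragment_put(struct fragment *f)
{
	if (f->promote) {
		if (!f->status)
			promote_start(f);
		else
			promote_free(f);
	} else {
		plain_free(f);
	}
}

int main(void)
{
	struct fragment ok = { .promote = true, .status = 0 };
	struct fragment bad = { .promote = true, .status = -5 };

	fragment_put(&ok);	/* promote started */
	fragment_put(&bad);	/* promote abandoned */
	return 0;
}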
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
struct data_update *u = container_of(rbio, struct data_update, rbio);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
int ret;

flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;

bch2_bkey_buf_init(&sk);

bch2_trans_iter_init(trans, &iter, rbio->data_btree,
rbio->read_pos, BTREE_ITER_slots);
retry:
bch2_trans_begin(trans);
rbio->bio.bi_status = 0;

ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
struct btree_iter iter;
struct bkey_s_c k;
int ret = lockrestart_do(trans,
bkey_err(k = bch2_bkey_get_iter(trans, &iter,
u->btree_id, bkey_start_pos(&u->k.k->k),
0)));
if (ret)
goto err;

bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);

if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
rbio->data_pos.offset -
rbio->pick.crc.offset)) {
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
rbio->hole = true;
goto out;
goto err;
}

ret = __bch2_read_extent(trans, rbio, bvec_iter,
rbio->read_pos,
rbio->data_btree,
k, 0, failed, flags);
bkey_start_pos(&u->k.k->k),
u->btree_id,
bkey_i_to_s_c(u->k.k),
0, failed, flags, -1);
err:
bch2_trans_iter_exit(trans, &iter);

if (ret == READ_RETRY)
goto retry;
if (ret)
goto err;
out:
rbio->bio.bi_status = BLK_STS_IOERR;

BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
bch2_rbio_done(rbio);
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
return;
err:
rbio->bio.bi_status = BLK_STS_IOERR;
goto out;
}

static void bch2_rbio_retry(struct work_struct *work)
@ -483,29 +468,29 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);

rbio->bio.bi_status = 0;
if (!rbio->split)
rbio->bio.bi_status = 0;

rbio = bch2_rbio_free(rbio);

flags |= BCH_READ_IN_RETRY;
flags &= ~BCH_READ_MAY_PROMOTE;
flags |= BCH_READ_in_retry;
flags &= ~BCH_READ_may_promote;
flags &= ~BCH_READ_last_fragment;
flags |= BCH_READ_must_clone;

if (flags & BCH_READ_NODECODE) {
if (flags & BCH_READ_data_update)
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
} else {
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;

else
__bch2_read(c, rbio, iter, inum, &failed, flags);
}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
blk_status_t error)
{
rbio->retry = retry;
rbio->saw_error = true;

if (rbio->flags & BCH_READ_IN_RETRY)
if (rbio->flags & BCH_READ_in_retry)
return;

if (retry == READ_ERR) {
@ -712,32 +697,40 @@ static void __bch2_read_endio(struct work_struct *work)
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);

if (rbio->flags & BCH_READ_NODECODE)
goto nodecode;
if (likely(!(rbio->flags & BCH_READ_data_update))) {
/* Adjust crc to point to subset of data we want: */
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

/* Adjust crc to point to subset of data we want: */
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc_is_compressed(crc)) {
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;

if (crc_is_compressed(crc)) {
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
!c->opts.no_data_io)
goto decompression_err;
} else {
/* don't need to decrypt the entire bio: */
nonce = nonce_add(nonce, crc.offset << 9);
bio_advance(src, crc.offset << 9);

if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
!c->opts.no_data_io)
goto decompression_err;
BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
src->bi_iter.bi_size = dst_iter.bi_size;

ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;

if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;

bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
}
} else {
/* don't need to decrypt the entire bio: */
nonce = nonce_add(nonce, crc.offset << 9);
bio_advance(src, crc.offset << 9);

BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
src->bi_iter.bi_size = dst_iter.bi_size;

ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;
if (rbio->split)
rbio->parent->pick = rbio->pick;

if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;
@ -754,12 +747,9 @@ static void __bch2_read_endio(struct work_struct *work)
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;

promote_start(rbio->promote, rbio);
rbio->promote = NULL;
}
nodecode:
if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {

if (likely(!(rbio->flags & BCH_READ_in_retry))) {
rbio = bch2_rbio_free(rbio);
bch2_rbio_done(rbio);
}
@ -772,8 +762,8 @@ csum_err:
* reading into buffers owned by userspace (that userspace can
* scribble over) - retry the read, bouncing it this time:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
goto out;
}
@ -810,11 +800,11 @@ static void bch2_read_endio(struct bio *bio)
return;
}

if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
(ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
trace_and_count(c, read_reuse_race, &rbio->bio);

if (rbio->flags & BCH_READ_RETRY_IF_STALE)
if (rbio->flags & BCH_READ_retry_if_stale)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else
bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
@ -883,12 +873,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent,
struct bch_io_failures *failed, unsigned flags)
struct bch_io_failures *failed, unsigned flags, int dev)
{
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
int pick_ret;
@ -905,7 +894,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto out_read_done;
}
retry_pick:
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

/* hole or reservation - just zero fill: */
if (!pick_ret)
@ -941,7 +930,7 @@ retry_pick:
* retry path, don't check here, it'll be caught in bch2_read_endio()
* and we'll end up in the retry path:
*/
if ((flags & BCH_READ_IN_RETRY) &&
if ((flags & BCH_READ_in_retry) &&
!pick.ptr.cached &&
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
@ -955,48 +944,52 @@ retry_pick:
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
bch2_trans_unlock(trans);
if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans);
else
bch2_trans_unlock_long(trans);

if (flags & BCH_READ_NODECODE) {
if (!(flags & BCH_READ_data_update)) {
if (!(flags & BCH_READ_last_fragment) ||
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_must_clone;

narrow_crcs = !(flags & BCH_READ_in_retry) &&
bch2_can_narrow_extent_crcs(k, pick.crc);

if (narrow_crcs && (flags & BCH_READ_user_mapped))
flags |= BCH_READ_must_bounce;

EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

if (crc_is_compressed(pick.crc) ||
(pick.crc.csum_type != BCH_CSUM_none &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_user_mapped)) ||
(flags & BCH_READ_must_bounce)))) {
read_full = true;
bounce = true;
}
} else {
read_full = true;
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
percpu_ref_put(&ca->io_ref);
goto hole;
}

iter.bi_size = pick.crc.compressed_size << 9;
goto get_bio;
}

if (!(flags & BCH_READ_LAST_FRAGMENT) ||
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_MUST_CLONE;

narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
bch2_can_narrow_extent_crcs(k, pick.crc);

if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;

EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

if (crc_is_compressed(pick.crc) ||
(pick.crc.csum_type != BCH_CSUM_none &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
(flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
bounce = true;
}

if (orig->opts.promote_target || have_io_error(failed))
promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
&rbio, &bounce, &read_full, failed);
rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
&bounce, &read_full, failed);

if (!read_full) {
EBUG_ON(crc_is_compressed(pick.crc));
@ -1015,7 +1008,7 @@ retry_pick:
pick.crc.offset = 0;
pick.crc.live_size = bvec_iter_sectors(iter);
}
get_bio:

if (rbio) {
/*
* promote already allocated bounce rbio:
@ -1030,17 +1023,16 @@ get_bio:
} else if (bounce) {
unsigned sectors = pick.crc.compressed_size;

rbio = rbio_init(bio_alloc_bioset(NULL,
rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
0,
GFP_NOFS,
&c->bio_read_split),
orig->opts);
orig);

bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
rbio->bounce = true;
rbio->split = true;
} else if (flags & BCH_READ_MUST_CLONE) {
} else if (flags & BCH_READ_must_clone) {
/*
* Have to clone if there were any splits, due to error
* reporting issues (if a split errored, and retrying didn't
@ -1049,11 +1041,10 @@ get_bio:
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
&c->bio_read_split),
orig->opts);
orig);
rbio->bio.bi_iter = iter;
rbio->split = true;
} else {
rbio = orig;
rbio->bio.bi_iter = iter;
@ -1062,11 +1053,8 @@ get_bio:

EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

rbio->c = c;
rbio->submit_time = local_clock();
if (rbio->split)
rbio->parent = orig;
else
if (!rbio->split)
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
@ -1076,20 +1064,14 @@ get_bio:
rbio->hole = 0;
rbio->retry = 0;
rbio->context = 0;
/* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
rbio->version = k.k->bversion;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);

if (flags & BCH_READ_NODECODE)
orig->pick = pick;

rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
@ -1097,18 +1079,19 @@ get_bio:
if (rbio->bounce)
trace_and_count(c, read_bounce, &rbio->bio);

this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
if (!(flags & BCH_READ_data_update))
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

/*
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);

if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
bio_inc_remaining(&orig->bio);
trace_and_count(c, read_split, &orig->bio);
}
@ -1132,10 +1115,10 @@ get_bio:
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

if (unlikely(c->opts.no_data_io)) {
if (likely(!(flags & BCH_READ_IN_RETRY)))
if (likely(!(flags & BCH_READ_in_retry)))
bio_endio(&rbio->bio);
} else {
if (likely(!(flags & BCH_READ_IN_RETRY)))
if (likely(!(flags & BCH_READ_in_retry)))
submit_bio(&rbio->bio);
else
submit_bio_wait(&rbio->bio);
@ -1153,11 +1136,11 @@ get_bio:
goto out;
}

if (likely(!(flags & BCH_READ_IN_RETRY)))
if (likely(!(flags & BCH_READ_in_retry)))
bio_endio(&rbio->bio);
}
out:
if (likely(!(flags & BCH_READ_IN_RETRY))) {
if (likely(!(flags & BCH_READ_in_retry))) {
return 0;
} else {
int ret;
@ -1180,7 +1163,7 @@ out:
}

err:
if (flags & BCH_READ_IN_RETRY)
if (flags & BCH_READ_in_retry)
return READ_ERR;

orig->bio.bi_status = BLK_STS_IOERR;
@ -1188,16 +1171,16 @@ err:

hole:
/*
* won't normally happen in the BCH_READ_NODECODE
* won't normally happen in the BCH_READ_data_update
* (bch2_move_extent()) path, but if we retry and the extent we wanted
* to read no longer exists we have to signal that:
*/
if (flags & BCH_READ_NODECODE)
if (flags & BCH_READ_data_update)
orig->hole = true;

zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
if (flags & BCH_READ_LAST_FRAGMENT)
if (flags & BCH_READ_last_fragment)
bch2_rbio_done(orig);
return 0;
}
@ -1212,7 +1195,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct bkey_s_c k;
int ret;

BUG_ON(flags & BCH_READ_NODECODE);
BUG_ON(flags & BCH_READ_data_update);

bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@ -1262,15 +1245,15 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);

if (bvec_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_last_fragment;

ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
data_btree, k,
offset_into_extent, failed, flags);
offset_into_extent, failed, flags, -1);
if (ret)
goto err;

if (flags & BCH_READ_LAST_FRAGMENT)
if (flags & BCH_READ_last_fragment)
break;

swap(bvec_iter.bi_size, bytes);
@ -35,20 +35,19 @@ struct bch_read_bio {
u16 flags;
union {
struct {
u16 bounce:1,
u16 promote:1,
bounce:1,
split:1,
kmalloc:1,
have_ioref:1,
narrow_crcs:1,
hole:1,
saw_error:1,
retry:2,
context:2;
};
u16 _state;
};

struct bch_devs_list devs_have;

struct extent_ptr_decoded pick;

/*
@ -65,8 +64,6 @@ struct bch_read_bio {
struct bpos data_pos;
struct bversion version;

struct promote_op *promote;

struct bch_io_opts opts;

struct work_struct work;
@ -108,23 +105,32 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
return 0;
}

enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
BCH_READ_LAST_FRAGMENT = 1 << 4,
#define BCH_READ_FLAGS() \
x(retry_if_stale) \
x(may_promote) \
x(user_mapped) \
x(data_update) \
x(last_fragment) \
x(must_bounce) \
x(must_clone) \
x(in_retry)

/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 5,
BCH_READ_MUST_CLONE = 1 << 6,
BCH_READ_IN_RETRY = 1 << 7,
enum __bch_read_flags {
#define x(n) __BCH_READ_##n,
BCH_READ_FLAGS()
#undef x
};

enum bch_read_flags {
#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
BCH_READ_FLAGS()
#undef x
};
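The read-flag list becomes an x-macro expanded twice: once into sequential bit indices, once into the masks built from them, so adding a flag is a one-line change. A compilable model of the same two-pass expansion (generic names, not the bcachefs header):

#include <stdio.h>

#define BIT(n) (1u << (n))

/* Single source of truth for the flag names: */
#define READ_FLAGS()		\
	x(retry_if_stale)	\
	x(may_promote)		\
	x(in_retry)

/* First expansion: sequential bit indices. */
enum __read_flags {
#define x(n) __READ_##n,
	READ_FLAGS()
#undef x
};

/* Second expansion: the corresponding masks. */
enum read_flags {
#define x(n) READ_##n = BIT(__READ_##n),
	READ_FLAGS()
#undef x
};

int main(void)
{
	printf("retry_if_stale=%d may_promote=%d in_retry=%d\n",
	       READ_retry_if_stale, READ_may_promote, READ_in_retry);
	return 0;
}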
int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
struct bvec_iter, struct bpos, enum btree_id,
struct bkey_s_c, unsigned,
struct bch_io_failures *, unsigned);
struct bch_io_failures *, unsigned, int);

static inline void bch2_read_extent(struct btree_trans *trans,
struct bch_read_bio *rbio, struct bpos read_pos,
@ -132,7 +138,7 @@ static inline void bch2_read_extent(struct btree_trans *trans,
unsigned offset_into_extent, unsigned flags)
{
__bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
data_btree, k, offset_into_extent, NULL, flags);
data_btree, k, offset_into_extent, NULL, flags, -1);
}

void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
@ -145,24 +151,39 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,

BUG_ON(rbio->_state);

rbio->c = c;
rbio->start_time = local_clock();
rbio->subvol = inum.subvol;

__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped);
}

static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
struct bch_read_bio *orig)
{
struct bch_read_bio *rbio = to_rbio(bio);

rbio->c = orig->c;
rbio->_state = 0;
rbio->promote = NULL;
rbio->opts = opts;
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
return rbio;
}

static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_fs *c,
struct bch_io_opts opts,
bio_end_io_t end_io)
{
struct bch_read_bio *rbio = to_rbio(bio);

rbio->start_time = local_clock();
rbio->c = c;
rbio->_state = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
return rbio;
}
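__bch2_read_extent() and bch2_bkey_pick_read_device() gain a trailing int dev; every existing caller in this diff passes -1. Presumably -1 means "no device constraint" and a non-negative value pins the read to one device, which would fit this commit's scrub work — treat that reading as an assumption. A sketch of the sentinel-parameter convention:

#include <stdio.h>

/* dev < 0 means "any device"; dev >= 0 restricts the pick to that device.
 * Hypothetical model of the new parameter, not the bcachefs function. */
static int pick_read_device(const int *devs, int nr, int dev)
{
	for (int i = 0; i < nr; i++)
		if (dev < 0 || devs[i] == dev)
			return devs[i];
	return -1;	/* nothing suitable */
}

int main(void)
{
	int devs[] = { 0, 2, 5 };

	printf("%d\n", pick_read_device(devs, 3, -1));	/* 0: first usable */
	printf("%d\n", pick_read_device(devs, 3, 5));	/* 5: forced pick */
	return 0;
}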
@ -374,7 +374,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
op->flags & BCH_WRITE_check_enospc);
bch2_trans_iter_exit(trans, &iter);

if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -403,10 +403,10 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
prt_printf(out, "write error%s: ",
op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
op->flags & BCH_WRITE_move ? "(internal move)" : "");
}

static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
{
__bch2_write_op_error(out, op, op->pos.offset);
}
@ -483,7 +483,7 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);

if (!(op->flags & BCH_WRITE_MOVE))
if (!(op->flags & BCH_WRITE_move))
bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);

@ -529,7 +529,7 @@ static void __bch2_write_index(struct bch_write_op *op)
unsigned dev;
int ret = 0;

if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
if (unlikely(op->flags & BCH_WRITE_io_error)) {
ret = bch2_write_drop_io_error_ptrs(op);
if (ret)
goto err;
@ -538,7 +538,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);

ret = !(op->flags & BCH_WRITE_MOVE)
ret = !(op->flags & BCH_WRITE_move)
? bch2_write_index_default(op)
: bch2_data_update_index_update(op);

@ -570,14 +570,22 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_SUBMITTED;
op->flags |= BCH_WRITE_submitted;
goto out;
}

static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
if (state != wp->state) {
struct task_struct *p = current;
u64 now = ktime_get_ns();
u64 runtime = p->se.sum_exec_runtime +
(now - p->se.exec_start);

if (state == WRITE_POINT_runnable)
wp->last_runtime = runtime;
else if (wp->state == WRITE_POINT_runnable)
wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;

if (wp->last_state_change &&
time_after64(now, wp->last_state_change))
@ -591,7 +599,7 @@ static inline void wp_update_state(struct write_point *wp, bool running)
{
enum write_point_state state;

state = running ? WRITE_POINT_running :
state = running ? WRITE_POINT_runnable:
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
: WRITE_POINT_stopped;

@ -605,8 +613,8 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;

if ((op->flags & BCH_WRITE_SUBMITTED) &&
(op->flags & BCH_WRITE_MOVE))
if ((op->flags & BCH_WRITE_submitted) &&
(op->flags & BCH_WRITE_move))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

spin_lock_irqsave(&wp->writes_lock, flags);
@ -644,11 +652,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
if (!op)
break;

op->flags |= BCH_WRITE_IN_WORKER;
op->flags |= BCH_WRITE_in_worker;

__bch2_write_index(op);

if (!(op->flags & BCH_WRITE_SUBMITTED))
if (!(op->flags & BCH_WRITE_submitted))
__bch2_write(op);
else
bch2_write_done(&op->cl);
@ -672,7 +680,7 @@ static void bch2_write_endio(struct bio *bio)
"data write error: %s",
bch2_blk_status_to_str(bio->bi_status))) {
set_bit(wbio->dev, op->failed.d);
op->flags |= BCH_WRITE_IO_ERROR;
op->flags |= BCH_WRITE_io_error;
}

if (wbio->nocow) {
@ -719,7 +727,7 @@ static void init_append_extent(struct bch_write_op *op,
bch2_extent_crc_append(&e->k_i, crc);

bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
op->flags & BCH_WRITE_CACHED);
op->flags & BCH_WRITE_cached);

bch2_keylist_push(&op->insert_keys);
}
@ -836,7 +844,7 @@ static enum prep_encoded_ret {
struct bch_fs *c = op->c;
struct bio *bio = &op->wbio.bio;

if (!(op->flags & BCH_WRITE_DATA_ENCODED))
if (!(op->flags & BCH_WRITE_data_encoded))
return PREP_ENCODED_OK;

BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
@ -873,7 +881,7 @@ static enum prep_encoded_ret {
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return PREP_ENCODED_CHECKSUM_ERR;

if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
if (bch2_bio_uncompress_inplace(op, bio))
return PREP_ENCODED_ERR;
}

@ -944,9 +952,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
if (ec_buf ||
op->compression_opt ||
(op->csum_type &&
!(op->flags & BCH_WRITE_PAGES_STABLE)) ||
!(op->flags & BCH_WRITE_pages_stable)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
!(op->flags & BCH_WRITE_PAGES_OWNED))) {
!(op->flags & BCH_WRITE_pages_owned))) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
@ -966,7 +974,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
break;

BUG_ON(op->compression_opt &&
(op->flags & BCH_WRITE_DATA_ENCODED) &&
(op->flags & BCH_WRITE_data_encoded) &&
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_opt && !bounce);

@ -1004,7 +1012,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
}
}

if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
if ((op->flags & BCH_WRITE_data_encoded) &&
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
@ -1036,7 +1044,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.compression_type = compression_type;
crc.nonce = nonce;
} else {
if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
if ((op->flags & BCH_WRITE_data_encoded) &&
bch2_rechecksum_bio(c, src, version, op->crc,
NULL, &op->crc,
src_len >> 9,
@ -1210,9 +1218,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)

static void __bch2_nocow_write_done(struct bch_write_op *op)
{
if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
if (unlikely(op->flags & BCH_WRITE_io_error)) {
op->error = -EIO;
} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
} else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}

@ -1241,7 +1249,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct bucket_to_lock *stale_at;
int stale, ret;

if (op->flags & BCH_WRITE_MOVE)
if (op->flags & BCH_WRITE_move)
return;

darray_init(&buckets);
@ -1299,7 +1307,7 @@ retry:
}), GFP_KERNEL|__GFP_NOFAIL);

if (ptr->unwritten)
op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
op->flags |= BCH_WRITE_convert_unwritten;
}

/* Unlock before taking nocow locks, doing IO: */
@ -1307,7 +1315,7 @@ retry:
bch2_trans_unlock(trans);

bch2_cut_front(op->pos, op->insert_keys.top);
if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
if (op->flags & BCH_WRITE_convert_unwritten)
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

darray_for_each(buckets, i) {
@ -1332,7 +1340,7 @@ retry:
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
op->flags |= BCH_WRITE_SUBMITTED;
op->flags |= BCH_WRITE_submitted;
}

op->pos.offset += bio_sectors(bio);
@ -1346,7 +1354,7 @@ retry:
op->insert_keys.top, true);

bch2_keylist_push(&op->insert_keys);
if (op->flags & BCH_WRITE_SUBMITTED)
if (op->flags & BCH_WRITE_submitted)
break;
bch2_btree_iter_advance(&iter);
}
@ -1366,15 +1374,15 @@ err:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
op->error = ret;
op->flags |= BCH_WRITE_SUBMITTED;
op->flags |= BCH_WRITE_submitted;
}

/* fallback to cow write path? */
if (!(op->flags & BCH_WRITE_SUBMITTED)) {
if (!(op->flags & BCH_WRITE_submitted)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
} else if (op->flags & BCH_WRITE_SYNC) {
} else if (op->flags & BCH_WRITE_sync) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
@ -1426,7 +1434,7 @@ static void __bch2_write(struct bch_write_op *op)

if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
if (op->flags & BCH_WRITE_SUBMITTED)
if (op->flags & BCH_WRITE_submitted)
goto out_nofs_restore;
}
again:
@ -1456,7 +1464,7 @@ again:
ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
op->write_point,
&op->devs_have,
op->nr_replicas,
@ -1479,10 +1487,10 @@ again:
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
op->flags |= BCH_WRITE_SUBMITTED;
op->flags |= BCH_WRITE_submitted;

if (unlikely(ret < 0)) {
if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
if (!(op->flags & BCH_WRITE_alloc_nowait)) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
@ -1514,14 +1522,14 @@ err:
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
if ((op->flags & BCH_WRITE_SYNC) ||
(!(op->flags & BCH_WRITE_SUBMITTED) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
if ((op->flags & BCH_WRITE_sync) ||
(!(op->flags & BCH_WRITE_submitted) &&
!(op->flags & BCH_WRITE_in_worker))) {
bch2_wait_on_allocator(c, &op->cl);

__bch2_write_index(op);

if (!(op->flags & BCH_WRITE_SUBMITTED))
if (!(op->flags & BCH_WRITE_submitted))
goto again;
bch2_write_done(&op->cl);
} else {
@ -1542,8 +1550,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)

memset(&op->failed, 0, sizeof(op->failed));

op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_SUBMITTED;
op->flags |= BCH_WRITE_wrote_data_inline;
op->flags |= BCH_WRITE_submitted;

bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

@ -1606,8 +1614,8 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));

if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
op->flags |= BCH_WRITE_ALLOC_NOWAIT;
if (op->flags & BCH_WRITE_only_specified_devs)
op->flags |= BCH_WRITE_alloc_nowait;

op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
@ -1628,13 +1636,14 @@ CLOSURE_CALLBACK(bch2_write)
goto err;
}

if (!(op->flags & BCH_WRITE_MOVE) &&
if (!(op->flags & BCH_WRITE_move) &&
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;
}

this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
if (!(op->flags & BCH_WRITE_move))
this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);

data_len = min_t(u64, bio->bi_iter.bi_size,
@ -20,22 +20,23 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);

void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);

#define BCH_WRITE_FLAGS() \
x(ALLOC_NOWAIT) \
x(CACHED) \
x(DATA_ENCODED) \
x(PAGES_STABLE) \
x(PAGES_OWNED) \
x(ONLY_SPECIFIED_DEVS) \
x(WROTE_DATA_INLINE) \
x(FROM_INTERNAL) \
x(CHECK_ENOSPC) \
x(SYNC) \
x(MOVE) \
x(IN_WORKER) \
x(SUBMITTED) \
x(IO_ERROR) \
x(CONVERT_UNWRITTEN)
x(alloc_nowait) \
x(cached) \
x(data_encoded) \
x(pages_stable) \
x(pages_owned) \
x(only_specified_devs) \
x(wrote_data_inline) \
x(check_enospc) \
x(sync) \
x(move) \
x(in_worker) \
x(submitted) \
x(io_error) \
x(convert_unwritten)

enum __bch_write_flags {
#define x(f) __BCH_WRITE_##f,
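The write-flag names move from shouting case to lowercase, and FROM_INTERNAL drops out of the list. Since the list is an x-macro, the same definition can also be expanded into a string table for debug output — a companion pattern shown below as a hedged illustration (bcachefs may or may not do this elsewhere; the names here are invented):

#include <stdio.h>

#define WRITE_FLAGS()	\
	x(alloc_nowait)	\
	x(cached)	\
	x(sync)

enum __write_flags {
#define x(f) __WRITE_##f,
	WRITE_FLAGS()
#undef x
	__WRITE_NR,
};

/* Expanding the list a second time with # stringification gives a
 * name table indexed by bit position: */
static const char * const write_flag_names[] = {
#define x(f) #f,
	WRITE_FLAGS()
#undef x
};

int main(void)
{
	unsigned flags = (1u << __WRITE_cached) | (1u << __WRITE_sync);

	for (unsigned i = 0; i < __WRITE_NR; i++)
		if (flags & (1u << i))
			printf("%s ", write_flag_names[i]);
	printf("\n");	/* prints: cached sync */
	return 0;
}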
@ -64,7 +64,7 @@ struct bch_write_op {
struct bpos pos;
struct bversion version;

/* For BCH_WRITE_DATA_ENCODED: */
/* For BCH_WRITE_data_encoded: */
struct bch_extent_crc_unpacked crc;

struct write_point_specifier write_point;
@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)

static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
unsigned i;

for (i = 0; i < ARRAY_SIZE(p->list); i++)
INIT_LIST_HEAD(&p->list[i]);
INIT_LIST_HEAD(&p->flushed);
for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
INIT_LIST_HEAD(&p->unflushed[i]);
for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
@ -307,7 +306,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t

bch2_journal_space_available(j);

__bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
__bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
}

void bch2_journal_halt(struct journal *j)
@ -320,6 +319,16 @@ void bch2_journal_halt(struct journal *j)
spin_unlock(&j->lock);
}

void bch2_journal_halt_locked(struct journal *j)
{
lockdep_assert_held(&j->lock);

__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
}

static bool journal_entry_want_write(struct journal *j)
{
bool ret = !journal_entry_is_open(j) ||
@ -382,9 +391,12 @@ static int journal_entry_open(struct journal *j)
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
return JOURNAL_ERR_max_in_flight;

if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX,
c, "cannot start: journal seq overflow"))
if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
if (bch2_fs_emergency_read_only_locked(c))
bch_err(c, "fatal error - emergency read only");
return JOURNAL_ERR_insufficient_devices; /* -EROFS */
}

BUG_ON(!j->cur_entry_sectors);

@ -601,6 +613,16 @@ out:
: -BCH_ERR_journal_res_get_blocked;
}

static unsigned max_dev_latency(struct bch_fs *c)
{
u64 nsecs = 0;

for_each_rw_member(c, ca)
nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);

return nsecs_to_jiffies(nsecs);
}

/*
* Essentially the entry function to the journaling code. When bcachefs is doing
* a btree insert, it calls this function to get the current journal write.
@ -612,17 +634,31 @@ out:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
unsigned flags)
unsigned flags,
struct btree_trans *trans)
{
int ret;

if (closure_wait_event_timeout(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK),
HZ * 10))
HZ))
return ret;

if (trans)
bch2_trans_unlock_long(trans);

struct bch_fs *c = container_of(j, struct bch_fs, journal);
int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);

remaining_wait = max(0, remaining_wait - HZ);

if (closure_wait_event_timeout(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK),
remaining_wait))
return ret;

struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
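The slow path now waits in two phases: roughly one second with the transaction still locked, then, after dropping btree locks, the remainder of a budget scaled to twice the worst observed device write latency, floored at ten seconds. A worked model of that arithmetic with illustrative values:

#include <stdio.h>

#define HZ 250			/* jiffies per second, illustrative */

static long lmax(long a, long b) { return a > b ? a : b; }

int main(void)
{
	long max_dev_latency = 3 * HZ;	/* slowest device writes in ~3s */

	/* total budget: at least 10s, or 2x the slowest device */
	long total = lmax(max_dev_latency * 2, HZ * 10);

	/* phase 1 already waited 1s with locks held; wait the rest unlocked */
	long remaining = lmax(0, total - HZ);

	printf("total=%lds remaining=%lds\n", total / HZ, remaining / HZ);
	return 0;
}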
@ -727,7 +763,7 @@ recheck_need_open:
* livelock:
*/
sched_annotate_sleep();
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;

@ -760,6 +796,7 @@ recheck_need_open:
}

buf->must_flush = true;
j->flushing_seq = max(j->flushing_seq, seq);

if (parent && !closure_wait(&buf->wait, parent))
BUG();
@ -848,7 +885,7 @@ out:
static int __bch2_journal_meta(struct journal *j)
{
struct journal_res res = {};
int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;

@ -1345,8 +1382,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;

j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
j->reservations.unwritten_idx++;
j->reservations.idx = journal_cur_seq(j);

c->last_bucket_seq_cleanup = journal_cur_seq(j);

@ -1602,54 +1638,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
__bch2_journal_debug_to_text(out, j);
spin_unlock(&j->lock);
}

bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;

spin_lock(&j->lock);
if (!test_bit(JOURNAL_running, &j->flags)) {
spin_unlock(&j->lock);
return true;
}

*seq = max(*seq, j->pin.front);

if (*seq >= j->pin.back) {
spin_unlock(&j->lock);
return true;
}

out->atomic++;

pin_list = journal_seq_pin(j, *seq);

prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
printbuf_indent_add(out, 2);

for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list)
prt_printf(out, "\t%px %ps\n", pin, pin->flush);

if (!list_empty(&pin_list->flushed))
prt_printf(out, "flushed:\n");

list_for_each_entry(pin, &pin_list->flushed, list)
prt_printf(out, "\t%px %ps\n", pin, pin->flush);

printbuf_indent_sub(out, 2);

--out->atomic;
spin_unlock(&j->lock);

return false;
}

void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
u64 seq = 0;

while (!bch2_journal_seq_pins_to_text(out, j, &seq))
seq++;
}
@ -193,7 +193,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
static inline struct jset_entry *
journal_res_entry(struct journal *j, struct journal_res *res)
{
return vstruct_idx(j->buf[res->idx].data, res->offset);
return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
}

static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
@ -267,8 +267,9 @@ bool bch2_journal_entry_close(struct journal *);
void bch2_journal_do_writes(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64);

static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
{
unsigned idx = seq & JOURNAL_BUF_MASK;
union journal_res_state s;

s = journal_state_buf_put(j, idx);
@ -276,8 +277,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
bch2_journal_buf_put_final(j, seq);
}

static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
{
unsigned idx = seq & JOURNAL_BUF_MASK;
union journal_res_state s;

s = journal_state_buf_put(j, idx);
@ -306,13 +308,13 @@ static inline void bch2_journal_res_put(struct journal *j,
BCH_JSET_ENTRY_btree_keys,
0, 0, 0);

bch2_journal_buf_put(j, res->idx, res->seq);
bch2_journal_buf_put(j, res->seq);

res->ref = 0;
}

int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned);
unsigned, struct btree_trans *);

/* First bits for BCH_WATERMARK: */
enum journal_res_flags {
@ -361,14 +363,17 @@ static inline int journal_res_get_fast(struct journal *j,
&old.v, new.v));

res->ref = true;
res->idx = old.idx;
res->offset = old.cur_entry_offset;
res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
res->seq = journal_cur_seq(j);
res->seq -= (res->seq - old.idx) & JOURNAL_BUF_MASK;

EBUG_ON(res->seq != le64_to_cpu(j->buf[old.idx].data->seq));
return 1;
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
|
||||
unsigned u64s, unsigned flags)
|
||||
unsigned u64s, unsigned flags,
|
||||
struct btree_trans *trans)
|
||||
{
|
||||
int ret;
|
||||
|
||||
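Worth pausing on journal_res_get_fast() above: a journal_res no longer records a buffer index, only a sequence number, and every consumer derives the slot as seq & JOURNAL_BUF_MASK. The fast path still reads old.idx out of the packed state, then reconstructs the full sequence whose low bits match it. A standalone sketch of that mapping (JOURNAL_BUF_NR = 4 is an assumed value for illustration, not taken from this diff):

	#include <stdio.h>
	#include <stdint.h>

	#define JOURNAL_BUF_NR   4			/* assumed ring size (power of two) */
	#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)

	int main(void)
	{
		uint64_t cur_seq = 45;	/* stand-in for journal_cur_seq() */
		unsigned idx = 3;	/* low bits stashed in the packed reservation state */

		/* recover the newest seq <= cur_seq whose slot equals idx, as the fast path does */
		uint64_t seq = cur_seq - ((cur_seq - idx) & JOURNAL_BUF_MASK);

		printf("cur_seq %llu idx %u -> seq %llu (slot %llu)\n",
		       (unsigned long long) cur_seq, idx,
		       (unsigned long long) seq,
		       (unsigned long long) (seq & JOURNAL_BUF_MASK));
		return 0;	/* prints: cur_seq 45 idx 3 -> seq 43 (slot 3) */
	}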
@@ -380,7 +385,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
 	if (journal_res_get_fast(j, res, flags))
 		goto out;
 
-	ret = bch2_journal_res_get_slowpath(j, res, flags);
+	ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
 	if (ret)
 		return ret;
 out:
@@ -408,6 +413,7 @@ bool bch2_journal_noflush_seq(struct journal *, u64, u64);
 int bch2_journal_meta(struct journal *);
 
 void bch2_journal_halt(struct journal *);
+void bch2_journal_halt_locked(struct journal *);
 
 static inline int bch2_journal_error(struct journal *j)
 {
@@ -429,8 +435,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
 
 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 
 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
 				unsigned nr);

@@ -17,6 +17,7 @@
 #include "sb-clean.h"
 #include "trace.h"
 
 #include <linux/ioprio.h>
+#include <linux/string_choices.h>
 
 void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
@@ -1610,7 +1611,6 @@ static CLOSURE_CALLBACK(journal_write_done)
 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_replicas_padded replicas;
-	union journal_res_state old, new;
 	u64 seq = le64_to_cpu(w->data->seq);
 	int err = 0;
 
@@ -1670,16 +1670,6 @@ static CLOSURE_CALLBACK(journal_write_done)
 		if (j->watermark != BCH_WATERMARK_stripe)
 			journal_reclaim_kick(&c->journal);
 
-		old.v = atomic64_read(&j->reservations.counter);
-		do {
-			new.v = old.v;
-			BUG_ON(journal_state_count(new, new.unwritten_idx));
-			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
-
-			new.unwritten_idx++;
-		} while (!atomic64_try_cmpxchg(&j->reservations.counter,
-					       &old.v, new.v));
-
 		closure_wake_up(&w->wait);
 		completed = true;
 	}
@@ -1694,7 +1684,7 @@ static CLOSURE_CALLBACK(journal_write_done)
 	}
 
 	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
-	    new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+	    j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
 		struct journal_buf *buf = journal_cur_buf(j);
 		long delta = buf->expires - jiffies;
 
@@ -1763,6 +1753,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
 		bio->bi_private		= ca;
+		bio->bi_ioprio		= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
 
 		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
 		ca->prev_journal_sector = bio->bi_iter.bi_sector;

@@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
 		popped = true;
 	}
 
-	if (popped)
+	if (popped) {
 		bch2_journal_space_available(j);
+		__closure_wake_up(&j->reclaim_flush_wait);
+	}
 }
 
 bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
 	pin->seq = 0;
 	list_del_init(&pin->list);
 
+	if (j->reclaim_flush_wait.list.first)
+		__closure_wake_up(&j->reclaim_flush_wait);
+
 	/*
 	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
 	 * writing a new last_seq will now make another bucket available:
@@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
 {
 	if (fn == bch2_btree_node_flush0 ||
 	    fn == bch2_btree_node_flush1)
-		return JOURNAL_PIN_btree;
+		return JOURNAL_PIN_TYPE_btree;
 	else if (fn == bch2_btree_key_cache_journal_flush)
-		return JOURNAL_PIN_key_cache;
+		return JOURNAL_PIN_TYPE_key_cache;
 	else
-		return JOURNAL_PIN_other;
+		return JOURNAL_PIN_TYPE_other;
 }
 
 static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
 	atomic_inc(&pin_list->count);
 	pin->seq	= seq;
 	pin->flush	= flush_fn;
-	list_add(&pin->list, &pin_list->list[type]);
+
+	if (list_empty(&pin_list->unflushed[type]) &&
+	    j->reclaim_flush_wait.list.first)
+		__closure_wake_up(&j->reclaim_flush_wait);
+
+	list_add(&pin->list, &pin_list->unflushed[type]);
 }
 
 void bch2_journal_pin_copy(struct journal *j,
@@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j,
 {
 	struct journal_entry_pin_list *pin_list;
 	struct journal_entry_pin *ret = NULL;
-	unsigned i;
 
 	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
 		if (*seq > seq_to_flush && !allowed_above_seq)
 			break;
 
-		for (i = 0; i < JOURNAL_PIN_NR; i++)
-			if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
-			    ((1U << i) & allowed_above_seq)) {
-				ret = list_first_entry_or_null(&pin_list->list[i],
+		for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+			if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+			    (BIT(i) & allowed_above_seq)) {
+				ret = list_first_entry_or_null(&pin_list->unflushed[i],
 					struct journal_entry_pin, list);
 				if (ret)
 					return ret;
@@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j,
 	}
 
 	if (min_key_cache) {
-		allowed_above |= 1U << JOURNAL_PIN_key_cache;
-		allowed_below |= 1U << JOURNAL_PIN_key_cache;
+		allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
+		allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
 	}
 
 	cond_resched();
@@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j,
 		j->last_flushed = jiffies;
 
 		spin_lock(&j->lock);
-		pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+		pin = journal_get_next_pin(j, seq_to_flush,
+					   allowed_below,
+					   allowed_above, &seq);
 		if (pin) {
 			BUG_ON(j->flush_in_progress);
 			j->flush_in_progress = pin;
@@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j,
 		spin_lock(&j->lock);
 		/* Pin might have been dropped or rearmed: */
 		if (likely(!err && !j->flush_in_progress_dropped))
-			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
 		j->flush_in_progress = NULL;
 		j->flush_in_progress_dropped = false;
 		spin_unlock(&j->lock);
@@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j)
 	return 0;
 }
 
+static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
+					unsigned types)
+{
+	struct journal_entry_pin_list *pin_list;
+	u64 seq;
+
+	spin_lock(&j->lock);
+	fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
+		if (seq > seq_to_flush)
+			break;
+
+		for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+			if ((BIT(i) & types) &&
+			    (!list_empty(&pin_list->unflushed[i]) ||
+			     !list_empty(&pin_list->flushed[i]))) {
+				spin_unlock(&j->lock);
+				return true;
+			}
+	}
+	spin_unlock(&j->lock);
+
+	return false;
+}
+
+static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
+						 unsigned types)
+{
+	return  journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
+		journal_pins_still_flushing(j, seq_to_flush, types);
+}
+
 static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 			      bool *did_work)
 {
-	int ret;
+	int ret = 0;
 
 	ret = bch2_journal_error(j);
 	if (ret)
@@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 	mutex_lock(&j->reclaim_lock);
 
-	if (journal_flush_pins(j, seq_to_flush,
-			       (1U << JOURNAL_PIN_key_cache)|
-			       (1U << JOURNAL_PIN_other), 0, 0, 0) ||
-	    journal_flush_pins(j, seq_to_flush,
-			       (1U << JOURNAL_PIN_btree), 0, 0, 0))
+	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
+				BIT(JOURNAL_PIN_TYPE_key_cache)|
+				BIT(JOURNAL_PIN_TYPE_other))) {
 		*did_work = true;
+		goto unlock;
+	}
+
+	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
+				BIT(JOURNAL_PIN_TYPE_btree))) {
+		*did_work = true;
+		goto unlock;
+	}
 
 	if (seq_to_flush > journal_cur_seq(j))
 		bch2_journal_entry_close(j);
@@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 		!fifo_used(&j->pin);
 
 	spin_unlock(&j->lock);
+unlock:
 	mutex_unlock(&j->reclaim_lock);
 
 	return ret;
@@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 	if (!test_bit(JOURNAL_running, &j->flags))
 		return false;
 
-	closure_wait_event(&j->async_wait,
+	closure_wait_event(&j->reclaim_flush_wait,
 		journal_flush_done(j, seq_to_flush, &did_work));
 
 	return did_work;
@@ -926,3 +975,54 @@ err:
 
 	return ret;
 }
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+	struct journal_entry_pin_list *pin_list;
+	struct journal_entry_pin *pin;
+
+	spin_lock(&j->lock);
+	if (!test_bit(JOURNAL_running, &j->flags)) {
+		spin_unlock(&j->lock);
+		return true;
+	}
+
+	*seq = max(*seq, j->pin.front);
+
+	if (*seq >= j->pin.back) {
+		spin_unlock(&j->lock);
+		return true;
+	}
+
+	out->atomic++;
+
+	pin_list = journal_seq_pin(j, *seq);
+
+	prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
+	printbuf_indent_add(out, 2);
+
+	prt_printf(out, "unflushed:\n");
+	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
+		list_for_each_entry(pin, &pin_list->unflushed[i], list)
+			prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+	prt_printf(out, "flushed:\n");
+	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
+		list_for_each_entry(pin, &pin_list->flushed[i], list)
+			prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+	printbuf_indent_sub(out, 2);
+
+	--out->atomic;
+	spin_unlock(&j->lock);
+
+	return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+	u64 seq = 0;
+
+	while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+		seq++;
+}

@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)
 
 int bch2_journal_flush_device_pins(struct journal *, int);
 
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
 #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */

@@ -53,15 +53,15 @@ struct journal_buf {
  */
 
 enum journal_pin_type {
-	JOURNAL_PIN_btree,
-	JOURNAL_PIN_key_cache,
-	JOURNAL_PIN_other,
-	JOURNAL_PIN_NR,
+	JOURNAL_PIN_TYPE_btree,
+	JOURNAL_PIN_TYPE_key_cache,
+	JOURNAL_PIN_TYPE_other,
+	JOURNAL_PIN_TYPE_NR,
 };
 
 struct journal_entry_pin_list {
-	struct list_head list[JOURNAL_PIN_NR];
-	struct list_head flushed;
+	struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
+	struct list_head flushed[JOURNAL_PIN_TYPE_NR];
 	atomic_t count;
 	struct bch_devs_list devs;
 };
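The split into per-type unflushed[]/flushed[] lists just above is what lets journal_pins_still_flushing() distinguish "nothing left to start" from "flushes started but not yet complete", so bch2_journal_flush_pins() only returns once both are empty. A toy model of that check, with counters standing in for the list_heads (all names here are illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	enum pin_type { PIN_btree, PIN_key_cache, PIN_other, PIN_NR };

	/* toy stand-in for a journal_entry_pin_list: counts instead of list_heads */
	struct pin_list {
		unsigned unflushed[PIN_NR];
		unsigned flushed[PIN_NR];	/* flushes started but not completed */
	};

	/* mirrors journal_pins_still_flushing(): any outstanding work for 'types'? */
	static bool still_flushing(const struct pin_list *pl, unsigned types)
	{
		for (unsigned i = 0; i < PIN_NR; i++)
			if ((types & (1U << i)) &&
			    (pl->unflushed[i] || pl->flushed[i]))
				return true;
		return false;
	}

	int main(void)
	{
		struct pin_list pl = { .flushed = { [PIN_btree] = 1 } };

		/* a flush has started but not finished: the waiter must keep waiting */
		printf("%d\n", still_flushing(&pl, 1U << PIN_btree));	/* 1 */
		pl.flushed[PIN_btree] = 0;
		printf("%d\n", still_flushing(&pl, 1U << PIN_btree));	/* 0 */
		return 0;
	}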
@@ -79,7 +79,6 @@ struct journal_entry_pin {
 
 struct journal_res {
 	bool			ref;
-	u8			idx;
 	u16			u64s;
 	u32			offset;
 	u64			seq;
@@ -95,9 +94,8 @@ union journal_res_state {
 	};
 
 	struct {
-		u64	cur_entry_offset:20,
+		u64	cur_entry_offset:22,
 			idx:2,
-			unwritten_idx:2,
 			buf0_count:10,
 			buf1_count:10,
 			buf2_count:10,
@@ -107,13 +105,13 @@ union journal_res_state {
 
 /* bytes: */
 #define JOURNAL_ENTRY_SIZE_MIN		(64U << 10)	/* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX		(4U  << 20)	/* 4M */
+#define JOURNAL_ENTRY_SIZE_MAX		(4U  << 22)	/* 16M */
 
 /*
  * We stash some journal state as sentinal values in cur_entry_offset:
  * note - cur_entry_offset is in units of u64s
  */
-#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)
+#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 22) - 1)
 
 #define JOURNAL_ENTRY_BLOCKED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 2)
 #define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
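Quick arithmetic on the widened field, using only the defines above: cur_entry_offset counts u64s, so 22 bits address 2^22 * 8 = 32M of entry space, comfortably covering the new 16M JOURNAL_ENTRY_SIZE_MAX while leaving the top of the range free for the sentinel values:

	#include <stdio.h>

	#define JOURNAL_ENTRY_SIZE_MAX    (4U << 22)		/* bytes: 16M */
	#define JOURNAL_ENTRY_OFFSET_MAX  ((1U << 22) - 1)	/* in u64s */
	#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
	#define JOURNAL_ENTRY_CLOSED_VAL  (JOURNAL_ENTRY_OFFSET_MAX - 1)

	int main(void)
	{
		/* a 22-bit offset in u64s covers 2^22 * 8 bytes = 32M */
		printf("max entry %u bytes, offset field covers %u bytes\n",
		       JOURNAL_ENTRY_SIZE_MAX, (JOURNAL_ENTRY_OFFSET_MAX + 1) * 8);
		printf("sentinels: blocked %u closed %u\n",
		       JOURNAL_ENTRY_BLOCKED_VAL, JOURNAL_ENTRY_CLOSED_VAL);
		return 0;
	}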
@@ -226,6 +224,7 @@ struct journal {
 	/* Used when waiting because the journal was full */
 	wait_queue_head_t	wait;
 	struct closure_waitlist	async_wait;
+	struct closure_waitlist	reclaim_flush_wait;
 
 	struct delayed_work	write_work;
 	struct workqueue_struct	*wq;
@@ -236,6 +235,7 @@ struct journal {
 	/* seq, last_seq from the most recent journal entry successfully written */
 	u64			seq_ondisk;
 	u64			flushed_seq_ondisk;
+	u64			flushing_seq;
 	u64			last_seq_ondisk;
 	u64			err_seq;
 	u64			last_empty_seq;

@@ -74,11 +74,7 @@ struct moving_io {
 	unsigned		read_sectors;
 	unsigned		write_sectors;
 
-	struct bch_read_bio	rbio;
-
 	struct data_update	write;
-	/* Must be last since it is variable size */
-	struct bio_vec		bi_inline_vecs[];
 };
 
 static void move_free(struct moving_io *io)
@@ -113,7 +109,20 @@ static void move_write_done(struct bch_write_op *op)
 
 static void move_write(struct moving_io *io)
 {
-	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+	struct moving_context *ctxt = io->write.ctxt;
+
+	if (ctxt->stats) {
+		if (io->write.rbio.bio.bi_status)
+			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+				     &ctxt->stats->sectors_error_uncorrected);
+		else if (io->write.rbio.saw_error)
+			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+				     &ctxt->stats->sectors_error_corrected);
+	}
+
+	if (unlikely(io->write.rbio.bio.bi_status ||
+		     io->write.rbio.hole ||
+		     io->write.data_opts.scrub)) {
 		move_free(io);
 		return;
 	}
@@ -131,7 +140,7 @@ static void move_write(struct moving_io *io)
 	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
 	atomic_inc(&io->write.ctxt->write_ios);
 
-	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+	bch2_data_update_read_done(&io->write);
 }
 
 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
@@ -144,7 +153,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx
 
 static void move_read_endio(struct bio *bio)
 {
-	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+	struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
 	struct moving_context *ctxt = io->write.ctxt;
 
 	atomic_sub(io->read_sectors, &ctxt->read_sectors);
@@ -257,11 +266,6 @@ int bch2_move_extent(struct moving_context *ctxt,
 {
 	struct btree_trans *trans = ctxt->trans;
 	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct moving_io *io;
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned sectors = k.k->size, pages;
 	int ret = -ENOMEM;
 
 	trace_move_extent2(c, k, &io_opts, &data_opts);
@@ -272,7 +276,8 @@ int bch2_move_extent(struct moving_context *ctxt,
 	bch2_data_update_opts_normalize(k, &data_opts);
 
 	if (!data_opts.rewrite_ptrs &&
-	    !data_opts.extra_replicas) {
+	    !data_opts.extra_replicas &&
+	    !data_opts.scrub) {
 		if (data_opts.kill_ptrs)
 			return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
 		return 0;
@@ -284,13 +289,7 @@ int bch2_move_extent(struct moving_context *ctxt,
 	 */
 	bch2_trans_unlock(trans);
 
-	/* write path might have to decompress data: */
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
-
-	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-	io = kzalloc(sizeof(struct moving_io) +
-		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
+	struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
 	if (!io)
 		goto err;
 
@@ -299,31 +298,21 @@ int bch2_move_extent(struct moving_context *ctxt,
 	io->read_sectors	= k.k->size;
 	io->write_sectors	= k.k->size;
 
-	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
-	bio_set_prio(&io->write.op.wbio.bio,
-		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-
-	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
-				 GFP_KERNEL))
-		goto err_free;
-
-	io->rbio.c		= c;
-	io->rbio.opts		= io_opts;
-	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
-	io->rbio.bio.bi_vcnt	= pages;
-	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-	io->rbio.bio.bi_iter.bi_size = sectors << 9;
-
-	io->rbio.bio.bi_opf		= REQ_OP_READ;
-	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
-	io->rbio.bio.bi_end_io		= move_read_endio;
-
-	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
-				    io_opts, data_opts, iter->btree_id, k);
-	if (ret)
-		goto err_free_pages;
-
-	io->write.op.end_io = move_write_done;
+	if (!data_opts.scrub) {
+		ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+					    io_opts, data_opts, iter->btree_id, k);
+		if (ret)
+			goto err_free;
+
+		io->write.op.end_io = move_write_done;
+	} else {
+		bch2_bkey_buf_init(&io->write.k);
+		io->write.op.c = c;
+		io->write.data_opts = data_opts;
+	}
+
+	io->write.rbio.bio.bi_end_io	= move_read_endio;
+	io->write.rbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 
 	if (ctxt->rate)
 		bch2_ratelimit_increment(ctxt->rate, k.k->size);
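Sketch of the layout change in moving_io above: the separate bch_read_bio plus variable-size inline bio_vecs are gone, and the read bio now lives inside the embedded data_update (write.rbio). That is what lets the scrub path reuse the same read machinery with no write side at all. Toy types, purely illustrative, not the real bcachefs structures:

	#include <stdio.h>

	/* toy stand-ins for the real types */
	struct read_bio { int status; };
	struct write_op { int dummy; };

	struct data_update {
		struct read_bio rbio;	/* read side: always present */
		struct write_op op;	/* write side: left untouched when scrubbing */
		int scrub;
	};

	struct moving_io {
		struct data_update write;	/* was: separate rbio + inline vecs */
	};

	int main(void)
	{
		struct moving_io io = { .write.scrub = 1 };

		/* scrub: read (and verify checksums) only, never start the write */
		printf("scrub=%d read status=%d\n",
		       io.write.scrub, io.write.rbio.status);
		return 0;
	}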
@@ -355,18 +344,19 @@ int bch2_move_extent(struct moving_context *ctxt,
 	 * ctxt when doing wakeup
 	 */
 	closure_get(&ctxt->cl);
-	bch2_read_extent(trans, &io->rbio,
-			 bkey_start_pos(k.k),
-			 iter->btree_id, k, 0,
-			 BCH_READ_NODECODE|
-			 BCH_READ_LAST_FRAGMENT);
+	__bch2_read_extent(trans, &io->write.rbio,
+			   io->write.rbio.bio.bi_iter,
+			   bkey_start_pos(k.k),
+			   iter->btree_id, k, 0,
+			   NULL,
+			   BCH_READ_data_update|
+			   BCH_READ_last_fragment,
+			   data_opts.scrub ? data_opts.read_dev : -1);
 	return 0;
-err_free_pages:
-	bio_free_pages(&io->write.op.wbio.bio);
 err_free:
 	kfree(io);
 err:
-	if (ret == -BCH_ERR_data_update_done)
+	if (bch2_err_matches(ret, BCH_ERR_data_update_done))
 		return 0;
 
 	if (bch2_err_matches(ret, EROFS) ||
@@ -626,7 +616,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
 		if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
 			continue;
 
-		if (ret2 == -ENOMEM) {
+		if (bch2_err_matches(ret2, ENOMEM)) {
 			/* memory allocation failure, wait for some IO to finish */
 			bch2_move_ctxt_wait_for_io(ctxt);
 			continue;
@@ -688,21 +678,22 @@ int bch2_move_data(struct bch_fs *c,
 		   bool wait_on_copygc,
 		   move_pred_fn pred, void *arg)
 {
-
 	struct moving_context ctxt;
-	int ret;
 
 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+	int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
 	bch2_moving_ctxt_exit(&ctxt);
 
 	return ret;
 }
 
-int bch2_evacuate_bucket(struct moving_context *ctxt,
-			 struct move_bucket_in_flight *bucket_in_flight,
-			 struct bpos bucket, int gen,
-			 struct data_update_opts _data_opts)
+static int __bch2_move_data_phys(struct moving_context *ctxt,
+			struct move_bucket_in_flight *bucket_in_flight,
+			unsigned dev,
+			u64 bucket_start,
+			u64 bucket_end,
+			unsigned data_types,
+			move_pred_fn pred, void *arg)
 {
 	struct btree_trans *trans = ctxt->trans;
 	struct bch_fs *c = trans->c;
@@ -711,16 +702,20 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 	struct btree_iter iter = {}, bp_iter = {};
 	struct bkey_buf sk;
 	struct bkey_s_c k;
-	struct data_update_opts data_opts;
-	unsigned sectors_moved = 0;
 	struct bkey_buf last_flushed;
 	int ret = 0;
 
-	struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+	struct bch_dev *ca = bch2_dev_tryget(c, dev);
 	if (!ca)
 		return 0;
 
-	trace_bucket_evacuate(c, &bucket);
+	bucket_end = min(bucket_end, ca->mi.nbuckets);
+
+	struct bpos bp_start	= bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
+	struct bpos bp_end	= bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
+	bch2_dev_put(ca);
+	ca = NULL;
 
 	bch2_bkey_buf_init(&last_flushed);
 	bkey_init(&last_flushed.k->k);
@@ -731,8 +726,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 	 */
 	bch2_trans_begin(trans);
 
-	bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
-			     bucket_pos_to_bp_start(ca, bucket), 0);
+	bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
 
 	bch_err_msg(c, ret, "looking up alloc key");
 	if (ret)
@@ -756,7 +750,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 		if (ret)
 			goto err;
 
-		if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
+		if (!k.k || bkey_gt(k.k->p, bp_end))
 			break;
 
 		if (k.k->type != KEY_TYPE_backpointer)
@@ -764,107 +758,145 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 
 		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
 
+		if (ctxt->stats)
+			ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+		if (!(data_types & BIT(bp.v->data_type)))
+			goto next;
+
+		k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
+		ret = bkey_err(k);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			goto err;
+		if (!k.k)
+			goto next;
+
 		if (!bp.v->level) {
-			k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
-			ret = bkey_err(k);
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				continue;
-			if (ret)
-				goto err;
-			if (!k.k)
-				goto next;
-
-			bch2_bkey_buf_reassemble(&sk, c, k);
-			k = bkey_i_to_s_c(sk.k);
-
 			ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
 			if (ret) {
 				bch2_trans_iter_exit(trans, &iter);
 				continue;
 			}
+		}
 
-			data_opts = _data_opts;
-			data_opts.target	= io_opts.background_target;
-			data_opts.rewrite_ptrs	= 0;
-
-			unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
-			unsigned i = 0;
-			const union bch_extent_entry *entry;
-			struct extent_ptr_decoded p;
-			bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
-				if (p.ptr.dev == bucket.inode) {
-					if (p.ptr.cached) {
-						bch2_trans_iter_exit(trans, &iter);
-						goto next;
-					}
-					data_opts.rewrite_ptrs |= 1U << i;
-					break;
-				}
-				i++;
-			}
-
-			ret = bch2_move_extent(ctxt, bucket_in_flight,
-					       &iter, k, io_opts, data_opts);
-			bch2_trans_iter_exit(trans, &iter);
-
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				continue;
-			if (ret == -ENOMEM) {
-				/* memory allocation failure, wait for some IO to finish */
-				bch2_move_ctxt_wait_for_io(ctxt);
-				continue;
-			}
-			if (ret)
-				goto err;
-
-			if (ctxt->stats)
-				atomic64_add(sectors, &ctxt->stats->sectors_seen);
-			sectors_moved += sectors;
-		} else {
-			struct btree *b;
-
-			b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
-			ret = PTR_ERR_OR_ZERO(b);
-			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-				goto next;
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				continue;
-			if (ret)
-				goto err;
-			if (!b)
-				goto next;
-
-			unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
-
-			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
-			bch2_trans_iter_exit(trans, &iter);
-
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				continue;
-			if (ret)
-				goto err;
-
-			if (ctxt->rate)
-				bch2_ratelimit_increment(ctxt->rate, sectors);
-			if (ctxt->stats) {
-				atomic64_add(sectors, &ctxt->stats->sectors_seen);
-				atomic64_add(sectors, &ctxt->stats->sectors_moved);
-			}
-			sectors_moved += btree_sectors(c);
-		}
+		struct data_update_opts data_opts = {};
+		if (!pred(c, arg, k, &io_opts, &data_opts)) {
+			bch2_trans_iter_exit(trans, &iter);
+			goto next;
+		}
+
+		if (data_opts.scrub &&
+		    !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+			bch2_trans_iter_exit(trans, &iter);
+			ret = -BCH_ERR_device_offline;
+			break;
+		}
+
+		bch2_bkey_buf_reassemble(&sk, c, k);
+		k = bkey_i_to_s_c(sk.k);
+
+		/* move_extent will drop locks */
+		unsigned sectors = bp.v->bucket_len;
+
+		if (!bp.v->level)
+			ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+		else if (!data_opts.scrub)
+			ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+		else
+			ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
+
+		bch2_trans_iter_exit(trans, &iter);
+
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret == -ENOMEM) {
+			/* memory allocation failure, wait for some IO to finish */
+			bch2_move_ctxt_wait_for_io(ctxt);
+			continue;
+		}
+		if (ret)
+			goto err;
+
+		if (ctxt->stats)
+			atomic64_add(sectors, &ctxt->stats->sectors_seen);
 next:
 		bch2_btree_iter_advance(&bp_iter);
 	}
 
-	trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
 err:
 	bch2_trans_iter_exit(trans, &bp_iter);
-	bch2_dev_put(ca);
 	bch2_bkey_buf_exit(&sk, c);
 	bch2_bkey_buf_exit(&last_flushed, c);
 	return ret;
 }
 
+static int bch2_move_data_phys(struct bch_fs *c,
+			       unsigned dev,
+			       u64 start,
+			       u64 end,
+			       unsigned data_types,
+			       struct bch_ratelimit *rate,
+			       struct bch_move_stats *stats,
+			       struct write_point_specifier wp,
+			       bool wait_on_copygc,
+			       move_pred_fn pred, void *arg)
+{
+	struct moving_context ctxt;
+
+	bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
+
+	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+	ctxt.stats->phys = true;
+
+	int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
+	bch2_moving_ctxt_exit(&ctxt);
+
+	return ret;
+}
+
+struct evacuate_bucket_arg {
+	struct bpos		bucket;
+	int			gen;
+	struct data_update_opts	data_opts;
+};
+
+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k,
+				 struct bch_io_opts *io_opts,
+				 struct data_update_opts *data_opts)
+{
+	struct evacuate_bucket_arg *arg = _arg;
+
+	*data_opts = arg->data_opts;
+
+	unsigned i = 0;
+	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+		if (ptr->dev == arg->bucket.inode &&
+		    (arg->gen < 0 || arg->gen == ptr->gen) &&
+		    !ptr->cached)
+			data_opts->rewrite_ptrs |= BIT(i);
+		i++;
+	}
+
+	return data_opts->rewrite_ptrs != 0;
+}
+
+int bch2_evacuate_bucket(struct moving_context *ctxt,
+			 struct move_bucket_in_flight *bucket_in_flight,
+			 struct bpos bucket, int gen,
+			 struct data_update_opts data_opts)
+{
+	struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
+
+	return __bch2_move_data_phys(ctxt, bucket_in_flight,
+				     bucket.inode,
+				     bucket.offset,
+				     bucket.offset + 1,
+				     ~0,
+				     evacuate_bucket_pred, &arg);
+}
+
 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
 				struct btree *, struct bch_io_opts *,
 				struct data_update_opts *);
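The rewrite above turns bucket evacuation into one generic walk, __bch2_move_data_phys(), that iterates a device's backpointers and asks a predicate what to do with each extent; evacuation, migration, and scrub become different predicates over the same loop. A compressed sketch of that control flow (types and names here are simplified stand-ins, not the real API):

	#include <stdbool.h>
	#include <stdio.h>

	struct opts { bool scrub; int read_dev; };
	typedef bool (*move_pred_fn)(void *arg, int dev, struct opts *opts);

	/* in the spirit of evacuate_bucket_pred(): act on the target device only */
	static bool evacuate_pred(void *arg, int dev, struct opts *opts)
	{
		(void) opts;	/* real code also fills rewrite_ptrs, checks gen/cached */
		return dev == *(int *) arg;
	}

	/* in the spirit of __bch2_move_data_phys(): walk backpointers, dispatch */
	static void move_data_phys(int dev, move_pred_fn pred, void *arg)
	{
		for (int bp = 0; bp < 3; bp++) {	/* stand-in for the bp iterator */
			struct opts opts = { 0 };
			if (!pred(arg, dev, &opts))
				continue;
			puts(opts.scrub ? "scrub extent" : "move extent");
		}
	}

	int main(void)
	{
		int dev = 0;
		move_data_phys(dev, evacuate_pred, &dev);
		return 0;
	}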
@@ -1103,6 +1135,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
 	return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
 
+static bool scrub_pred(struct bch_fs *c, void *_arg,
+		       struct bkey_s_c k,
+		       struct bch_io_opts *io_opts,
+		       struct data_update_opts *data_opts)
+{
+	struct bch_ioctl_data *arg = _arg;
+
+	if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+			if (p.ptr.dev == arg->migrate.dev) {
+				if (!p.crc.csum_type)
+					return false;
+				break;
+			}
+	}
+
+	data_opts->scrub	= true;
+	data_opts->read_dev	= arg->migrate.dev;
+	return true;
+}
+
 int bch2_data_job(struct bch_fs *c,
 		  struct bch_move_stats *stats,
 		  struct bch_ioctl_data op)
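scrub_pred above declines user extents whose pointer on the scrubbed device carries no checksum, since there is nothing to verify against; btree_ptr_v2 keys are always accepted because btree nodes are checksummed. The same decision, distilled into standalone C with toy types:

	#include <stdbool.h>
	#include <stdio.h>

	struct ptr { int dev; int csum_type; };	/* csum_type 0 == no checksum */

	/* distilled scrub_pred(): scrub only if our device's pointer is checksummed */
	static bool should_scrub(const struct ptr *ptrs, int nr, int scrub_dev,
				 bool is_btree_node)
	{
		if (is_btree_node)	/* btree nodes always carry checksums */
			return true;
		for (int i = 0; i < nr; i++)
			if (ptrs[i].dev == scrub_dev)
				return ptrs[i].csum_type != 0;
		return true;
	}

	int main(void)
	{
		struct ptr p[] = { { .dev = 1, .csum_type = 0 } };

		printf("%d\n", should_scrub(p, 1, 1, false));	/* 0: nothing to verify */
		p[0].csum_type = 1;
		printf("%d\n", should_scrub(p, 1, 1, false));	/* 1 */
		return 0;
	}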
@@ -1117,6 +1173,16 @@ int bch2_data_job(struct bch_fs *c,
 	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
 
 	switch (op.op) {
+	case BCH_DATA_OP_scrub:
+		ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+					  op.scrub.data_types,
+					  NULL,
+					  stats,
+					  writepoint_hashed((unsigned long) current),
+					  false,
+					  scrub_pred, &op) ?: ret;
+		break;
+
 	case BCH_DATA_OP_rereplicate:
 		stats->data_type = BCH_DATA_journal;
 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -1215,7 +1281,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
 
 	mutex_lock(&ctxt->lock);
 	list_for_each_entry(io, &ctxt->ios, io_list)
-		bch2_data_update_to_text(out, &io->write);
+		bch2_data_update_inflight_to_text(out, &io->write);
 	mutex_unlock(&ctxt->lock);
 
 	printbuf_indent_sub(out, 4);

@@ -3,17 +3,31 @@
 #define _BCACHEFS_MOVE_TYPES_H
 
 #include "bbpos_types.h"
+#include "bcachefs_ioctl.h"
 
 struct bch_move_stats {
-	enum bch_data_type	data_type;
-	struct bbpos		pos;
 	char			name[32];
+	bool			phys;
+	enum bch_ioctl_data_event_ret	ret;
+
+	union {
+		struct {
+			enum bch_data_type	data_type;
+			struct bbpos		pos;
+		};
+		struct {
+			unsigned		dev;
+			u64			offset;
+		};
+	};
 
 	atomic64_t		keys_moved;
 	atomic64_t		keys_raced;
 	atomic64_t		sectors_seen;
 	atomic64_t		sectors_moved;
 	atomic64_t		sectors_raced;
+	atomic64_t		sectors_error_corrected;
+	atomic64_t		sectors_error_uncorrected;
 };
 
 struct move_bucket_key {

@@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt,
 	};
 	move_buckets buckets = { 0 };
 	struct move_bucket_in_flight *f;
-	u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+	u64 sectors_seen	= atomic64_read(&ctxt->stats->sectors_seen);
+	u64 sectors_moved	= atomic64_read(&ctxt->stats->sectors_moved);
 	int ret = 0;
 
 	ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
@@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt,
 		*did_work = true;
 	}
 err:
-	darray_exit(&buckets);
 
 	/* no entries in LRU btree found, or got to end: */
 	if (bch2_err_matches(ret, ENOENT))
@@ -254,8 +254,11 @@ err:
 	if (ret < 0 && !bch2_err_matches(ret, EROFS))
 		bch_err_msg(c, ret, "from bch2_move_data()");
 
-	moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
-	trace_and_count(c, copygc, c, moved, 0, 0, 0);
+	sectors_seen	= atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
+	sectors_moved	= atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
+	trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
 
+	darray_exit(&buckets);
 	return ret;
 }
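The reworked copygc trace above reports per-pass deltas from monotonically increasing counters: snapshot before the pass, subtract after. The pattern in miniature:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_ullong sectors_moved;	/* global, only ever increases */

	static void do_pass(void) { atomic_fetch_add(&sectors_moved, 128); }

	int main(void)
	{
		unsigned long long before = atomic_load(&sectors_moved);

		do_pass();

		/* delta for this pass only, as bch2_copygc() computes for the trace */
		printf("moved %llu sectors this pass\n",
		       atomic_load(&sectors_moved) - before);
		return 0;
	}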
@@ -476,13 +476,13 @@ enum fsck_err_opts {
 	  NULL,		"Enable nocow mode: enables runtime locking in\n"\
 			"data move path needed if nocow will ever be in use\n")\
 	x(copygc_enabled,		u8,				\
-	  OPT_FS|OPT_MOUNT,						\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		true,				\
 	  NULL,		"Enable copygc: disable for debugging, or to\n"\
 			"quiet the system when doing performance testing\n")\
 	x(rebalance_enabled,		u8,				\
-	  OPT_FS|OPT_MOUNT,						\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		true,				\
 	  NULL,		"Enable rebalance: disable for debugging, or to\n"\
@@ -659,18 +659,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
 struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
 bool bch2_opt_is_inode_opt(enum bch_opt_id);
 
-/* rebalance opts: */
-
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts)
-{
-	return (struct bch_extent_rebalance) {
-		.type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name)							\
-		._name = opts->_name,					\
-		._name##_from_inode = opts->_name##_from_inode,
-	BCH_REBALANCE_OPTS()
-#undef x
-	};
-};
-
 #endif /* _BCACHEFS_OPTS_H */

@@ -121,12 +121,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
 		}
 	}
 incompressible:
-	if (opts->background_target &&
-	    bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) {
+	if (opts->background_target)
 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 			if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
 				sectors += p.crc.compressed_size;
-	}
 
 	return sectors;
 }
@@ -140,7 +138,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt
 	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
 
 	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
-		struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts);
+		struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
 		return old == NULL || memcmp(old, &new, sizeof(new));
 	} else {
 		return old != NULL;
@@ -163,7 +161,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
 			k.k->u64s += sizeof(*old) / sizeof(u64);
 		}
 
-		*old = io_opts_to_rebalance_opts(opts);
+		*old = io_opts_to_rebalance_opts(c, opts);
 	} else {
 		if (old)
 			extent_entry_drop(k, (union bch_extent_entry *) old);
@@ -343,7 +341,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 	memset(data_opts, 0, sizeof(*data_opts));
 	data_opts->rewrite_ptrs		= bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
 	data_opts->target		= io_opts->background_target;
-	data_opts->write_flags		|= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+	data_opts->write_flags		|= BCH_WRITE_only_specified_devs;
 
 	if (!data_opts->rewrite_ptrs) {
 		/*
@@ -451,7 +449,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
 {
 	data_opts->rewrite_ptrs		= bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
 	data_opts->target		= io_opts->background_target;
-	data_opts->write_flags		|= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+	data_opts->write_flags		|= BCH_WRITE_only_specified_devs;
 	return data_opts->rewrite_ptrs != 0;
 }
 

@@ -4,8 +4,28 @@
 
 #include "compress.h"
+#include "disk_groups.h"
 #include "opts.h"
 #include "rebalance_types.h"
 
+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
+								    struct bch_io_opts *opts)
+{
+	struct bch_extent_rebalance r = {
+		.type = BIT(BCH_EXTENT_ENTRY_rebalance),
+#define x(_name)							\
+		._name = opts->_name,					\
+		._name##_from_inode = opts->_name##_from_inode,
+	BCH_REBALANCE_OPTS()
+#undef x
+	};
+
+	if (r.background_target &&
+	    !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
+		r.background_target = 0;
+
+	return r;
+};
+
 u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
 int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
 int bch2_get_update_rebalance_opts(struct btree_trans *,
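io_opts_to_rebalance_opts() now takes the filesystem so it can zero out a background_target that no longer accepts user data; callers then behave as if no target were set instead of endlessly retrying unsatisfiable extents. The sanitizing step in isolation (toy types, stubbed predicate):

	#include <stdbool.h>
	#include <stdio.h>

	struct fs   { unsigned valid_targets; };	/* toy: bitmask of live targets */
	struct opts { unsigned background_target; };

	static bool target_accepts_data(const struct fs *c, unsigned target)
	{
		return c->valid_targets & (1U << target);
	}

	/* as in the new io_opts_to_rebalance_opts(): sanitize while converting */
	static unsigned rebalance_target(const struct fs *c, const struct opts *o)
	{
		unsigned t = o->background_target;

		if (t && !target_accepts_data(c, t))
			t = 0;	/* stale target: behave as if none was set */
		return t;
	}

	int main(void)
	{
		struct fs c = { .valid_targets = 1U << 2 };
		struct opts o = { .background_target = 3 };

		printf("%u\n", rebalance_target(&c, &o));	/* 0: dropped */
		return 0;
	}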
@@ -32,7 +32,6 @@
 #include <linux/sort.h>
 #include <linux/stat.h>
 
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
 int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
 {

@@ -5,7 +5,13 @@
 
 /* BCH_SB_FIELD_counters */
 
-static const char * const bch2_counter_names[] = {
+static const u8 counters_to_stable_map[] = {
+#define x(n, id, ...)	[BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
+	BCH_PERSISTENT_COUNTERS()
+#undef x
+};
+
+const char * const bch2_counter_names[] = {
 #define x(t, n, ...) (#t),
 	BCH_PERSISTENT_COUNTERS()
 #undef x
@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
 		return 0;
 
 	return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-};
+}
 
 static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
 				enum bch_validate_flags flags, struct printbuf *err)
 {
 	return 0;
-};
+}
 
 static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
 				     struct bch_sb_field *f)
@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
 	struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
 	unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
 
-	for (unsigned i = 0; i < nr; i++)
-		prt_printf(out, "%s \t%llu\n",
-			   i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
-			   le64_to_cpu(ctrs->d[i]));
-};
+	for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+		unsigned stable = counters_to_stable_map[i];
+		if (stable < nr)
+			prt_printf(out, "%s \t%llu\n",
+				   bch2_counter_names[i],
+				   le64_to_cpu(ctrs->d[stable]));
+	}
+}
 
 int bch2_sb_counters_to_cpu(struct bch_fs *c)
 {
 	struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
-	unsigned int i;
 	unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-	u64 val = 0;
 
-	for (i = 0; i < BCH_COUNTER_NR; i++)
+	for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
 		c->counters_on_mount[i] = 0;
 
-	for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
-		val = le64_to_cpu(ctrs->d[i]);
-		percpu_u64_set(&c->counters[i], val);
-		c->counters_on_mount[i] = val;
+	for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+		unsigned stable = counters_to_stable_map[i];
+		if (stable < nr) {
+			u64 v = le64_to_cpu(ctrs->d[stable]);
+			percpu_u64_set(&c->counters[i], v);
+			c->counters_on_mount[i] = v;
+		}
 	}
 
 	return 0;
-};
+}
 
 int bch2_sb_counters_from_cpu(struct bch_fs *c)
 {
 	struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
 	struct bch_sb_field_counters *ret;
-	unsigned int i;
 	unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
 
 	if (nr < BCH_COUNTER_NR) {
 		ret = bch2_sb_field_resize(&c->disk_sb, counters,
-				sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
-
+					   sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
 		if (ret) {
 			ctrs = ret;
 			nr = bch2_sb_counter_nr_entries(ctrs);
 		}
 	}
 
+	for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+		unsigned stable = counters_to_stable_map[i];
+		if (stable < nr)
+			ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+	}
 
-	for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
-		ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
 	return 0;
 }
@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = {
 	.validate	= bch2_sb_counters_validate,
 	.to_text	= bch2_sb_counters_to_text,
 };
+
+#ifndef NO_BCACHEFS_CHARDEV
+long bch2_ioctl_query_counters(struct bch_fs *c,
+			       struct bch_ioctl_query_counters __user *user_arg)
+{
+	struct bch_ioctl_query_counters arg;
+	int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
+	if (ret)
+		return ret;
+
+	if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
+	    arg.pad)
+		return -EINVAL;
+
+	arg.nr = min(arg.nr, BCH_COUNTER_NR);
+	ret = put_user(arg.nr, &user_arg->nr);
+	if (ret)
+		return ret;
+
+	for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+		unsigned stable = counters_to_stable_map[i];
+
+		if (stable < arg.nr) {
+			u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
+				? percpu_u64_get(&c->counters[i])
+				: c->counters_on_mount[i];
+
+			ret = put_user(v, &user_arg->d[stable]);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+#endif
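The counters_to_stable_map pattern above decouples the in-memory enum order from the on-disk slot order: the enum can be resorted at will while each counter's stable ID pins its superblock slot. A toy version of the same x-macro trick, with made-up counters and IDs:

	#include <stdio.h>

	/* x(name, stable_id) -- toy counter list; names and ids are made up */
	#define COUNTERS() x(reads, 0) x(writes, 2) x(moves, 1)

	enum counter {
	#define x(n, id) CTR_##n,
		COUNTERS()
	#undef x
		CTR_NR
	};

	static const unsigned char to_stable[] = {
	#define x(n, id) [CTR_##n] = id,
		COUNTERS()
	#undef x
	};

	int main(void)
	{
		unsigned long long ondisk[3] = { 10, 20, 30 };	/* indexed by stable id */

		/* read each counter through the map, as bch2_sb_counters_to_cpu() does */
	#define x(n, id) printf(#n " = %llu\n", ondisk[to_stable[CTR_##n]]);
		COUNTERS()
	#undef x
		return 0;
	}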
@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *);
 void bch2_fs_counters_exit(struct bch_fs *);
 int bch2_fs_counters_init(struct bch_fs *);
 
+extern const char * const bch2_counter_names[];
 extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
 
+long bch2_ioctl_query_counters(struct bch_fs *,
+			       struct bch_ioctl_query_counters __user *);
+
 #endif // _BCACHEFS_SB_COUNTERS_H

@@ -13,6 +13,7 @@ enum counters_flags {
 	x(io_move,				2,	TYPE_SECTORS)	\
 	x(bucket_invalidate,			3,	TYPE_COUNTER)	\
 	x(bucket_discard,			4,	TYPE_COUNTER)	\
+	x(bucket_discard_fast,			79,	TYPE_COUNTER)	\
 	x(bucket_alloc,				5,	TYPE_COUNTER)	\
 	x(bucket_alloc_fail,			6,	TYPE_COUNTER)	\
 	x(btree_cache_scan,			7,	TYPE_COUNTER)	\
@@ -95,6 +96,13 @@ enum bch_persistent_counters {
 	BCH_COUNTER_NR
 };
 
+enum bch_persistent_counters_stable {
+#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
+	BCH_PERSISTENT_COUNTERS()
+#undef x
+	BCH_COUNTER_STABLE_NR
+};
+
 struct bch_sb_field_counters {
 	struct bch_sb_field	field;
 	__le64			d[];

@@ -57,7 +57,7 @@ enum bch_fsck_flags {
 	x(bset_wrong_sector_offset,			44,	0)		\
 	x(bset_empty,					45,	0)		\
 	x(bset_bad_seq,					46,	0)		\
-	x(bset_blacklisted_journal_seq,			47,	0)		\
+	x(bset_blacklisted_journal_seq,			47,	FSCK_AUTOFIX)	\
 	x(first_bset_blacklisted_journal_seq,		48,	FSCK_AUTOFIX)	\
 	x(btree_node_bad_btree,				49,	0)		\
 	x(btree_node_bad_level,				50,	0)		\

@@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
 	return !percpu_ref_is_zero(&ca->io_ref);
 }
 
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+	rcu_read_lock();
+	struct bch_dev *ca = bch2_dev_rcu(c, dev);
+	bool ret = ca && bch2_dev_is_online(ca);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static inline bool bch2_dev_is_readable(struct bch_dev *ca)
 {
 	return bch2_dev_is_online(ca) &&
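bch2_dev_idx_is_online() above does the index lookup and the online test inside a single RCU read-side section, so the bch_dev cannot be freed between the two steps. A userspace analogue of the same pattern, with a pthread read lock standing in for RCU (purely illustrative):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct dev { bool online; };

	static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
	static struct dev *devs[4];	/* members table; slots may be NULL */

	/* analogous to bch2_dev_idx_is_online(): lookup + test under one read lock */
	static bool dev_idx_is_online(unsigned idx)
	{
		pthread_rwlock_rdlock(&lock);
		struct dev *ca = idx < 4 ? devs[idx] : NULL;
		bool ret = ca && ca->online;
		pthread_rwlock_unlock(&lock);
		return ret;
	}

	int main(void)
	{
		struct dev d = { .online = true };

		devs[1] = &d;
		printf("%d %d\n", dev_idx_is_online(1), dev_idx_is_online(2));
		return 0;
	}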
@@ -31,11 +31,11 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir
 	}
 }
 
-static int fsck_rename_dirent(struct btree_trans *trans,
-			      struct snapshots_seen *s,
-			      const struct bch_hash_desc desc,
-			      struct bch_hash_info *hash_info,
-			      struct bkey_s_c_dirent old)
+static noinline int fsck_rename_dirent(struct btree_trans *trans,
+				       struct snapshots_seen *s,
+				       const struct bch_hash_desc desc,
+				       struct bch_hash_info *hash_info,
+				       struct bkey_s_c_dirent old)
 {
 	struct qstr old_name = bch2_dirent_get_name(old);
 	struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
@@ -71,11 +71,11 @@ static int fsck_rename_dirent(struct btree_trans *trans,
 	return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
 }
 
-static int hash_pick_winner(struct btree_trans *trans,
-			    const struct bch_hash_desc desc,
-			    struct bch_hash_info *hash_info,
-			    struct bkey_s_c k1,
-			    struct bkey_s_c k2)
+static noinline int hash_pick_winner(struct btree_trans *trans,
+				     const struct bch_hash_desc desc,
+				     struct bch_hash_info *hash_info,
+				     struct bkey_s_c k1,
+				     struct bkey_s_c k2)
 {
 	if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
 	    !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
@@ -142,8 +142,8 @@ fsck_err:
 * All versions of the same inode in different snapshots must have the same hash
 * seed/type: verify that the hash info we're using matches the root
 */
-static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
-					      struct bch_hash_info *hash_info)
+static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
+						       struct bch_hash_info *hash_info)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;

@@ -428,7 +428,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 		bch2_bkey_get_iter_typed(trans, &snapshot_iter,
 					 BTREE_ID_snapshots, POS(0, snapid),
 					 0, snapshot);
-	ret = bkey_err(subvol);
+	ret = bkey_err(snapshot);
 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
 				"missing snapshot %u", snapid);
 	if (ret)
@@ -440,6 +440,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 		bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
 					 BTREE_ID_snapshot_trees, POS(0, treeid),
 					 0, snapshot_tree);
+	ret = bkey_err(snapshot_tree);
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+				"missing snapshot tree %u", treeid);
+	if (ret)
+		goto err;
+
 	if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
 		struct bkey_i_snapshot_tree *snapshot_tree_mut =

@@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
 	return ret;
 }
 
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
+{
+	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
+
+	bch2_journal_halt_locked(&c->journal);
+	bch2_fs_read_only_async(c);
+
+	wake_up(&bch2_read_only_wait);
+	return ret;
+}
+
 static int bch2_fs_read_write_late(struct bch_fs *c)
 {
 	int ret;

@@ -29,6 +29,7 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
 struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
 
 bool bch2_fs_emergency_read_only(struct bch_fs *);
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
 void bch2_fs_read_only(struct bch_fs *);
 
 int bch2_fs_read_write(struct bch_fs *);

@@ -176,7 +176,6 @@ read_attribute(btree_reserve_cache);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
 read_attribute(open_buckets_partial);
-read_attribute(write_points);
 read_attribute(nocow_lock_table);
 
 #ifdef BCH_WRITE_REF_DEBUG
@@ -364,9 +363,6 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_open_buckets_partial)
 		bch2_open_buckets_partial_to_text(out, c);
 
-	if (attr == &sysfs_write_points)
-		bch2_write_points_to_text(out, c);
-
 	if (attr == &sysfs_compression_stats)
 		bch2_compression_stats_to_text(out, c);
 
@@ -569,7 +565,6 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_stripes_heap,
 	&sysfs_open_buckets,
 	&sysfs_open_buckets_partial,
-	&sysfs_write_points,
 #ifdef BCH_WRITE_REF_DEBUG
 	&sysfs_write_refs,
 #endif

@@ -727,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail,
 	TP_ARGS(c, str)
 );
 
-TRACE_EVENT(discard_buckets,
+DECLARE_EVENT_CLASS(discard_buckets_class,
 	TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
 		 u64 need_journal_commit, u64 discarded, const char *err),
 	TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
@@ -759,6 +759,18 @@ TRACE_EVENT(discard_buckets,
 		  __entry->err)
 );
 
+DEFINE_EVENT(discard_buckets_class, discard_buckets,
+	TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+		 u64 need_journal_commit, u64 discarded, const char *err),
+	TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
+DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
+	TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+		 u64 need_journal_commit, u64 discarded, const char *err),
+	TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
 TRACE_EVENT(bucket_invalidate,
 	TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
 	TP_ARGS(c, dev, bucket, sectors),
@@ -785,27 +797,6 @@ TRACE_EVENT(bucket_invalidate,
 
 /* Moving IO */
 
-TRACE_EVENT(bucket_evacuate,
-	TP_PROTO(struct bch_fs *c, struct bpos *bucket),
-	TP_ARGS(c, bucket),
-
-	TP_STRUCT__entry(
-		__field(dev_t,		dev		)
-		__field(u32,		dev_idx		)
-		__field(u64,		bucket		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= c->dev;
-		__entry->dev_idx	= bucket->inode;
-		__entry->bucket		= bucket->offset;
-	),
-
-	TP_printk("%d:%d %u:%llu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->dev_idx, __entry->bucket)
-);
-
 DEFINE_EVENT(fs_str, move_extent,
 	TP_PROTO(struct bch_fs *c, const char *str),
 	TP_ARGS(c, str)
@@ -869,65 +860,32 @@ TRACE_EVENT(move_data,
 		  __entry->sectors_raced)
 );
 
-TRACE_EVENT(evacuate_bucket,
-	TP_PROTO(struct bch_fs *c, struct bpos *bucket,
-		 unsigned sectors, unsigned bucket_size,
-		 int ret),
-	TP_ARGS(c, bucket, sectors, bucket_size, ret),
-
-	TP_STRUCT__entry(
-		__field(dev_t,		dev		)
-		__field(u64,		member		)
-		__field(u64,		bucket		)
-		__field(u32,		sectors		)
-		__field(u32,		bucket_size	)
-		__field(int,		ret		)
-	),
-
-	TP_fast_assign(
-		__entry->dev		= c->dev;
-		__entry->member		= bucket->inode;
-		__entry->bucket		= bucket->offset;
-		__entry->sectors	= sectors;
-		__entry->bucket_size	= bucket_size;
-		__entry->ret		= ret;
-	),
-
-	TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->member, __entry->bucket,
-		  __entry->sectors, __entry->bucket_size,
-		  __entry->ret)
-);
-
 TRACE_EVENT(copygc,
 	TP_PROTO(struct bch_fs *c,
-		 u64 sectors_moved, u64 sectors_not_moved,
-		 u64 buckets_moved, u64 buckets_not_moved),
-	TP_ARGS(c,
-		sectors_moved, sectors_not_moved,
-		buckets_moved, buckets_not_moved),
+		 u64 buckets,
+		 u64 sectors_seen,
+		 u64 sectors_moved),
+	TP_ARGS(c, buckets, sectors_seen, sectors_moved),
 
 	TP_STRUCT__entry(
 		__field(dev_t,		dev			)
+		__field(u64,		buckets			)
+		__field(u64,		sectors_seen		)
 		__field(u64,		sectors_moved		)
-		__field(u64,		sectors_not_moved	)
-		__field(u64,		buckets_moved		)
-		__field(u64,		buckets_not_moved	)
 	),
 
 	TP_fast_assign(
 		__entry->dev			= c->dev;
+		__entry->buckets		= buckets;
+		__entry->sectors_seen		= sectors_seen;
 		__entry->sectors_moved		= sectors_moved;
-		__entry->sectors_not_moved	= sectors_not_moved;
-		__entry->buckets_moved		= buckets_moved;
-		__entry->buckets_not_moved	= buckets_moved;
 	),
 
-	TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+	TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->sectors_moved, __entry->sectors_not_moved,
-		  __entry->buckets_moved, __entry->buckets_not_moved)
+		  __entry->buckets,
+		  __entry->sectors_seen,
+		  __entry->sectors_moved)
 );
 
 TRACE_EVENT(copygc_wait,

@@ -670,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r)
 
 #include <linux/uuid.h>
 
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
 static inline bool qstr_eq(const struct qstr l, const struct qstr r)
 {
 	return l.len == r.len && !memcmp(l.name, r.name, l.len);