Update bcachefs sources to 36f049d8029e bcachefs: Don't log duplicate errors in read path

Kent Overstreet 2025-10-18 08:18:51 -04:00
parent 65f334e47a
commit 38a8ddad69
14 changed files with 443 additions and 593 deletions

View File

@ -1 +1 @@
93477de251da483c3fd9b16dca27363c1b44e73f
36f049d8029efb23fc759dbb6f651237a5854980

View File

@ -295,7 +295,7 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
static inline int bch2_trans_commit(struct btree_trans *trans,
struct disk_reservation *disk_res,
u64 *journal_seq,
unsigned flags)
enum bch_trans_commit_flags flags)
{
trans->disk_res = disk_res;
trans->journal_seq = journal_seq;

View File

@ -307,7 +307,8 @@ static int bch2_copygc(struct moving_context *ctxt,
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
.type = BCH_DATA_UPDATE_copygc,
.commit_flags = (unsigned) BCH_WATERMARK_copygc,
};
u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
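
Note how the watermark now travels in the data update's commit flags: BCH_WATERMARK_copygc occupies the low bits of the transaction commit flags, and bch2_data_update_init() later extracts it with `m->op.watermark = m->opts.commit_flags & BCH_WATERMARK_MASK`. A minimal sketch of that packing (the bit width and names here are illustrative, not bcachefs's real definitions):

/* Illustrative only: the real BCH_WATERMARK_* values and mask live in
 * the bcachefs headers; the point is that the watermark occupies the
 * low bits of the commit flags, so one field carries both. */
#define EX_WATERMARK_BITS	3
#define EX_WATERMARK_MASK	((1U << EX_WATERMARK_BITS) - 1)

static inline unsigned ex_watermark_from_commit_flags(unsigned commit_flags)
{
	return commit_flags & EX_WATERMARK_MASK;
}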

View File

@ -27,6 +27,13 @@
#include "util/util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
static int bch2_force_read_device = -1;
module_param_named(force_read_device, bch2_force_read_device, int, 0644);
MODULE_PARM_DESC(force_read_device, "");
#endif
static const char * const bch2_extent_flags_strs[] = {
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
BCH_EXTENT_FLAGS()
@ -172,6 +179,15 @@ static inline bool ptr_better(struct bch_fs *c,
if (unlikely(crc_retry_delta))
return crc_retry_delta < 0;
#ifdef CONFIG_BCACHEFS_DEBUG
if (bch2_force_read_device >= 0) {
int cmp = (p1.ptr.dev == bch2_force_read_device) -
(p2.ptr.dev == bch2_force_read_device);
if (cmp)
return cmp > 0;
}
#endif
/* Pick at random, biased in favor of the faster device: */
return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
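
ptr_better() ends with a latency-weighted coin flip: drawing uniformly from [0, p1_latency + p2_latency) and comparing against p1_latency picks p1 with probability roughly p2_latency / (p1_latency + p2_latency). A standalone sketch of the same idea, with rand() standing in for bch2_get_random_u64_below():

#include <stdint.h>
#include <stdlib.h>

/* Returns nonzero to pick replica 1, with probability
 * ~ p2_latency / (p1_latency + p2_latency): the slower the other
 * device has been, the more often this one is chosen.
 * Assumes p1_latency + p2_latency > 0. */
static int pick_replica_1(uint64_t p1_latency, uint64_t p2_latency)
{
	return (uint64_t)rand() % (p1_latency + p2_latency) > p1_latency;
}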

View File

@ -55,27 +55,6 @@ static int evacuate_bucket_pred(struct btree_trans *, void *,
struct bch_inode_opts *,
struct data_update_opts *);
static noinline void
trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_io_move(c, buf.buf);
}
static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
trace_io_move_read(c, buf.buf);
}
static noinline void
trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
struct bch_inode_opts *io_opts,
@ -114,30 +93,18 @@ trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
static void move_write_done(struct bch_write_op *op)
{
struct data_update *u = container_of(op, struct data_update, op);
struct bch_fs *c = op->c;
struct moving_context *ctxt = u->ctxt;
if (op->error) {
if (trace_io_move_write_fail_enabled()) {
CLASS(printbuf, buf)();
bch2_write_op_to_text(&buf, op);
trace_io_move_write_fail(c, buf.buf);
}
count_event(c, io_move_write_fail);
ctxt->write_error = true;
}
atomic_sub(u->k.k->k.size, &ctxt->write_sectors);
atomic_dec(&ctxt->write_ios);
bch2_data_update_exit(u);
bch2_data_update_exit(u, op->error);
kfree(u);
closure_put(&ctxt->cl);
}
static void move_write(struct data_update *u)
{
struct bch_fs *c = u->op.c;
struct moving_context *ctxt = u->ctxt;
struct bch_read_bio *rbio = &u->rbio;
@ -150,33 +117,6 @@ static void move_write(struct data_update *u)
&ctxt->stats->sectors_error_corrected);
}
/*
* If the extent has been bitrotted, we're going to have to give it a
* new checksum in order to move it - but the poison bit will ensure
* that userspace still gets the appropriate error.
*/
if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
(bch2_bkey_extent_flags(bkey_i_to_s_c(u->k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
nonce, &rbio->bio);
rbio->ret = 0;
}
if (unlikely(rbio->ret || u->data_opts.scrub)) {
bch2_data_update_exit(u);
kfree(u);
return;
}
if (trace_io_move_write_enabled()) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(u->k.k));
trace_io_move_write(c, buf.buf);
}
closure_get(&ctxt->cl);
atomic_add(u->k.k->k.size, &ctxt->write_sectors);
atomic_inc(&ctxt->write_ios);
@ -307,27 +247,9 @@ static int __bch2_move_extent(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
int ret = 0;
if (trace_io_move_enabled())
trace_io_move2(c, k, &io_opts, &data_opts);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
bch2_data_update_opts_normalize(k, &data_opts);
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas &&
!data_opts.scrub) {
if (data_opts.kill_ptrs|data_opts.kill_ec_ptrs) {
this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size);
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
} else {
this_cpu_add(c->counters[BCH_COUNTER_io_move_noop], k.k->size);
return 0;
}
}
struct data_update *u = allocate_dropping_locks(trans, ret,
kzalloc(sizeof(struct data_update), _gfp));
if (!u && !ret)
@ -340,6 +262,8 @@ static int __bch2_move_extent(struct moving_context *ctxt,
if (ret)
goto err;
k = bkey_i_to_s_c(u->k.k);
u->op.end_io = move_write_done;
u->rbio.bio.bi_end_io = move_read_endio;
u->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
@ -357,9 +281,6 @@ static int __bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&u->b->count);
}
if (trace_io_move_read_enabled())
trace_io_move_read2(c, k);
scoped_guard(mutex, &ctxt->lock) {
atomic_add(u->k.k->k.size, &ctxt->read_sectors);
atomic_inc(&ctxt->read_ios);
@ -379,30 +300,14 @@ static int __bch2_move_extent(struct moving_context *ctxt,
iter->btree_id, k, 0,
NULL,
BCH_READ_last_fragment,
data_opts.scrub ? data_opts.read_dev : -1);
data_opts.type == BCH_DATA_UPDATE_scrub ? data_opts.read_dev : -1);
return 0;
err:
if (bch2_err_matches(ret, BCH_ERR_data_update_done))
ret = 0;
if (ret &&
!bch2_err_matches(ret, EROFS) &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
count_event(c, io_move_start_fail);
if (trace_io_move_start_fail_enabled()) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(u->k.k));
prt_str(&buf, ": ");
prt_str(&buf, bch2_err_str(ret));
trace_io_move_start_fail(c, buf.buf);
}
}
bch2_bkey_buf_exit(&u->k);
kfree(u);
return ret;
return bch2_err_matches(ret, BCH_ERR_data_update_done)
? 0
: ret;
}
int bch2_move_extent(struct moving_context *ctxt,
@ -431,18 +336,13 @@ int bch2_move_extent(struct moving_context *ctxt,
if (ret <= 0)
return ret;
if (data_opts.scrub &&
if (data_opts.type == BCH_DATA_UPDATE_scrub &&
!bch2_dev_idx_is_online(c, data_opts.read_dev))
return bch_err_throw(c, device_offline);
struct bkey_buf sk __cleanup(bch2_bkey_buf_exit);
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, k);
k = bkey_i_to_s_c(sk.k);
if (!bkey_is_btree_ptr(k.k))
ret = __bch2_move_extent(ctxt, bucket_in_flight, iter, k, opts, data_opts);
else if (!data_opts.scrub)
else if (data_opts.type != BCH_DATA_UPDATE_scrub)
ret = bch2_btree_node_rewrite_pos(trans, iter->btree_id, level, k.k->p, data_opts.target, 0);
else
ret = bch2_btree_node_scrub(trans, iter->btree_id, level, k, data_opts.read_dev);
@ -743,7 +643,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
continue;
if (bch2_err_matches(ret, BCH_ERR_data_update_fail))
ret = 0; /* failure for this extent, keep going */
if (bch2_err_matches(ret, EROFS))
if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_device_offline))
return ret;
WARN_ONCE(ret, "unhandled error from move_extent: %s", bch2_err_str(ret));
next:
@ -795,11 +696,11 @@ static int evacuate_bucket_pred(struct btree_trans *trans, void *_arg,
if (ptr->dev == arg->bucket.inode &&
(arg->gen < 0 || arg->gen == ptr->gen) &&
!ptr->cached)
data_opts->rewrite_ptrs |= BIT(i);
data_opts->ptrs_rewrite |= BIT(i);
i++;
}
return data_opts->rewrite_ptrs != 0;
return data_opts->ptrs_rewrite != 0;
}
int bch2_evacuate_bucket(struct moving_context *ctxt,
@ -917,17 +818,15 @@ static int rereplicate_pred(struct btree_trans *trans, void *arg,
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ptr->cached &&
(!ca || !ca->mi.durability))
data_opts->kill_ptrs |= BIT(i);
data_opts->ptrs_kill |= BIT(i);
i++;
}
if (!data_opts->kill_ptrs &&
if (!data_opts->ptrs_kill &&
(!nr_good || nr_good >= replicas))
return false;
data_opts->target = 0;
data_opts->extra_replicas = replicas - nr_good;
data_opts->btree_insert_flags = 0;
data_opts->extra_replicas = replicas - nr_good;
return true;
}
@ -938,20 +837,15 @@ static int migrate_pred(struct btree_trans *trans, void *arg,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_ioctl_data *op = arg;
unsigned i = 0;
data_opts->rewrite_ptrs = 0;
data_opts->target = 0;
data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (ptr->dev == op->migrate.dev)
data_opts->rewrite_ptrs |= 1U << i;
i++;
data_opts->ptrs_rewrite |= ptr_bit;
ptr_bit <<= 1;
}
return data_opts->rewrite_ptrs != 0;
return data_opts->ptrs_rewrite != 0;
}
/*
@ -975,12 +869,8 @@ static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
{
if (b->version_ondisk != c->sb.version ||
btree_node_need_rewrite(b) ||
bformat_needs_redo(&b->format)) {
data_opts->target = 0;
data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
bformat_needs_redo(&b->format))
return true;
}
return false;
}
@ -1024,7 +914,7 @@ static int drop_extra_replicas_pred(struct btree_trans *trans, void *arg,
unsigned d = bch2_extent_ptr_durability(c, &p);
if (d && durability - d >= replicas) {
data_opts->kill_ptrs |= BIT(i);
data_opts->ptrs_kill |= BIT(i);
durability -= d;
}
@ -1034,14 +924,14 @@ static int drop_extra_replicas_pred(struct btree_trans *trans, void *arg,
i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
if (p.has_ec && durability - p.ec.redundancy >= replicas) {
data_opts->kill_ec_ptrs |= BIT(i);
data_opts->ptrs_kill_ec |= BIT(i);
durability -= p.ec.redundancy;
}
i++;
}
return (data_opts->kill_ptrs|data_opts->kill_ec_ptrs) != 0;
return (data_opts->ptrs_kill|data_opts->ptrs_kill_ec) != 0;
}
static int scrub_pred(struct btree_trans *trans, void *_arg,
@ -1063,7 +953,7 @@ static int scrub_pred(struct btree_trans *trans, void *_arg,
}
}
data_opts->scrub = true;
data_opts->type = BCH_DATA_UPDATE_scrub;
data_opts->read_dev = arg->migrate.dev;
return true;
}
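
migrate_pred() above illustrates the encoding used throughout the reworked data_update_opts: one decision bit per pointer, in the order the pointers appear in the key. A standalone sketch of the pattern (simplified types, not bcachefs code):

/* Bit i of the returned mask means "rewrite pointer i". */
static unsigned ptrs_to_rewrite(const unsigned *ptr_devs, unsigned nr_ptrs,
				unsigned migrate_dev)
{
	unsigned ptrs_rewrite = 0, ptr_bit = 1;

	for (unsigned i = 0; i < nr_ptrs; i++) {
		if (ptr_devs[i] == migrate_dev)
			ptrs_rewrite |= ptr_bit;
		ptr_bit <<= 1;
	}

	return ptrs_rewrite;
}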

View File

@ -20,7 +20,6 @@ struct moving_context {
struct bch_move_stats *stats;
struct write_point_specifier wp;
bool wait_on_copygc;
bool write_error;
/* For waiting on outstanding reads and writes: */
struct closure cl;

View File

@ -162,24 +162,23 @@ static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
return false;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned i = 0;
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (ptr->dev == dev &&
u->data_opts.rewrite_ptrs & BIT(i))
if (ptr->dev == dev && (u->opts.ptrs_rewrite & ptr_bit))
return true;
i++;
ptr_bit <<= 1;
}
return false;
}
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_inode_opts opts,
unsigned flags,
struct bch_io_failures *failed)
struct bpos pos,
struct bch_inode_opts opts,
unsigned flags,
bool self_healing)
{
if (!have_io_error(failed)) {
if (!self_healing) {
BUG_ON(!opts.promote_target);
if (!(flags & BCH_READ_may_promote)) {
@ -212,19 +211,19 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
static noinline void promote_free(struct bch_read_bio *rbio)
static noinline void promote_free(struct bch_read_bio *rbio, int ret)
{
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
struct bch_fs *c = rbio->c;
int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);
int ret2 = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret2);
async_object_list_del(c, promote, op->list_idx);
async_object_list_del(c, rbio, rbio->list_idx);
bch2_data_update_exit(&op->write);
bch2_data_update_exit(&op->write, ret);
enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
@ -236,7 +235,7 @@ static void promote_done(struct bch_write_op *wop)
struct bch_fs *c = op->write.rbio.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
promote_free(&op->write.rbio);
promote_free(&op->write.rbio, 0);
}
static void promote_start_work(struct work_struct *work)
@ -271,23 +270,27 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
if (!have_io_error(failed)) {
update_opts.type = BCH_DATA_UPDATE_promote;
update_opts.target = orig->opts.promote_target;
update_opts.extra_replicas = 1;
update_opts.write_flags |= BCH_WRITE_cached;
update_opts.write_flags |= BCH_WRITE_only_specified_devs;
} else {
update_opts.type = BCH_DATA_UPDATE_self_heal;
update_opts.target = orig->opts.foreground_target;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) &&
!ptr_being_rewritten(orig, ptr->dev))
update_opts.rewrite_ptrs |= ptr_bit;
!ptr_being_rewritten(orig, ptr->dev)) {
update_opts.ptrs_io_error |= ptr_bit;
update_opts.ptrs_rewrite |= ptr_bit;
}
ptr_bit <<= 1;
}
if (!update_opts.rewrite_ptrs)
if (!update_opts.ptrs_rewrite)
return ERR_PTR(bch_err_throw(c, nopromote_no_rewrites));
}
@ -318,7 +321,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
&orig->opts,
update_opts,
btree_id, k);
op->write.type = BCH_DATA_UPDATE_promote;
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
* -BCH_ERR_ENOSPC_disk_reservation:
@ -333,7 +335,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
return &op->write.rbio;
err_remove_list:
bch2_bkey_buf_exit(&op->write.k);
async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
@ -358,19 +359,30 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
bool *read_full,
struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
bool self_healing = failed != NULL;
/*
* We're in the retry path, but we don't know what to repair yet, and we
* don't want to do a promote here:
*/
if (failed && !failed->nr)
if (self_healing && !failed->nr)
return NULL;
/*
* We're already doing a data update, we don't need to kick off another
* write here - we'll just propagate IO errors back to the parent
* data_update:
*/
if (self_healing && orig->data_update)
return NULL;
struct bch_fs *c = trans->c;
/*
* if failed != NULL we're not actually doing a promote, we're
* recovering from an io/checksum error
*/
bool promote_full = (have_io_error(failed) ||
bool promote_full = (self_healing ||
*read_full ||
READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
@ -380,9 +392,8 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
struct bpos pos = promote_full
? bkey_start_pos(k.k)
: POS(k.k->p.inode, iter.bi_sector);
int ret;
ret = should_promote(c, k, pos, orig->opts, flags, failed);
int ret = should_promote(c, k, pos, orig->opts, flags, self_healing);
if (ret)
goto nopromote;
@ -392,9 +403,6 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, sectors, orig, failed);
if (!promote)
return NULL;
ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
@ -402,8 +410,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
*bounce = true;
*read_full = promote_full;
if (have_io_error(failed))
orig->self_healing = true;
orig->self_healing |= self_healing;
return promote;
nopromote:
@ -493,7 +500,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
if (!rbio->bio.bi_status)
promote_start(rbio);
else
promote_free(rbio);
promote_free(rbio, -EIO);
} else {
async_object_list_del(rbio->c, rbio, rbio->list_idx);
@ -527,7 +534,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
static int get_rbio_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_buf *sk)
{
struct btree_iter iter;
CLASS(btree_iter_uninit, iter)(trans);
struct bkey_s_c k;
try(lockrestart_do(trans,
@ -541,7 +548,6 @@ static int get_rbio_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
break;
}
bch2_trans_iter_exit(&iter);
return 0;
}
@ -592,35 +598,32 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
unsigned flags)
{
struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
bch2_trans_begin(trans);
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
try(lockrestart_do(trans,
bkey_err(k = bch2_bkey_get_iter(trans, &iter,
u->btree_id, bkey_start_pos(&u->k.k->k),
0))));
do {
bch2_trans_begin(trans);
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
ret = bch_err_throw(trans->c, data_read_key_overwritten);
goto err;
}
CLASS(btree_iter_uninit, iter)(trans);
struct bkey_s_c k;
ret = __bch2_read_extent(trans, rbio, bvec_iter,
bkey_start_pos(&u->k.k->k),
u->btree_id,
bkey_i_to_s_c(u->k.k),
0, failed, flags, -1);
err:
bch2_trans_iter_exit(&iter);
try(lockrestart_do(trans,
bkey_err(k = bch2_bkey_get_iter(trans, &iter,
u->btree_id, bkey_start_pos(&u->k.k->k),
0))));
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, BCH_ERR_data_read_retry))
goto retry;
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
ret = bch_err_throw(trans->c, data_read_key_overwritten);
break;
}
ret = __bch2_read_extent(trans, rbio, bvec_iter,
bkey_start_pos(&u->k.k->k),
u->btree_id,
bkey_i_to_s_c(u->k.k),
0, failed, flags, -1);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, BCH_ERR_data_read_retry));
if (ret) {
rbio->bio.bi_status = BLK_STS_IOERR;
@ -631,6 +634,22 @@ err:
return ret;
}
static void propagate_io_error_to_data_update(struct bch_read_bio *rbio,
struct extent_ptr_decoded *pick)
{
struct data_update *u = rbio_data_update(bch2_rbio_parent(rbio));
if (u && !pick->do_ec_reconstruct) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (pick->ptr.dev == ptr->dev)
u->opts.ptrs_io_error |= ptr_bit;
ptr_bit <<= 1;
}
}
}
static void bch2_rbio_retry(struct work_struct *work)
{
struct bch_read_bio *rbio =
@ -657,9 +676,12 @@ static void bch2_rbio_retry(struct work_struct *work)
get_rbio_extent(trans, rbio, &sk);
if (!bkey_deleted(&sk.k->k) &&
bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) {
bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
propagate_io_error_to_data_update(rbio, &rbio->pick);
}
if (!rbio->split) {
rbio->bio.bi_status = 0;
@ -1104,22 +1126,26 @@ retry_pick:
trace_and_count(c, io_read_fail_and_poison, &orig->bio);
}
CLASS(printbuf, buf)();
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
if (!(flags & BCH_READ_in_retry)) {
CLASS(printbuf, buf)();
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
}
goto err;
}
if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
!c->chacha20_key_set) {
CLASS(printbuf, buf)();
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
bch2_bkey_val_to_text(&buf, c, k);
if (!(flags & BCH_READ_in_retry)) {
CLASS(printbuf, buf)();
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
bch_err_ratelimited(c, "%s", buf.buf);
}
ret = bch_err_throw(c, data_read_no_encryption_key);
goto err;
}
@ -1139,6 +1165,7 @@ retry_pick:
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick, false);
propagate_io_error_to_data_update(rbio, &pick);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
goto retry_pick;
}
@ -1354,9 +1381,11 @@ out:
ret = rbio->ret;
rbio = bch2_rbio_free(rbio);
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) {
bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_retry_csum_err);
propagate_io_error_to_data_update(rbio, &pick);
}
return ret;
}
@ -1482,7 +1511,8 @@ err:
}
if (unlikely(ret)) {
if (ret != -BCH_ERR_extent_poisoned) {
if (!(flags & BCH_READ_in_retry) &&
ret != -BCH_ERR_extent_poisoned) {
CLASS(printbuf, buf)();
bch2_read_err_msg_trans(trans, &buf, rbio, POS(inum.inum, bvec_iter.bi_sector));
prt_printf(&buf, "data read error: %s", bch2_err_str(ret));

View File

@ -491,11 +491,12 @@ static int rebalance_set_data_opts(struct btree_trans *trans,
struct bch_fs *c = trans->c;
memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k);
data_opts->type = BCH_DATA_UPDATE_rebalance;
data_opts->ptrs_rewrite = bch2_bkey_ptrs_need_rebalance(c, opts, k);
data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
if (!data_opts->rewrite_ptrs) {
if (!data_opts->ptrs_rewrite) {
/*
* device we would want to write to offline? devices in target
* changed?
@ -507,36 +508,6 @@ static int rebalance_set_data_opts(struct btree_trans *trans,
return 0;
}
if (trace_rebalance_extent_enabled()) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
unsigned move_ptrs = 0;
unsigned compress_ptrs = 0;
u64 sectors = 0;
bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
if (move_ptrs) {
prt_str(&buf, "move=");
bch2_target_to_text(&buf, c, opts->background_target);
prt_str(&buf, " ");
bch2_prt_u64_base2(&buf, move_ptrs);
prt_newline(&buf);
}
if (compress_ptrs) {
prt_str(&buf, "compression=");
bch2_compression_opt_to_text(&buf, opts->background_compression);
prt_str(&buf, " ");
bch2_prt_u64_base2(&buf, compress_ptrs);
prt_newline(&buf);
}
trace_rebalance_extent(c, buf.buf);
}
count_event(c, rebalance_extent);
return 1;
}

View File

@ -30,7 +30,7 @@
#include <linux/ioprio.h>
static const char * const bch2_data_update_type_strs[] = {
#define x(t, n, ...) [n] = #t,
#define x(n) #n,
BCH_DATA_UPDATE_TYPES()
#undef x
NULL
@ -63,37 +63,22 @@ static unsigned bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
}
noinline_for_stack
static void trace_io_move_finish2(struct data_update *u,
struct bkey_i *new,
struct bkey_i *insert)
static void trace_data_update_key_fail2(struct data_update *m,
struct btree_iter *iter,
struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
const char *msg)
{
struct bch_fs *c = u->op.c;
CLASS(printbuf, buf)();
if (m->stats) {
atomic64_inc(&m->stats->keys_raced);
atomic64_add(new.k->p.offset - iter->pos.offset,
&m->stats->sectors_raced);
}
prt_newline(&buf);
count_event(m->op.c, data_update_key_fail);
bch2_data_update_to_text(&buf, u);
prt_newline(&buf);
prt_str_indented(&buf, "new replicas:\t");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
prt_newline(&buf);
prt_str_indented(&buf, "insert:\t");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
trace_io_move_finish(c, buf.buf);
}
noinline_for_stack
static void trace_io_move_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
const char *msg)
{
if (!trace_io_move_fail_enabled())
if (!trace_data_update_key_fail_enabled())
return;
struct bch_fs *c = m->op.c;
@ -113,7 +98,7 @@ static void trace_io_move_fail2(struct data_update *m,
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
if ((ptr_bit & m->opts.ptrs_rewrite) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached)
rewrites_found |= ptr_bit;
@ -125,7 +110,7 @@ static void trace_io_move_fail2(struct data_update *m,
bch2_prt_u64_base2(&buf, rewrites_found);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->opts);
prt_str_indented(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
@ -141,11 +126,11 @@ static void trace_io_move_fail2(struct data_update *m,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
}
trace_io_move_fail(c, buf.buf);
trace_data_update_key_fail(c, buf.buf);
}
noinline_for_stack
static void trace_data_update2(struct data_update *m,
static void trace_data_update_key2(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
{
@ -159,55 +144,7 @@ static void trace_data_update2(struct data_update *m,
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
trace_data_update(c, buf.buf);
}
noinline_for_stack
static void trace_io_move_created_rebalance2(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
{
struct bch_fs *c = m->op.c;
CLASS(printbuf, buf)();
bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
trace_io_move_created_rebalance(c, buf.buf);
count_event(c, io_move_created_rebalance);
}
noinline_for_stack
static int data_update_invalid_bkey(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
{
struct bch_fs *c = m->op.c;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_str(&buf, "about to insert invalid key in data update path");
prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
return bch_err_throw(c, invalid_bkey);
trace_data_update_key(c, buf.buf);
}
static int __bch2_data_update_index_update(struct btree_trans *trans,
@ -243,11 +180,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (ret)
goto err;
struct bkey_i *tmp_k = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(tmp_k);
if (ret)
goto err;
k = bkey_i_to_s_c(tmp_k);
new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
if (!bch2_extents_match(k, old)) {
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
NULL, "no match:");
trace_data_update_key_fail2(m, &iter, k, bkey_i_to_s_c(&new->k_i), NULL, "no match:");
goto nowork;
}
@ -282,24 +225,28 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
* other updates
* @new: extent with new pointers that we'll be adding to @insert
*
* First, drop rewrite_ptrs from @new:
* First, drop ptrs_rewrite from @new:
*/
ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
bch2_extent_ptr_set_cached(c, &m->op.opts,
bkey_i_to_s(insert), ptr);
if ((ptr_bit & m->opts.ptrs_rewrite) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert)))) {
if (ptr_bit & m->opts.ptrs_io_error)
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
else if (!ptr->cached)
bch2_extent_ptr_set_cached(c, &m->op.opts,
bkey_i_to_s(insert), ptr);
rewrites_found |= ptr_bit;
}
ptr_bit <<= 1;
}
if (m->data_opts.rewrite_ptrs &&
if (m->opts.ptrs_rewrite &&
!rewrites_found &&
bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
trace_data_update_key_fail2(m, &iter, k, bkey_i_to_s_c(&new->k_i), insert,
"no rewrites found:");
goto nowork;
}
@ -316,7 +263,7 @@ restart_drop_conflicting_replicas:
}
if (!bkey_val_u64s(&new->k)) {
trace_io_move_fail2(m, k,
trace_data_update_key_fail2(m, &iter, k,
bkey_i_to_s_c(bch2_keylist_front(&op->insert_keys)),
insert, "new replicas conflicted:");
goto nowork;
@ -372,25 +319,9 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
/*
* Check for nonce offset inconsistency:
* This is debug code - we've been seeing this bug rarely, and
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
(struct bkey_validate_context) {
.btree = m->btree_id,
.flags = BCH_VALIDATE_commit,
});
if (unlikely(invalid)) {
ret = data_update_invalid_bkey(m, old, k, insert);
goto out;
}
struct bch_inode_opts opts;
ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->opts.type]) ?:
bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
@ -401,30 +332,20 @@ restart_drop_extra_replicas:
SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto err;
if (trace_data_update_enabled())
trace_data_update2(m, old, k, insert);
if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
trace_io_move_created_rebalance2(m, old, k, insert);
ret = bch2_trans_commit(trans, &op->res,
BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
m->opts.commit_flags);
if (ret)
goto err;
bch2_btree_iter_set_pos(&iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
if (trace_io_move_finish_enabled())
trace_io_move_finish2(m, &new->k_i, insert);
if (trace_data_update_key_enabled())
trace_data_update_key2(m, old, k, insert);
this_cpu_add(c->counters[BCH_COUNTER_data_update_key], new->k.size);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
@ -438,15 +359,6 @@ next:
}
continue;
nowork:
if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->stats->sectors_raced);
}
count_event(c, io_move_fail);
bch2_btree_iter_advance(&iter);
goto next;
}
@ -461,23 +373,101 @@ int bch2_data_update_index_update(struct bch_write_op *op)
return __bch2_data_update_index_update(trans, op);
}
void bch2_data_update_read_done(struct data_update *m)
void bch2_data_update_read_done(struct data_update *u)
{
m->read_done = true;
struct bch_fs *c = u->op.c;
struct bch_read_bio *rbio = &u->rbio;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
u->read_done = true;
/*
* If the extent has been bitrotted, we're going to have to give it a
* new checksum in order to move it - but the poison bit will ensure
* that userspace still gets the appropriate error.
*/
if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
(bch2_bkey_extent_flags(bkey_i_to_s_c(u->k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
struct nonce nonce = extent_nonce(rbio->version, crc);
crc.csum = bch2_checksum_bio(c, crc.csum_type, nonce, &rbio->bio);
rbio->ret = 0;
}
if (unlikely(rbio->ret)) {
u->op.end_io(&u->op);
return;
}
if (u->opts.type == BCH_DATA_UPDATE_scrub && !u->opts.ptrs_io_error) {
u->op.end_io(&u->op);
return;
}
if (u->opts.ptrs_io_error) {
struct bkey_s_c k = bkey_i_to_s_c(u->k.k);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned ptr_bit = 1;
guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if ((u->opts.ptrs_io_error & ptr_bit) &&
!(u->opts.ptrs_rewrite & ptr_bit)) {
u->op.nr_replicas += bch2_extent_ptr_durability(c, &p);
u->opts.ptrs_rewrite |= ptr_bit;
bch2_dev_list_drop_dev(&u->op.devs_have, p.ptr.dev);
}
ptr_bit <<= 1;
}
}
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
BUG_ON(!u->op.wbio.bio.bi_vcnt);
m->op.crc = m->rbio.pick.crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
u->op.crc = crc;
u->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
closure_call(&m->op.cl, bch2_write, NULL, NULL);
closure_call(&u->op.cl, bch2_write, NULL, NULL);
}
void bch2_data_update_exit(struct data_update *update)
static void data_update_trace(struct data_update *u, int ret)
{
struct bch_fs *c = u->op.c;
if (!ret) {
if (trace_data_update_enabled()) {
CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u);
trace_data_update(c, buf.buf);
}
count_event(c, data_update);
} else if (bch2_err_matches(ret, BCH_ERR_data_update_done)) {
if (trace_data_update_no_io_enabled()) {
CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u);
prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(ret));
trace_data_update_no_io(c, buf.buf);
}
count_event(c, data_update_no_io);
} else if (ret != -BCH_ERR_data_update_fail_no_rw_devs) {
if (trace_data_update_fail_enabled()) {
CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u);
prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(ret));
trace_data_update_fail(c, buf.buf);
}
count_event(c, data_update_fail);
}
}
void bch2_data_update_exit(struct data_update *update, int ret)
{
data_update_trace(update, ret);
struct bch_fs *c = update->op.c;
struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
@ -586,6 +576,15 @@ int bch2_update_unwritten_extent(struct btree_trans *trans,
return ret;
}
static void ptr_bits_to_text(struct printbuf *out, unsigned ptrs, const char *name)
{
if (ptrs) {
prt_printf(out, "%s ptrs:\t", name);
bch2_prt_u64_base2(out, ptrs);
prt_newline(out);
}
}
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
@ -593,13 +592,13 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20);
prt_str_indented(out, "rewrite ptrs:\t");
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
prt_str(out, bch2_data_update_type_strs[data_opts->type]);
prt_newline(out);
prt_str_indented(out, "kill ptrs:\t");
bch2_prt_u64_base2(out, data_opts->kill_ptrs);
prt_newline(out);
ptr_bits_to_text(out, data_opts->ptrs_rewrite, "rewrite");
ptr_bits_to_text(out, data_opts->ptrs_io_error, "io error");
ptr_bits_to_text(out, data_opts->ptrs_kill, "kill");
ptr_bits_to_text(out, data_opts->ptrs_kill_ec, "kill ec");
prt_str_indented(out, "target:\t");
bch2_target_to_text(out, c, data_opts->target);
@ -616,17 +615,11 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
prt_newline(out);
prt_str_indented(out, "scrub:\t");
prt_u64(out, data_opts->scrub);
}
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{
prt_str(out, bch2_data_update_type_strs[m->type]);
prt_newline(out);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->opts);
prt_newline(out);
prt_str_indented(out, "old key:\t");
@ -640,7 +633,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
prt_newline(out);
guard(printbuf_indent)(out);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->opts);
if (!m->read_done) {
prt_printf(out, "read:\n");
@ -653,11 +646,11 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
}
}
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
static int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
@ -667,16 +660,16 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct extent_ptr_decoded p = {};
unsigned i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
if (data_opts->kill_ec_ptrs & BIT(i))
if (data_opts->ptrs_kill_ec & BIT(i))
bch2_bkey_drop_ec(n, p.ptr.dev);
i++;
}
while (data_opts->kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
while (data_opts->ptrs_kill) {
unsigned i = 0, drop = __fls(data_opts->ptrs_kill);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), p, entry, i++ == drop);
data_opts->kill_ptrs ^= 1U << drop;
data_opts->ptrs_kill ^= 1U << drop;
}
/*
@ -700,9 +693,9 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_inode_opts *io_opts,
unsigned buf_bytes)
static int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_inode_opts *io_opts,
unsigned buf_bytes)
{
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
@ -727,21 +720,6 @@ static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
return 0;
}
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_inode_opts *io_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
/* write path might have to decompress data: */
unsigned buf_bytes = 0;
bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
}
static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
@ -757,6 +735,7 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
if (*i != BCH_SB_MEMBER_INVALID)
__clear_bit(*i, devs.d);
bool trace = trace_data_update_fail_enabled();
CLASS(printbuf, buf)();
guard(printbuf_atomic)(&buf);
@ -773,7 +752,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
if (trace)
prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
if (!nr_free)
continue;
@ -784,8 +764,12 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
}
if (!nr_replicas) {
prt_printf(&buf, "\nnr_replicas %u < %u", nr_replicas, m->op.nr_replicas);
trace_data_update_done_no_rw_devs(c, buf.buf);
if (trace) {
prt_printf(&buf, "\nnr_replicas %u < %u", nr_replicas, m->op.nr_replicas);
trace_data_update_fail(c, buf.buf);
}
count_event(c, data_update_fail);
return bch_err_throw(c, data_update_fail_no_rw_devs);
}
@ -805,27 +789,12 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret = 0;
if (k.k->p.snapshot) {
ret = bch2_check_key_has_snapshot(trans, iter, k);
if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
/* Can't repair yet, waiting on other recovery passes */
return bch_err_throw(c, data_update_fail_no_snapshot);
}
if (ret < 0)
return ret;
if (ret) /* key was deleted */
return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
bch_err_throw(c, data_update_fail_no_snapshot);
ret = 0;
}
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, k);
m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
? BCH_DATA_UPDATE_copygc
: BCH_DATA_UPDATE_rebalance;
k = bkey_i_to_s_c(m->k.k);
m->btree_id = btree_id;
m->data_opts = data_opts;
m->opts = data_opts;
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
@ -842,9 +811,21 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_pages_owned|
BCH_WRITE_data_encoded|
BCH_WRITE_move|
m->data_opts.write_flags;
m->opts.write_flags;
m->op.compression_opt = io_opts->background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
m->op.watermark = m->opts.commit_flags & BCH_WATERMARK_MASK;
if (k.k->p.snapshot &&
unlikely(ret = bch2_check_key_has_snapshot(trans, iter, k))) {
if (ret > 0) /* key was deleted */
ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
bch_err_throw(c, data_update_fail_no_snapshot);
if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
/* Can't repair yet, waiting on other recovery passes */
ret = bch_err_throw(c, data_update_fail_no_snapshot);
}
goto out;
}
unsigned durability_have = 0, durability_removing = 0;
@ -855,42 +836,48 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned buf_bytes = 0;
bool unwritten = false;
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached) {
guard(rcu)();
if (ptr_bit & m->data_opts.rewrite_ptrs) {
if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size;
scoped_guard(rcu) {
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached) {
if (ptr_bit & m->opts.ptrs_rewrite) {
if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size;
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
durability_removing += bch2_extent_ptr_desired_durability(c, &p);
} else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
durability_removing += bch2_extent_ptr_desired_durability(c, &p);
} else if (!(ptr_bit & m->opts.ptrs_kill)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
} else {
if (m->opts.ptrs_rewrite & ptr_bit) {
m->opts.ptrs_kill |= ptr_bit;
m->opts.ptrs_rewrite ^= ptr_bit;
}
}
/*
* op->csum_type is normally initialized from the fs/file's
* current options - but if an extent is encrypted, we require
* that it stays encrypted:
*/
if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
m->op.nonce = p.crc.nonce + p.crc.offset;
m->op.csum_type = p.crc.csum_type;
}
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
unwritten |= p.ptr.unwritten;
ptr_bit <<= 1;
}
/*
* op->csum_type is normally initialized from the fs/file's
* current options - but if an extent is encrypted, we require
* that it stays encrypted:
*/
if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
m->op.nonce = p.crc.nonce + p.crc.offset;
m->op.csum_type = p.crc.csum_type;
}
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
unwritten |= p.ptr.unwritten;
ptr_bit <<= 1;
}
if (!data_opts.scrub) {
if (m->opts.type != BCH_DATA_UPDATE_scrub) {
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
@ -902,7 +889,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
m->opts.extra_replicas;
/*
* If device(s) were set to durability=0 after data was written to them
@ -920,11 +907,11 @@ int bch2_data_update_init(struct btree_trans *trans,
* was written:
*/
if (!m->op.nr_replicas) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
m->opts.ptrs_kill |= m->opts.ptrs_rewrite;
m->opts.ptrs_rewrite = 0;
/* if iter == NULL, it's just a promote */
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->opts);
if (!ret)
ret = bch_err_throw(c, data_update_done_no_writes_needed);
goto out;
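
As a worked example of the accounting above (numbers hypothetical): suppose io_opts->data_replicas = 2, one pointer is kept with durability 1 (durability_have = 1), and one pointer is being rewritten with desired durability 1 (durability_removing = 1). Then durability_required = max(0, 2 - 1) = 1 and op.nr_replicas = min(1, 1) + extra_replicas = 1, so the update allocates exactly one new replica to stand in for the one being rewritten. If the rewritten pointers sat on devices since set to durability=0, nr_replicas can come out 0, and the pointers are simply dropped via bch2_extent_drop_ptrs() as shown above.
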
@ -949,7 +936,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
m->opts.extra_replicas
? 0
: BCH_DISK_RESERVATION_NOFAIL);
if (ret)
@ -997,32 +984,22 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_trans_unlock(trans);
ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
ret = bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
if (ret)
goto out_nocow_unlock;
return 0;
out_nocow_unlock:
if (c->opts.nocow_enabled)
bch2_bkey_nocow_unlock(c, k, 0);
out:
BUG_ON(!ret);
data_update_trace(m, ret);
bkey_put_dev_refs(c, k, m->ptrs_held);
m->ptrs_held = 0;
bch2_disk_reservation_put(c, &m->op.res);
bch2_bkey_buf_exit(&m->k);
return ret;
}
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
opts->kill_ptrs |= ptr_bit;
opts->rewrite_ptrs ^= ptr_bit;
}
ptr_bit <<= 1;
}
}

View File

@ -4,46 +4,48 @@
#define _BCACHEFS_DATA_UPDATE_H
#include "btree/bkey_buf.h"
#include "btree/update.h"
#include "data/read.h"
#include "data/write_types.h"
struct moving_context;
struct data_update_opts {
unsigned rewrite_ptrs;
unsigned kill_ptrs;
unsigned kill_ec_ptrs;
u16 target;
u8 extra_replicas;
unsigned btree_insert_flags;
unsigned write_flags;
int read_dev;
bool scrub;
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct bch_inode_opts *, struct data_update_opts *);
#define BCH_DATA_UPDATE_TYPES() \
x(copygc, 0) \
x(rebalance, 1) \
x(promote, 2)
#define BCH_DATA_UPDATE_TYPES() \
x(other) \
x(copygc) \
x(rebalance) \
x(promote) \
x(self_heal) \
x(scrub)
enum bch_data_update_types {
#define x(n, id) BCH_DATA_UPDATE_##n = id,
#define x(n) BCH_DATA_UPDATE_##n,
BCH_DATA_UPDATE_TYPES()
#undef x
};
struct data_update_opts {
enum bch_data_update_types type;
u8 ptrs_rewrite;
u8 ptrs_io_error;
u8 ptrs_kill;
u8 ptrs_kill_ec;
u8 extra_replicas;
u16 target;
int read_dev;
enum bch_write_flags write_flags;
enum bch_trans_commit_flags commit_flags;
};
struct data_update {
enum bch_data_update_types type;
bool read_done;
u8 ptrs_held;
/* extent being updated: */
enum btree_id btree_id;
struct bkey_buf k;
struct data_update_opts data_opts;
struct data_update_opts opts;
bool read_done;
u8 ptrs_held;
/* associated with @ctxt */
struct list_head read_list;
@ -72,6 +74,8 @@ struct promote_op {
struct bio_vec bi_inline_vecs[]; /* must be last */
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct bch_inode_opts *, struct data_update_opts *);
void bch2_data_update_to_text(struct printbuf *, struct data_update *);
void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
@ -79,22 +83,12 @@ int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *);
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
struct bkey_s_c,
struct bch_inode_opts *,
struct data_update_opts *);
int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
struct bch_inode_opts *);
void bch2_data_update_exit(struct data_update *);
void bch2_data_update_exit(struct data_update *, int);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *,
struct data_update *,
struct write_point_specifier,
struct bch_inode_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
#endif /* _BCACHEFS_DATA_UPDATE_H */
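
For reference, a usage sketch of the reworked options struct, modeled on rebalance_set_data_opts() earlier in this diff (assumes the declarations above; `background_target` stands for opts->background_target):

struct data_update_opts opts = {
	.type		= BCH_DATA_UPDATE_rebalance,
	.ptrs_rewrite	= 1 << 0,	/* rewrite the key's first pointer */
	.target		= background_target,
	.write_flags	= BCH_WRITE_only_specified_devs,
};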

View File

@ -426,9 +426,11 @@ static int bch2_write_index_default(struct bch_write_op *op)
void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
CLASS(printbuf, buf)();
CLASS(btree_trans, trans)(op->c);
CLASS(printbuf, buf)();
bch2_log_msg_start(op->c, &buf);
struct bpos pos = op->pos;
pos.offset = offset;
@ -440,15 +442,17 @@ void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, .
va_start(args, fmt);
prt_vprintf(&buf, fmt, args);
va_end(args);
prt_newline(&buf);
if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op);
prt_printf(&buf, "\n from internal move ");
prt_printf(&buf, "from internal move ");
bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
prt_newline(&buf);
}
bch_err_ratelimited(op->c, "%s", buf.buf);
bch2_print_str_ratelimited(op->c, KERN_ERR, buf.buf);
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,

View File

@ -797,37 +797,37 @@ TRACE_EVENT(bucket_invalidate,
/* Moving IO */
DEFINE_EVENT(fs_str, io_move,
DEFINE_EVENT(fs_str, data_update,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_read,
DEFINE_EVENT(fs_str, data_update_no_io,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_write,
DEFINE_EVENT(fs_str, data_update_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_finish,
DEFINE_EVENT(fs_str, data_update_key,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_fail,
DEFINE_EVENT(fs_str, data_update_key_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_write_fail,
DEFINE_EVENT(fs_str, io_move_pred,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_start_fail,
DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@ -1317,40 +1317,7 @@ TRACE_EVENT(write_buffer_maybe_flush,
TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
);
DEFINE_EVENT(fs_str, rebalance_extent,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, data_update,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_pred,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_created_rebalance,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, extent_trim_atomic,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
/* BTREE ITER TRACEPOINTS */
DEFINE_EVENT(fs_str, btree_iter_peek_slot,
TP_PROTO(struct bch_fs *c, const char *str),
@ -1372,6 +1339,11 @@ DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, extent_trim_atomic,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
TRACE_EVENT(update_by_path,

View File

@ -26,14 +26,14 @@ enum counters_flags {
x(io_read_narrow_crcs, 97, TYPE_COUNTER) \
x(io_read_narrow_crcs_fail, 98, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(data_update, 2, TYPE_SECTORS) \
x(data_update_no_io, 91, TYPE_COUNTER) \
x(data_update_fail, 82, TYPE_COUNTER) \
x(data_update_key, 37, TYPE_SECTORS) \
x(data_update_key_fail, 38, TYPE_COUNTER) \
x(io_move_read, 35, TYPE_SECTORS) \
x(io_move_write, 36, TYPE_SECTORS) \
x(io_move_finish, 37, TYPE_SECTORS) \
x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
x(io_move_drop_only, 91, TYPE_COUNTER) \
x(io_move_noop, 92, TYPE_COUNTER) \
x(io_move_created_rebalance, 83, TYPE_COUNTER) \
x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \

View File

@ -187,12 +187,8 @@ static int check_subvol_child(struct btree_trans *trans,
le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
trans, subvol_children_bad,
"incorrect entry in subvolume_children btree %llu:%llu",
child_k.k->p.inode, child_k.k->p.offset)) {
ret = bch2_btree_delete_at(trans, child_iter, 0);
if (ret)
goto err;
}
err:
child_k.k->p.inode, child_k.k->p.offset))
try(bch2_btree_delete_at(trans, child_iter, 0));
fsck_err:
return ret;
}
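
try() here is bcachefs's early-return helper. A sketch of roughly what it does (this definition is an assumption inferred from usage, relying on GNU C statement expressions; it is not copied from the tree):

/* Assumed approximation: evaluate an int-returning expression and
 * return from the enclosing function if it failed. */
#define try(_expr)				\
({						\
	int _ret = (_expr);			\
	if (_ret)				\
		return _ret;			\
	0;					\
})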