Update bcachefs sources to 36f049d8029e bcachefs: Don't log duplicate errors in read path
Some checks failed
build / bcachefs-tools-msrv (push) Has been cancelled
.deb build orchestrator / obs (push) Has been cancelled
.deb build orchestrator / source-only (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:forky], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:forky], map[build-arch:amd64 host-arch:ppc64el machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:forky], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:trixie], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:trixie], map[build-arch:amd64 host-arch:ppc64el machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:trixie], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:unstable], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:unstable], map[build-arch:amd64 host-arch:ppc64el machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:unstable], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:plucky], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:plucky], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:questing], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:questing], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / reprotest (push) Has been cancelled
.deb build orchestrator / publish (push) Has been cancelled
Nix Flake actions / nix-matrix (push) Has been cancelled
Nix Flake actions / ${{ matrix.name }} (${{ matrix.system }}) (push) Has been cancelled

This commit is contained in:
Kent Overstreet 2025-10-18 08:18:51 -04:00
parent 65f334e47a
commit 38a8ddad69
14 changed files with 443 additions and 593 deletions

View File

@ -1 +1 @@
93477de251da483c3fd9b16dca27363c1b44e73f 36f049d8029efb23fc759dbb6f651237a5854980

View File

@ -295,7 +295,7 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
static inline int bch2_trans_commit(struct btree_trans *trans, static inline int bch2_trans_commit(struct btree_trans *trans,
struct disk_reservation *disk_res, struct disk_reservation *disk_res,
u64 *journal_seq, u64 *journal_seq,
unsigned flags) enum bch_trans_commit_flags flags)
{ {
trans->disk_res = disk_res; trans->disk_res = disk_res;
trans->journal_seq = journal_seq; trans->journal_seq = journal_seq;

View File

@ -307,7 +307,8 @@ static int bch2_copygc(struct moving_context *ctxt,
struct btree_trans *trans = ctxt->trans; struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct data_update_opts data_opts = { struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc, .type = BCH_DATA_UPDATE_copygc,
.commit_flags = (unsigned) BCH_WATERMARK_copygc,
}; };
u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);

View File

@ -27,6 +27,13 @@
#include "util/util.h" #include "util/util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
/*
 * Debug-only read steering knob.
 *
 * Defaults to -1. ptr_better() only consults it when it is >= 0, in which
 * case a pointer whose device index equals this value is preferred over one
 * whose device index does not.
 */
static int bch2_force_read_device = -1;
module_param_named(force_read_device, bch2_force_read_device, int, 0644);
MODULE_PARM_DESC(force_read_device,
		 "Debug: prefer reading from this device index (negative = disabled)");
#endif
static const char * const bch2_extent_flags_strs[] = { static const char * const bch2_extent_flags_strs[] = {
#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, #define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
BCH_EXTENT_FLAGS() BCH_EXTENT_FLAGS()
@ -172,6 +179,15 @@ static inline bool ptr_better(struct bch_fs *c,
if (unlikely(crc_retry_delta)) if (unlikely(crc_retry_delta))
return crc_retry_delta < 0; return crc_retry_delta < 0;
#ifdef CONFIG_BCACHEFS_DEBUG
if (bch2_force_read_device >= 0) {
int cmp = (p1.ptr.dev == bch2_force_read_device) -
(p2.ptr.dev == bch2_force_read_device);
if (cmp)
return cmp > 0;
}
#endif
/* Pick at random, biased in favor of the faster device: */ /* Pick at random, biased in favor of the faster device: */
return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;

View File

@ -55,27 +55,6 @@ static int evacuate_bucket_pred(struct btree_trans *, void *,
struct bch_inode_opts *, struct bch_inode_opts *,
struct data_update_opts *); struct data_update_opts *);
static noinline void
trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_io_move(c, buf.buf);
}
static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
trace_io_move_read(c, buf.buf);
}
static noinline void static noinline void
trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
struct bch_inode_opts *io_opts, struct bch_inode_opts *io_opts,
@ -114,30 +93,18 @@ trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
static void move_write_done(struct bch_write_op *op) static void move_write_done(struct bch_write_op *op)
{ {
struct data_update *u = container_of(op, struct data_update, op); struct data_update *u = container_of(op, struct data_update, op);
struct bch_fs *c = op->c;
struct moving_context *ctxt = u->ctxt; struct moving_context *ctxt = u->ctxt;
if (op->error) {
if (trace_io_move_write_fail_enabled()) {
CLASS(printbuf, buf)();
bch2_write_op_to_text(&buf, op);
trace_io_move_write_fail(c, buf.buf);
}
count_event(c, io_move_write_fail);
ctxt->write_error = true;
}
atomic_sub(u->k.k->k.size, &ctxt->write_sectors); atomic_sub(u->k.k->k.size, &ctxt->write_sectors);
atomic_dec(&ctxt->write_ios); atomic_dec(&ctxt->write_ios);
bch2_data_update_exit(u);
bch2_data_update_exit(u, op->error);
kfree(u); kfree(u);
closure_put(&ctxt->cl); closure_put(&ctxt->cl);
} }
static void move_write(struct data_update *u) static void move_write(struct data_update *u)
{ {
struct bch_fs *c = u->op.c;
struct moving_context *ctxt = u->ctxt; struct moving_context *ctxt = u->ctxt;
struct bch_read_bio *rbio = &u->rbio; struct bch_read_bio *rbio = &u->rbio;
@ -150,33 +117,6 @@ static void move_write(struct data_update *u)
&ctxt->stats->sectors_error_corrected); &ctxt->stats->sectors_error_corrected);
} }
/*
* If the extent has been bitrotted, we're going to have to give it a
* new checksum in order to move it - but the poison bit will ensure
* that userspace still gets the appropriate error.
*/
if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
(bch2_bkey_extent_flags(bkey_i_to_s_c(u->k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
nonce, &rbio->bio);
rbio->ret = 0;
}
if (unlikely(rbio->ret || u->data_opts.scrub)) {
bch2_data_update_exit(u);
kfree(u);
return;
}
if (trace_io_move_write_enabled()) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(u->k.k));
trace_io_move_write(c, buf.buf);
}
closure_get(&ctxt->cl); closure_get(&ctxt->cl);
atomic_add(u->k.k->k.size, &ctxt->write_sectors); atomic_add(u->k.k->k.size, &ctxt->write_sectors);
atomic_inc(&ctxt->write_ios); atomic_inc(&ctxt->write_ios);
@ -307,27 +247,9 @@ static int __bch2_move_extent(struct moving_context *ctxt,
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
int ret = 0; int ret = 0;
if (trace_io_move_enabled())
trace_io_move2(c, k, &io_opts, &data_opts);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats) if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
bch2_data_update_opts_normalize(k, &data_opts);
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas &&
!data_opts.scrub) {
if (data_opts.kill_ptrs|data_opts.kill_ec_ptrs) {
this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size);
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
} else {
this_cpu_add(c->counters[BCH_COUNTER_io_move_noop], k.k->size);
return 0;
}
}
struct data_update *u = allocate_dropping_locks(trans, ret, struct data_update *u = allocate_dropping_locks(trans, ret,
kzalloc(sizeof(struct data_update), _gfp)); kzalloc(sizeof(struct data_update), _gfp));
if (!u && !ret) if (!u && !ret)
@ -340,6 +262,8 @@ static int __bch2_move_extent(struct moving_context *ctxt,
if (ret) if (ret)
goto err; goto err;
k = bkey_i_to_s_c(u->k.k);
u->op.end_io = move_write_done; u->op.end_io = move_write_done;
u->rbio.bio.bi_end_io = move_read_endio; u->rbio.bio.bi_end_io = move_read_endio;
u->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); u->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
@ -357,9 +281,6 @@ static int __bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&u->b->count); atomic_inc(&u->b->count);
} }
if (trace_io_move_read_enabled())
trace_io_move_read2(c, k);
scoped_guard(mutex, &ctxt->lock) { scoped_guard(mutex, &ctxt->lock) {
atomic_add(u->k.k->k.size, &ctxt->read_sectors); atomic_add(u->k.k->k.size, &ctxt->read_sectors);
atomic_inc(&ctxt->read_ios); atomic_inc(&ctxt->read_ios);
@ -379,30 +300,14 @@ static int __bch2_move_extent(struct moving_context *ctxt,
iter->btree_id, k, 0, iter->btree_id, k, 0,
NULL, NULL,
BCH_READ_last_fragment, BCH_READ_last_fragment,
data_opts.scrub ? data_opts.read_dev : -1); data_opts.type == BCH_DATA_UPDATE_scrub ? data_opts.read_dev : -1);
return 0; return 0;
err: err:
if (bch2_err_matches(ret, BCH_ERR_data_update_done))
ret = 0;
if (ret &&
!bch2_err_matches(ret, EROFS) &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
count_event(c, io_move_start_fail);
if (trace_io_move_start_fail_enabled()) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(u->k.k));
prt_str(&buf, ": ");
prt_str(&buf, bch2_err_str(ret));
trace_io_move_start_fail(c, buf.buf);
}
}
bch2_bkey_buf_exit(&u->k);
kfree(u); kfree(u);
return ret; return bch2_err_matches(ret, BCH_ERR_data_update_done)
? 0
: ret;
} }
int bch2_move_extent(struct moving_context *ctxt, int bch2_move_extent(struct moving_context *ctxt,
@ -431,18 +336,13 @@ int bch2_move_extent(struct moving_context *ctxt,
if (ret <= 0) if (ret <= 0)
return ret; return ret;
if (data_opts.scrub && if (data_opts.type == BCH_DATA_UPDATE_scrub &&
!bch2_dev_idx_is_online(c, data_opts.read_dev)) !bch2_dev_idx_is_online(c, data_opts.read_dev))
return bch_err_throw(c, device_offline); return bch_err_throw(c, device_offline);
struct bkey_buf sk __cleanup(bch2_bkey_buf_exit);
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, k);
k = bkey_i_to_s_c(sk.k);
if (!bkey_is_btree_ptr(k.k)) if (!bkey_is_btree_ptr(k.k))
ret = __bch2_move_extent(ctxt, bucket_in_flight, iter, k, opts, data_opts); ret = __bch2_move_extent(ctxt, bucket_in_flight, iter, k, opts, data_opts);
else if (!data_opts.scrub) else if (data_opts.type != BCH_DATA_UPDATE_scrub)
ret = bch2_btree_node_rewrite_pos(trans, iter->btree_id, level, k.k->p, data_opts.target, 0); ret = bch2_btree_node_rewrite_pos(trans, iter->btree_id, level, k.k->p, data_opts.target, 0);
else else
ret = bch2_btree_node_scrub(trans, iter->btree_id, level, k, data_opts.read_dev); ret = bch2_btree_node_scrub(trans, iter->btree_id, level, k, data_opts.read_dev);
@ -743,7 +643,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
continue; continue;
if (bch2_err_matches(ret, BCH_ERR_data_update_fail)) if (bch2_err_matches(ret, BCH_ERR_data_update_fail))
ret = 0; /* failure for this extent, keep going */ ret = 0; /* failure for this extent, keep going */
if (bch2_err_matches(ret, EROFS)) if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_device_offline))
return ret; return ret;
WARN_ONCE(ret, "unhandled error from move_extent: %s", bch2_err_str(ret)); WARN_ONCE(ret, "unhandled error from move_extent: %s", bch2_err_str(ret));
next: next:
@ -795,11 +696,11 @@ static int evacuate_bucket_pred(struct btree_trans *trans, void *_arg,
if (ptr->dev == arg->bucket.inode && if (ptr->dev == arg->bucket.inode &&
(arg->gen < 0 || arg->gen == ptr->gen) && (arg->gen < 0 || arg->gen == ptr->gen) &&
!ptr->cached) !ptr->cached)
data_opts->rewrite_ptrs |= BIT(i); data_opts->ptrs_rewrite |= BIT(i);
i++; i++;
} }
return data_opts->rewrite_ptrs != 0; return data_opts->ptrs_rewrite != 0;
} }
int bch2_evacuate_bucket(struct moving_context *ctxt, int bch2_evacuate_bucket(struct moving_context *ctxt,
@ -917,17 +818,15 @@ static int rereplicate_pred(struct btree_trans *trans, void *arg,
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ptr->cached && if (!ptr->cached &&
(!ca || !ca->mi.durability)) (!ca || !ca->mi.durability))
data_opts->kill_ptrs |= BIT(i); data_opts->ptrs_kill |= BIT(i);
i++; i++;
} }
if (!data_opts->kill_ptrs && if (!data_opts->ptrs_kill &&
(!nr_good || nr_good >= replicas)) (!nr_good || nr_good >= replicas))
return false; return false;
data_opts->target = 0; data_opts->extra_replicas = replicas - nr_good;
data_opts->extra_replicas = replicas - nr_good;
data_opts->btree_insert_flags = 0;
return true; return true;
} }
@ -938,20 +837,15 @@ static int migrate_pred(struct btree_trans *trans, void *arg,
{ {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_ioctl_data *op = arg; struct bch_ioctl_data *op = arg;
unsigned i = 0; unsigned ptr_bit = 1;
data_opts->rewrite_ptrs = 0;
data_opts->target = 0;
data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
bkey_for_each_ptr(ptrs, ptr) { bkey_for_each_ptr(ptrs, ptr) {
if (ptr->dev == op->migrate.dev) if (ptr->dev == op->migrate.dev)
data_opts->rewrite_ptrs |= 1U << i; data_opts->ptrs_rewrite |= ptr_bit;
i++; ptr_bit <<= 1;
} }
return data_opts->rewrite_ptrs != 0; return data_opts->ptrs_rewrite != 0;
} }
/* /*
@ -975,12 +869,8 @@ static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
{ {
if (b->version_ondisk != c->sb.version || if (b->version_ondisk != c->sb.version ||
btree_node_need_rewrite(b) || btree_node_need_rewrite(b) ||
bformat_needs_redo(&b->format)) { bformat_needs_redo(&b->format))
data_opts->target = 0;
data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
return true; return true;
}
return false; return false;
} }
@ -1024,7 +914,7 @@ static int drop_extra_replicas_pred(struct btree_trans *trans, void *arg,
unsigned d = bch2_extent_ptr_durability(c, &p); unsigned d = bch2_extent_ptr_durability(c, &p);
if (d && durability - d >= replicas) { if (d && durability - d >= replicas) {
data_opts->kill_ptrs |= BIT(i); data_opts->ptrs_kill |= BIT(i);
durability -= d; durability -= d;
} }
@ -1034,14 +924,14 @@ static int drop_extra_replicas_pred(struct btree_trans *trans, void *arg,
i = 0; i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
if (p.has_ec && durability - p.ec.redundancy >= replicas) { if (p.has_ec && durability - p.ec.redundancy >= replicas) {
data_opts->kill_ec_ptrs |= BIT(i); data_opts->ptrs_kill_ec |= BIT(i);
durability -= p.ec.redundancy; durability -= p.ec.redundancy;
} }
i++; i++;
} }
return (data_opts->kill_ptrs|data_opts->kill_ec_ptrs) != 0; return (data_opts->ptrs_kill|data_opts->ptrs_kill_ec) != 0;
} }
static int scrub_pred(struct btree_trans *trans, void *_arg, static int scrub_pred(struct btree_trans *trans, void *_arg,
@ -1063,7 +953,7 @@ static int scrub_pred(struct btree_trans *trans, void *_arg,
} }
} }
data_opts->scrub = true; data_opts->type = BCH_DATA_UPDATE_scrub;
data_opts->read_dev = arg->migrate.dev; data_opts->read_dev = arg->migrate.dev;
return true; return true;
} }

View File

@ -20,7 +20,6 @@ struct moving_context {
struct bch_move_stats *stats; struct bch_move_stats *stats;
struct write_point_specifier wp; struct write_point_specifier wp;
bool wait_on_copygc; bool wait_on_copygc;
bool write_error;
/* For waiting on outstanding reads and writes: */ /* For waiting on outstanding reads and writes: */
struct closure cl; struct closure cl;

View File

@ -162,24 +162,23 @@ static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
return false; return false;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned i = 0; unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) { bkey_for_each_ptr(ptrs, ptr) {
if (ptr->dev == dev && if (ptr->dev == dev && (u->opts.ptrs_rewrite & ptr_bit))
u->data_opts.rewrite_ptrs & BIT(i))
return true; return true;
i++; ptr_bit <<= 1;
} }
return false; return false;
} }
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos, struct bpos pos,
struct bch_inode_opts opts, struct bch_inode_opts opts,
unsigned flags, unsigned flags,
struct bch_io_failures *failed) bool self_healing)
{ {
if (!have_io_error(failed)) { if (!self_healing) {
BUG_ON(!opts.promote_target); BUG_ON(!opts.promote_target);
if (!(flags & BCH_READ_may_promote)) { if (!(flags & BCH_READ_may_promote)) {
@ -212,19 +211,19 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
return 0; return 0;
} }
static noinline void promote_free(struct bch_read_bio *rbio) static noinline void promote_free(struct bch_read_bio *rbio, int ret)
{ {
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
struct bch_fs *c = rbio->c; struct bch_fs *c = rbio->c;
int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, int ret2 = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params); bch_promote_params);
BUG_ON(ret); BUG_ON(ret2);
async_object_list_del(c, promote, op->list_idx); async_object_list_del(c, promote, op->list_idx);
async_object_list_del(c, rbio, rbio->list_idx); async_object_list_del(c, rbio, rbio->list_idx);
bch2_data_update_exit(&op->write); bch2_data_update_exit(&op->write, ret);
enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu); kfree_rcu(op, rcu);
@ -236,7 +235,7 @@ static void promote_done(struct bch_write_op *wop)
struct bch_fs *c = op->write.rbio.c; struct bch_fs *c = op->write.rbio.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
promote_free(&op->write.rbio); promote_free(&op->write.rbio, 0);
} }
static void promote_start_work(struct work_struct *work) static void promote_start_work(struct work_struct *work)
@ -271,23 +270,27 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
if (!have_io_error(failed)) { if (!have_io_error(failed)) {
update_opts.type = BCH_DATA_UPDATE_promote;
update_opts.target = orig->opts.promote_target; update_opts.target = orig->opts.promote_target;
update_opts.extra_replicas = 1; update_opts.extra_replicas = 1;
update_opts.write_flags |= BCH_WRITE_cached; update_opts.write_flags |= BCH_WRITE_cached;
update_opts.write_flags |= BCH_WRITE_only_specified_devs; update_opts.write_flags |= BCH_WRITE_only_specified_devs;
} else { } else {
update_opts.type = BCH_DATA_UPDATE_self_heal;
update_opts.target = orig->opts.foreground_target; update_opts.target = orig->opts.foreground_target;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1; unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) { bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) && if (bch2_dev_io_failures(failed, ptr->dev) &&
!ptr_being_rewritten(orig, ptr->dev)) !ptr_being_rewritten(orig, ptr->dev)) {
update_opts.rewrite_ptrs |= ptr_bit; update_opts.ptrs_io_error|= ptr_bit;
update_opts.ptrs_rewrite|= ptr_bit;
}
ptr_bit <<= 1; ptr_bit <<= 1;
} }
if (!update_opts.rewrite_ptrs) if (!update_opts.ptrs_rewrite)
return ERR_PTR(bch_err_throw(c, nopromote_no_rewrites)); return ERR_PTR(bch_err_throw(c, nopromote_no_rewrites));
} }
@ -318,7 +321,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
&orig->opts, &orig->opts,
update_opts, update_opts,
btree_id, k); btree_id, k);
op->write.type = BCH_DATA_UPDATE_promote;
/* /*
* possible errors: -BCH_ERR_nocow_lock_blocked, * possible errors: -BCH_ERR_nocow_lock_blocked,
* -BCH_ERR_ENOSPC_disk_reservation: * -BCH_ERR_ENOSPC_disk_reservation:
@ -333,7 +335,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
return &op->write.rbio; return &op->write.rbio;
err_remove_list: err_remove_list:
bch2_bkey_buf_exit(&op->write.k);
async_object_list_del(c, promote, op->list_idx); async_object_list_del(c, promote, op->list_idx);
err_remove_hash: err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
@ -358,19 +359,30 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
bool *read_full, bool *read_full,
struct bch_io_failures *failed) struct bch_io_failures *failed)
{ {
struct bch_fs *c = trans->c;
bool self_healing = failed != NULL;
/* /*
* We're in the retry path, but we don't know what to repair yet, and we * We're in the retry path, but we don't know what to repair yet, and we
* don't want to do a promote here: * don't want to do a promote here:
*/ */
if (failed && !failed->nr) if (self_healing && !failed->nr)
return NULL;
/*
* We're already doing a data update, we don't need to kick off another
* write here - we'll just propagate IO errors back to the parent
* data_update:
*/
if (self_healing && orig->data_update)
return NULL; return NULL;
struct bch_fs *c = trans->c;
/* /*
* if failed != NULL we're not actually doing a promote, we're * if failed != NULL we're not actually doing a promote, we're
* recovering from an io/checksum error * recovering from an io/checksum error
*/ */
bool promote_full = (have_io_error(failed) || bool promote_full = (self_healing ||
*read_full || *read_full ||
READ_ONCE(c->opts.promote_whole_extents)); READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */ /* data might have to be decompressed in the write path: */
@ -380,9 +392,8 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
struct bpos pos = promote_full struct bpos pos = promote_full
? bkey_start_pos(k.k) ? bkey_start_pos(k.k)
: POS(k.k->p.inode, iter.bi_sector); : POS(k.k->p.inode, iter.bi_sector);
int ret;
ret = should_promote(c, k, pos, orig->opts, flags, failed); int ret = should_promote(c, k, pos, orig->opts, flags, self_healing);
if (ret) if (ret)
goto nopromote; goto nopromote;
@ -392,9 +403,6 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
? BTREE_ID_reflink ? BTREE_ID_reflink
: BTREE_ID_extents, : BTREE_ID_extents,
k, pos, pick, sectors, orig, failed); k, pos, pick, sectors, orig, failed);
if (!promote)
return NULL;
ret = PTR_ERR_OR_ZERO(promote); ret = PTR_ERR_OR_ZERO(promote);
if (ret) if (ret)
goto nopromote; goto nopromote;
@ -402,8 +410,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
*bounce = true; *bounce = true;
*read_full = promote_full; *read_full = promote_full;
if (have_io_error(failed)) orig->self_healing |= self_healing;
orig->self_healing = true;
return promote; return promote;
nopromote: nopromote:
@ -493,7 +500,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
if (!rbio->bio.bi_status) if (!rbio->bio.bi_status)
promote_start(rbio); promote_start(rbio);
else else
promote_free(rbio); promote_free(rbio, -EIO);
} else { } else {
async_object_list_del(rbio->c, rbio, rbio->list_idx); async_object_list_del(rbio->c, rbio, rbio->list_idx);
@ -527,7 +534,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
static int get_rbio_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_buf *sk) static int get_rbio_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_buf *sk)
{ {
struct btree_iter iter; CLASS(btree_iter_uninit, iter)(trans);
struct bkey_s_c k; struct bkey_s_c k;
try(lockrestart_do(trans, try(lockrestart_do(trans,
@ -541,7 +548,6 @@ static int get_rbio_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
break; break;
} }
bch2_trans_iter_exit(&iter);
return 0; return 0;
} }
@ -592,35 +598,32 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
unsigned flags) unsigned flags)
{ {
struct data_update *u = container_of(rbio, struct data_update, rbio); struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
bch2_trans_begin(trans);
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0; int ret = 0;
try(lockrestart_do(trans, do {
bkey_err(k = bch2_bkey_get_iter(trans, &iter, bch2_trans_begin(trans);
u->btree_id, bkey_start_pos(&u->k.k->k),
0))));
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { CLASS(btree_iter_uninit, iter)(trans);
/* extent we wanted to read no longer exists: */ struct bkey_s_c k;
ret = bch_err_throw(trans->c, data_read_key_overwritten);
goto err;
}
ret = __bch2_read_extent(trans, rbio, bvec_iter, try(lockrestart_do(trans,
bkey_start_pos(&u->k.k->k), bkey_err(k = bch2_bkey_get_iter(trans, &iter,
u->btree_id, u->btree_id, bkey_start_pos(&u->k.k->k),
bkey_i_to_s_c(u->k.k), 0))));
0, failed, flags, -1);
err:
bch2_trans_iter_exit(&iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
bch2_err_matches(ret, BCH_ERR_data_read_retry)) /* extent we wanted to read no longer exists: */
goto retry; ret = bch_err_throw(trans->c, data_read_key_overwritten);
break;
}
ret = __bch2_read_extent(trans, rbio, bvec_iter,
bkey_start_pos(&u->k.k->k),
u->btree_id,
bkey_i_to_s_c(u->k.k),
0, failed, flags, -1);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, BCH_ERR_data_read_retry));
if (ret) { if (ret) {
rbio->bio.bi_status = BLK_STS_IOERR; rbio->bio.bi_status = BLK_STS_IOERR;
@ -631,6 +634,22 @@ err:
return ret; return ret;
} }
static void propagate_io_error_to_data_update(struct bch_read_bio *rbio,
struct extent_ptr_decoded *pick)
{
struct data_update *u = rbio_data_update(bch2_rbio_parent(rbio));
if (u && !pick->do_ec_reconstruct) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (pick->ptr.dev == ptr->dev)
u->opts.ptrs_io_error |= ptr_bit;
ptr_bit <<= 1;
}
}
}
static void bch2_rbio_retry(struct work_struct *work) static void bch2_rbio_retry(struct work_struct *work)
{ {
struct bch_read_bio *rbio = struct bch_read_bio *rbio =
@ -657,9 +676,12 @@ static void bch2_rbio_retry(struct work_struct *work)
get_rbio_extent(trans, rbio, &sk); get_rbio_extent(trans, rbio, &sk);
if (!bkey_deleted(&sk.k->k) && if (!bkey_deleted(&sk.k->k) &&
bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) {
bch2_mark_io_failure(&failed, &rbio->pick, bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err); rbio->ret == -BCH_ERR_data_read_retry_csum_err);
propagate_io_error_to_data_update(rbio, &rbio->pick);
}
if (!rbio->split) { if (!rbio->split) {
rbio->bio.bi_status = 0; rbio->bio.bi_status = 0;
@ -1104,22 +1126,26 @@ retry_pick:
trace_and_count(c, io_read_fail_and_poison, &orig->bio); trace_and_count(c, io_read_fail_and_poison, &orig->bio);
} }
CLASS(printbuf, buf)(); if (!(flags & BCH_READ_in_retry)) {
bch2_read_err_msg_trans(trans, &buf, orig, read_pos); CLASS(printbuf, buf)();
prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
bch2_bkey_val_to_text(&buf, c, k); prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf); bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
}
goto err; goto err;
} }
if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
!c->chacha20_key_set) { !c->chacha20_key_set) {
CLASS(printbuf, buf)(); if (!(flags & BCH_READ_in_retry)) {
bch2_read_err_msg_trans(trans, &buf, orig, read_pos); CLASS(printbuf, buf)();
prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
bch2_bkey_val_to_text(&buf, c, k); prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf); bch_err_ratelimited(c, "%s", buf.buf);
}
ret = bch_err_throw(c, data_read_no_encryption_key); ret = bch_err_throw(c, data_read_no_encryption_key);
goto err; goto err;
} }
@ -1139,6 +1165,7 @@ retry_pick:
unlikely(dev_ptr_stale(ca, &pick.ptr))) { unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick, false); bch2_mark_io_failure(failed, &pick, false);
propagate_io_error_to_data_update(rbio, &pick);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
goto retry_pick; goto retry_pick;
} }
@ -1354,9 +1381,11 @@ out:
ret = rbio->ret; ret = rbio->ret;
rbio = bch2_rbio_free(rbio); rbio = bch2_rbio_free(rbio);
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) {
bch2_mark_io_failure(failed, &pick, bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_retry_csum_err); ret == -BCH_ERR_data_read_retry_csum_err);
propagate_io_error_to_data_update(rbio, &pick);
}
return ret; return ret;
} }
@ -1482,7 +1511,8 @@ err:
} }
if (unlikely(ret)) { if (unlikely(ret)) {
if (ret != -BCH_ERR_extent_poisoned) { if (!(flags & BCH_READ_in_retry) &&
ret != -BCH_ERR_extent_poisoned) {
CLASS(printbuf, buf)(); CLASS(printbuf, buf)();
bch2_read_err_msg_trans(trans, &buf, rbio, POS(inum.inum, bvec_iter.bi_sector)); bch2_read_err_msg_trans(trans, &buf, rbio, POS(inum.inum, bvec_iter.bi_sector));
prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); prt_printf(&buf, "data read error: %s", bch2_err_str(ret));

View File

@ -491,11 +491,12 @@ static int rebalance_set_data_opts(struct btree_trans *trans,
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
memset(data_opts, 0, sizeof(*data_opts)); memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); data_opts->type = BCH_DATA_UPDATE_rebalance;
data_opts->ptrs_rewrite = bch2_bkey_ptrs_need_rebalance(c, opts, k);
data_opts->target = opts->background_target; data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs; data_opts->write_flags |= BCH_WRITE_only_specified_devs;
if (!data_opts->rewrite_ptrs) { if (!data_opts->ptrs_rewrite) {
/* /*
* device we would want to write to offline? devices in target * device we would want to write to offline? devices in target
* changed? * changed?
@ -507,36 +508,6 @@ static int rebalance_set_data_opts(struct btree_trans *trans,
return 0; return 0;
} }
if (trace_rebalance_extent_enabled()) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
unsigned move_ptrs = 0;
unsigned compress_ptrs = 0;
u64 sectors = 0;
bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
if (move_ptrs) {
prt_str(&buf, "move=");
bch2_target_to_text(&buf, c, opts->background_target);
prt_str(&buf, " ");
bch2_prt_u64_base2(&buf, move_ptrs);
prt_newline(&buf);
}
if (compress_ptrs) {
prt_str(&buf, "compression=");
bch2_compression_opt_to_text(&buf, opts->background_compression);
prt_str(&buf, " ");
bch2_prt_u64_base2(&buf, compress_ptrs);
prt_newline(&buf);
}
trace_rebalance_extent(c, buf.buf);
}
count_event(c, rebalance_extent); count_event(c, rebalance_extent);
return 1; return 1;
} }

View File

@ -30,7 +30,7 @@
#include <linux/ioprio.h> #include <linux/ioprio.h>
static const char * const bch2_data_update_type_strs[] = { static const char * const bch2_data_update_type_strs[] = {
#define x(t, n, ...) [n] = #t, #define x(n) #n,
BCH_DATA_UPDATE_TYPES() BCH_DATA_UPDATE_TYPES()
#undef x #undef x
NULL NULL
@ -63,37 +63,22 @@ static unsigned bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
} }
noinline_for_stack noinline_for_stack
static void trace_io_move_finish2(struct data_update *u, static void trace_data_update_key_fail2(struct data_update *m,
struct bkey_i *new, struct btree_iter *iter,
struct bkey_i *insert) struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
const char *msg)
{ {
struct bch_fs *c = u->op.c; if (m->stats) {
CLASS(printbuf, buf)(); atomic64_inc(&m->stats->keys_raced);
atomic64_add(new.k->p.offset - iter->pos.offset,
&m->stats->sectors_raced);
}
prt_newline(&buf); count_event(m->op.c, data_update_key_fail);
bch2_data_update_to_text(&buf, u); if (!trace_data_update_key_fail_enabled())
prt_newline(&buf);
prt_str_indented(&buf, "new replicas:\t");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
prt_newline(&buf);
prt_str_indented(&buf, "insert:\t");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
trace_io_move_finish(c, buf.buf);
}
noinline_for_stack
static void trace_io_move_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
const char *msg)
{
if (!trace_io_move_fail_enabled())
return; return;
struct bch_fs *c = m->op.c; struct bch_fs *c = m->op.c;
@ -113,7 +98,7 @@ static void trace_io_move_fail2(struct data_update *m,
unsigned ptr_bit = 1; unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
if ((ptr_bit & m->data_opts.rewrite_ptrs) && if ((ptr_bit & m->opts.ptrs_rewrite) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) !ptr->cached)
rewrites_found |= ptr_bit; rewrites_found |= ptr_bit;
@ -125,7 +110,7 @@ static void trace_io_move_fail2(struct data_update *m,
bch2_prt_u64_base2(&buf, rewrites_found); bch2_prt_u64_base2(&buf, rewrites_found);
prt_newline(&buf); prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->opts);
prt_str_indented(&buf, "\nold: "); prt_str_indented(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old); bch2_bkey_val_to_text(&buf, c, old);
@ -141,11 +126,11 @@ static void trace_io_move_fail2(struct data_update *m,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
} }
trace_io_move_fail(c, buf.buf); trace_data_update_key_fail(c, buf.buf);
} }
noinline_for_stack noinline_for_stack
static void trace_data_update2(struct data_update *m, static void trace_data_update_key2(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k, struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert) struct bkey_i *insert)
{ {
@ -159,55 +144,7 @@ static void trace_data_update2(struct data_update *m,
prt_str(&buf, "\nnew: "); prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
trace_data_update(c, buf.buf); trace_data_update_key(c, buf.buf);
}
noinline_for_stack
static void trace_io_move_created_rebalance2(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
{
struct bch_fs *c = m->op.c;
CLASS(printbuf, buf)();
bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
trace_io_move_created_rebalance(c, buf.buf);
count_event(c, io_move_created_rebalance);
}
noinline_for_stack
static int data_update_invalid_bkey(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
{
struct bch_fs *c = m->op.c;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_str(&buf, "about to insert invalid key in data update path");
prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
return bch_err_throw(c, invalid_bkey);
} }
static int __bch2_data_update_index_update(struct btree_trans *trans, static int __bch2_data_update_index_update(struct btree_trans *trans,
@ -243,11 +180,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (ret) if (ret)
goto err; goto err;
struct bkey_i *tmp_k = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(tmp_k);
if (ret)
goto err;
k = bkey_i_to_s_c(tmp_k);
new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
if (!bch2_extents_match(k, old)) { if (!bch2_extents_match(k, old)) {
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), trace_data_update_key_fail2(m, &iter, k, bkey_i_to_s_c(&new->k_i), NULL, "no match:");
NULL, "no match:");
goto nowork; goto nowork;
} }
@ -282,24 +225,28 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
* other updates * other updates
* @new: extent with new pointers that we'll be adding to @insert * @new: extent with new pointers that we'll be adding to @insert
* *
* First, drop rewrite_ptrs from @new: * First, drop ptrs_rewrite from @new:
*/ */
ptr_bit = 1; ptr_bit = 1;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
if ((ptr_bit & m->data_opts.rewrite_ptrs) && if ((ptr_bit & m->opts.ptrs_rewrite) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert)))) {
!ptr->cached) { if (ptr_bit & m->opts.ptrs_io_error)
bch2_extent_ptr_set_cached(c, &m->op.opts, bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
bkey_i_to_s(insert), ptr); else if (!ptr->cached)
bch2_extent_ptr_set_cached(c, &m->op.opts,
bkey_i_to_s(insert), ptr);
rewrites_found |= ptr_bit; rewrites_found |= ptr_bit;
} }
ptr_bit <<= 1; ptr_bit <<= 1;
} }
if (m->data_opts.rewrite_ptrs && if (m->opts.ptrs_rewrite &&
!rewrites_found && !rewrites_found &&
bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); trace_data_update_key_fail2(m, &iter, k, bkey_i_to_s_c(&new->k_i), insert,
"no rewrites found:");
goto nowork; goto nowork;
} }
@ -316,7 +263,7 @@ restart_drop_conflicting_replicas:
} }
if (!bkey_val_u64s(&new->k)) { if (!bkey_val_u64s(&new->k)) {
trace_io_move_fail2(m, k, trace_data_update_key_fail2(m, &iter, k,
bkey_i_to_s_c(bch2_keylist_front(&op->insert_keys)), bkey_i_to_s_c(bch2_keylist_front(&op->insert_keys)),
insert, "new replicas conflicted:"); insert, "new replicas conflicted:");
goto nowork; goto nowork;
@ -372,25 +319,9 @@ restart_drop_extra_replicas:
next_pos = insert->k.p; next_pos = insert->k.p;
/*
* Check for nonce offset inconsistency:
* This is debug code - we've been seeing this bug rarely, and
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
(struct bkey_validate_context) {
.btree = m->btree_id,
.flags = BCH_VALIDATE_commit,
});
if (unlikely(invalid)) {
ret = data_update_invalid_bkey(m, old, k, insert);
goto out;
}
struct bch_inode_opts opts; struct bch_inode_opts opts;
ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->opts.type]) ?:
bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id, bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?: k.k->p, bkey_start_pos(&insert->k)) ?:
@ -401,30 +332,20 @@ restart_drop_extra_replicas:
SET_NEEDS_REBALANCE_foreground, SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?: m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert, bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_internal_snapshot_node); BTREE_UPDATE_internal_snapshot_node) ?:
if (ret) bch2_trans_commit(trans, &op->res,
goto err;
if (trace_data_update_enabled())
trace_data_update2(m, old, k, insert);
if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
trace_io_move_created_rebalance2(m, old, k, insert);
ret = bch2_trans_commit(trans, &op->res,
NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags); m->opts.commit_flags);
if (ret) if (ret)
goto err; goto err;
bch2_btree_iter_set_pos(&iter, next_pos); bch2_btree_iter_set_pos(&iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); if (trace_data_update_key_enabled())
if (trace_io_move_finish_enabled()) trace_data_update_key2(m, old, k, insert);
trace_io_move_finish2(m, &new->k_i, insert); this_cpu_add(c->counters[BCH_COUNTER_data_update_key], new->k.size);
err: err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0; ret = 0;
@ -438,15 +359,6 @@ next:
} }
continue; continue;
nowork: nowork:
if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->stats->sectors_raced);
}
count_event(c, io_move_fail);
bch2_btree_iter_advance(&iter); bch2_btree_iter_advance(&iter);
goto next; goto next;
} }
@ -461,23 +373,101 @@ int bch2_data_update_index_update(struct bch_write_op *op)
return __bch2_data_update_index_update(trans, op); return __bch2_data_update_index_update(trans, op);
} }
void bch2_data_update_read_done(struct data_update *m) void bch2_data_update_read_done(struct data_update *u)
{ {
m->read_done = true; struct bch_fs *c = u->op.c;
struct bch_read_bio *rbio = &u->rbio;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
u->read_done = true;
/*
* If the extent has been bitrotted, we're going to have to give it a
* new checksum in order to move it - but the poison bit will ensure
* that userspace still gets the appropriate error.
*/
if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
(bch2_bkey_extent_flags(bkey_i_to_s_c(u->k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
struct nonce nonce = extent_nonce(rbio->version, crc);
crc.csum = bch2_checksum_bio(c, crc.csum_type, nonce, &rbio->bio);
rbio->ret = 0;
}
if (unlikely(rbio->ret)) {
u->op.end_io(&u->op);
return;
}
if (u->opts.type == BCH_DATA_UPDATE_scrub && !u->opts.ptrs_io_error) {
u->op.end_io(&u->op);
return;
}
if (u->opts.ptrs_io_error) {
struct bkey_s_c k = bkey_i_to_s_c(u->k.k);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned ptr_bit = 1;
guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if ((u->opts.ptrs_io_error & ptr_bit) &&
!(u->opts.ptrs_rewrite & ptr_bit)) {
u->op.nr_replicas += bch2_extent_ptr_durability(c, &p);
u->opts.ptrs_rewrite |= ptr_bit;
bch2_dev_list_drop_dev(&u->op.devs_have, p.ptr.dev);
}
ptr_bit <<= 1;
}
}
/* write bio must own pages: */ /* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt); BUG_ON(!u->op.wbio.bio.bi_vcnt);
m->op.crc = m->rbio.pick.crc; u->op.crc = crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; u->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); closure_call(&u->op.cl, bch2_write, NULL, NULL);
closure_call(&m->op.cl, bch2_write, NULL, NULL);
} }
void bch2_data_update_exit(struct data_update *update) static void data_update_trace(struct data_update *u, int ret)
{ {
struct bch_fs *c = u->op.c;
if (!ret) {
if (trace_data_update_enabled()) {
CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u);
trace_data_update(c, buf.buf);
}
count_event(c, data_update);
} else if (bch2_err_matches(ret, BCH_ERR_data_update_done)) {
if (trace_data_update_no_io_enabled()) {
CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u);
prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(ret));
trace_data_update_no_io(c, buf.buf);
}
count_event(c, data_update_no_io);
} else if (ret != -BCH_ERR_data_update_fail_no_rw_devs) {
if (trace_data_update_fail_enabled()) {
CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u);
prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(ret));
trace_data_update_fail(c, buf.buf);
}
count_event(c, data_update_fail);
}
}
void bch2_data_update_exit(struct data_update *update, int ret)
{
data_update_trace(update, ret);
struct bch_fs *c = update->op.c; struct bch_fs *c = update->op.c;
struct bkey_s_c k = bkey_i_to_s_c(update->k.k); struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
@ -586,6 +576,15 @@ int bch2_update_unwritten_extent(struct btree_trans *trans,
return ret; return ret;
} }
/*
 * Print one labelled pointer bitmask to @out as
 * "<name> ptrs:\t<mask in base 2>" followed by a newline.
 *
 * An empty mask (@ptrs == 0) prints nothing at all.
 */
static void ptr_bits_to_text(struct printbuf *out, unsigned ptrs, const char *name)
{
	if (!ptrs)
		return;

	prt_printf(out, "%s ptrs:\t", name);
	bch2_prt_u64_base2(out, ptrs);
	prt_newline(out);
}
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_inode_opts *io_opts, struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts) struct data_update_opts *data_opts)
@ -593,13 +592,13 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
if (!out->nr_tabstops) if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20); printbuf_tabstop_push(out, 20);
prt_str_indented(out, "rewrite ptrs:\t"); prt_str(out, bch2_data_update_type_strs[data_opts->type]);
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
prt_newline(out); prt_newline(out);
prt_str_indented(out, "kill ptrs:\t"); ptr_bits_to_text(out, data_opts->ptrs_rewrite, "rewrite");
bch2_prt_u64_base2(out, data_opts->kill_ptrs); ptr_bits_to_text(out, data_opts->ptrs_io_error, "io error");
prt_newline(out); ptr_bits_to_text(out, data_opts->ptrs_kill, "kill");
ptr_bits_to_text(out, data_opts->ptrs_kill_ec, "kill ec");
prt_str_indented(out, "target:\t"); prt_str_indented(out, "target:\t");
bch2_target_to_text(out, c, data_opts->target); bch2_target_to_text(out, c, data_opts->target);
@ -616,17 +615,11 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_str_indented(out, "extra replicas:\t"); prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas); prt_u64(out, data_opts->extra_replicas);
prt_newline(out); prt_newline(out);
prt_str_indented(out, "scrub:\t");
prt_u64(out, data_opts->scrub);
} }
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{ {
prt_str(out, bch2_data_update_type_strs[m->type]); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->opts);
prt_newline(out);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
prt_newline(out); prt_newline(out);
prt_str_indented(out, "old key:\t"); prt_str_indented(out, "old key:\t");
@ -640,7 +633,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
prt_newline(out); prt_newline(out);
guard(printbuf_indent)(out); guard(printbuf_indent)(out);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->opts);
if (!m->read_done) { if (!m->read_done) {
prt_printf(out, "read:\n"); prt_printf(out, "read:\n");
@ -653,11 +646,11 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
} }
} }
int bch2_extent_drop_ptrs(struct btree_trans *trans, static int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter, struct btree_iter *iter,
struct bkey_s_c k, struct bkey_s_c k,
struct bch_inode_opts *io_opts, struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts) struct data_update_opts *data_opts)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
@ -667,16 +660,16 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct extent_ptr_decoded p = {}; struct extent_ptr_decoded p = {};
unsigned i = 0; unsigned i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
if (data_opts->kill_ec_ptrs & BIT(i)) if (data_opts->ptrs_kill_ec & BIT(i))
bch2_bkey_drop_ec(n, p.ptr.dev); bch2_bkey_drop_ec(n, p.ptr.dev);
i++; i++;
} }
while (data_opts->kill_ptrs) { while (data_opts->ptrs_kill) {
unsigned i = 0, drop = __fls(data_opts->kill_ptrs); unsigned i = 0, drop = __fls(data_opts->ptrs_kill);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), p, entry, i++ == drop); bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), p, entry, i++ == drop);
data_opts->kill_ptrs ^= 1U << drop; data_opts->ptrs_kill ^= 1U << drop;
} }
/* /*
@ -700,9 +693,9 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
} }
static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, static int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_inode_opts *io_opts, struct bch_inode_opts *io_opts,
unsigned buf_bytes) unsigned buf_bytes)
{ {
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
@ -727,21 +720,6 @@ static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
return 0; return 0;
} }
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_inode_opts *io_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
/* write path might have to decompress data: */
unsigned buf_bytes = 0;
bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
}
static int can_write_extent(struct bch_fs *c, struct data_update *m) static int can_write_extent(struct bch_fs *c, struct data_update *m)
{ {
if ((m->op.flags & BCH_WRITE_alloc_nowait) && if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
@ -757,6 +735,7 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
if (*i != BCH_SB_MEMBER_INVALID) if (*i != BCH_SB_MEMBER_INVALID)
__clear_bit(*i, devs.d); __clear_bit(*i, devs.d);
bool trace = trace_data_update_fail_enabled();
CLASS(printbuf, buf)(); CLASS(printbuf, buf)();
guard(printbuf_atomic)(&buf); guard(printbuf_atomic)(&buf);
@ -773,7 +752,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark); u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
prt_printf(&buf, "%s=%llu ", ca->name, nr_free); if (trace)
prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
if (!nr_free) if (!nr_free)
continue; continue;
@ -784,8 +764,12 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
} }
if (!nr_replicas) { if (!nr_replicas) {
prt_printf(&buf, "\nnr_replicas %u < %u", nr_replicas, m->op.nr_replicas); if (trace) {
trace_data_update_done_no_rw_devs(c, buf.buf); prt_printf(&buf, "\nnr_replicas %u < %u", nr_replicas, m->op.nr_replicas);
trace_data_update_fail(c, buf.buf);
}
count_event(c, data_update_fail);
return bch_err_throw(c, data_update_fail_no_rw_devs); return bch_err_throw(c, data_update_fail_no_rw_devs);
} }
@ -805,27 +789,12 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
int ret = 0; int ret = 0;
if (k.k->p.snapshot) {
ret = bch2_check_key_has_snapshot(trans, iter, k);
if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
/* Can't repair yet, waiting on other recovery passes */
return bch_err_throw(c, data_update_fail_no_snapshot);
}
if (ret < 0)
return ret;
if (ret) /* key was deleted */
return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
bch_err_throw(c, data_update_fail_no_snapshot);
ret = 0;
}
bch2_bkey_buf_init(&m->k); bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, k); bch2_bkey_buf_reassemble(&m->k, k);
m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc k = bkey_i_to_s_c(m->k.k);
? BCH_DATA_UPDATE_copygc
: BCH_DATA_UPDATE_rebalance;
m->btree_id = btree_id; m->btree_id = btree_id;
m->data_opts = data_opts; m->opts = data_opts;
m->ctxt = ctxt; m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL; m->stats = ctxt ? ctxt->stats : NULL;
@ -842,9 +811,21 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_pages_owned| BCH_WRITE_pages_owned|
BCH_WRITE_data_encoded| BCH_WRITE_data_encoded|
BCH_WRITE_move| BCH_WRITE_move|
m->data_opts.write_flags; m->opts.write_flags;
m->op.compression_opt = io_opts->background_compression; m->op.compression_opt = io_opts->background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; m->op.watermark = m->opts.commit_flags & BCH_WATERMARK_MASK;
if (k.k->p.snapshot &&
unlikely(ret = bch2_check_key_has_snapshot(trans, iter, k))) {
if (ret > 0) /* key was deleted */
ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
bch_err_throw(c, data_update_fail_no_snapshot);
if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
/* Can't repair yet, waiting on other recovery passes */
ret = bch_err_throw(c, data_update_fail_no_snapshot);
}
goto out;
}
unsigned durability_have = 0, durability_removing = 0; unsigned durability_have = 0, durability_removing = 0;
@ -855,42 +836,48 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned buf_bytes = 0; unsigned buf_bytes = 0;
bool unwritten = false; bool unwritten = false;
unsigned ptr_bit = 1; scoped_guard(rcu) {
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { unsigned ptr_bit = 1;
if (!p.ptr.cached) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
guard(rcu)(); if (!p.ptr.cached) {
if (ptr_bit & m->data_opts.rewrite_ptrs) { if (ptr_bit & m->opts.ptrs_rewrite) {
if (crc_is_compressed(p.crc)) if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size; reserve_sectors += k.k->size;
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
durability_removing += bch2_extent_ptr_desired_durability(c, &p); durability_removing += bch2_extent_ptr_desired_durability(c, &p);
} else if (!(ptr_bit & m->data_opts.kill_ptrs)) { } else if (!(ptr_bit & m->opts.ptrs_kill)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p); durability_have += bch2_extent_ptr_durability(c, &p);
}
} else {
if (m->opts.ptrs_rewrite & ptr_bit) {
m->opts.ptrs_kill |= ptr_bit;
m->opts.ptrs_rewrite ^= ptr_bit;
}
} }
/*
* op->csum_type is normally initialized from the fs/file's
* current options - but if an extent is encrypted, we require
* that it stays encrypted:
*/
if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
m->op.nonce = p.crc.nonce + p.crc.offset;
m->op.csum_type = p.crc.csum_type;
}
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
unwritten |= p.ptr.unwritten;
ptr_bit <<= 1;
} }
/*
* op->csum_type is normally initialized from the fs/file's
* current options - but if an extent is encrypted, we require
* that it stays encrypted:
*/
if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
m->op.nonce = p.crc.nonce + p.crc.offset;
m->op.csum_type = p.crc.csum_type;
}
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
unwritten |= p.ptr.unwritten;
ptr_bit <<= 1;
} }
if (!data_opts.scrub) { if (m->opts.type != BCH_DATA_UPDATE_scrub) {
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/* /*
@ -902,7 +889,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC * rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/ */
m->op.nr_replicas = min(durability_removing, durability_required) + m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas; m->opts.extra_replicas;
/* /*
* If device(s) were set to durability=0 after data was written to them * If device(s) were set to durability=0 after data was written to them
@ -920,11 +907,11 @@ int bch2_data_update_init(struct btree_trans *trans,
* was written: * was written:
*/ */
if (!m->op.nr_replicas) { if (!m->op.nr_replicas) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; m->opts.ptrs_kill |= m->opts.ptrs_rewrite;
m->data_opts.rewrite_ptrs = 0; m->opts.ptrs_rewrite = 0;
/* if iter == NULL, it's just a promote */ /* if iter == NULL, it's just a promote */
if (iter) if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->opts);
if (!ret) if (!ret)
ret = bch_err_throw(c, data_update_done_no_writes_needed); ret = bch_err_throw(c, data_update_done_no_writes_needed);
goto out; goto out;
@ -949,7 +936,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (reserve_sectors) { if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas m->opts.extra_replicas
? 0 ? 0
: BCH_DISK_RESERVATION_NOFAIL); : BCH_DISK_RESERVATION_NOFAIL);
if (ret) if (ret)
@ -997,32 +984,22 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); ret = bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
if (ret) if (ret)
goto out_nocow_unlock; goto out_nocow_unlock;
return 0; return 0;
out_nocow_unlock: out_nocow_unlock:
if (c->opts.nocow_enabled) if (c->opts.nocow_enabled)
bch2_bkey_nocow_unlock(c, k, 0); bch2_bkey_nocow_unlock(c, k, 0);
out: out:
BUG_ON(!ret);
data_update_trace(m, ret);
bkey_put_dev_refs(c, k, m->ptrs_held); bkey_put_dev_refs(c, k, m->ptrs_held);
m->ptrs_held = 0; m->ptrs_held = 0;
bch2_disk_reservation_put(c, &m->op.res); bch2_disk_reservation_put(c, &m->op.res);
bch2_bkey_buf_exit(&m->k);
return ret; return ret;
} }
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
opts->kill_ptrs |= ptr_bit;
opts->rewrite_ptrs ^= ptr_bit;
}
ptr_bit <<= 1;
}
}

View File

@ -4,46 +4,48 @@
#define _BCACHEFS_DATA_UPDATE_H #define _BCACHEFS_DATA_UPDATE_H
#include "btree/bkey_buf.h" #include "btree/bkey_buf.h"
#include "btree/update.h"
#include "data/read.h" #include "data/read.h"
#include "data/write_types.h" #include "data/write_types.h"
struct moving_context; struct moving_context;
struct data_update_opts { #define BCH_DATA_UPDATE_TYPES() \
unsigned rewrite_ptrs; x(other) \
unsigned kill_ptrs; x(copygc) \
unsigned kill_ec_ptrs; x(rebalance) \
u16 target; x(promote) \
u8 extra_replicas; x(self_heal) \
unsigned btree_insert_flags; x(scrub)
unsigned write_flags;
int read_dev;
bool scrub;
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct bch_inode_opts *, struct data_update_opts *);
#define BCH_DATA_UPDATE_TYPES() \
x(copygc, 0) \
x(rebalance, 1) \
x(promote, 2)
enum bch_data_update_types { enum bch_data_update_types {
#define x(n, id) BCH_DATA_UPDATE_##n = id, #define x(n) BCH_DATA_UPDATE_##n,
BCH_DATA_UPDATE_TYPES() BCH_DATA_UPDATE_TYPES()
#undef x #undef x
}; };
/*
 * Options describing what a data update should do to an extent.
 *
 * The ptrs_* fields are bitmasks over the extent's pointers: users walk the
 * pointers with a ptr_bit that starts at 1 and is shifted left once per
 * pointer, so bit N refers to the Nth pointer in decode order.
 */
struct data_update_opts {
/* which kind of update this is; also indexes bch2_data_update_type_strs[] */
enum bch_data_update_types type;
/* pointers to be rewritten: in the index update they are set cached (or dropped, see ptrs_io_error) */
u8 ptrs_rewrite;
/* pointers that hit an IO error: dropped from the new key rather than set cached */
u8 ptrs_io_error;
/* pointers to drop outright (see bch2_extent_drop_ptrs()) */
u8 ptrs_kill;
/* pointers whose erasure coding entry is dropped via bch2_bkey_drop_ec() */
u8 ptrs_kill_ec;
/* added on top of the computed number of replicas to write */
u8 extra_replicas;
/* target for the update; rebalance sets it from opts->background_target */
u16 target;
/* NOTE(review): presumably the device to read from - not used in this part of the file, confirm */
int read_dev;
/* OR'd into the write op's flags when the update is initialised */
enum bch_write_flags write_flags;
/* passed to bch2_trans_commit(); also masked with BCH_WATERMARK_MASK for op.watermark */
enum bch_trans_commit_flags commit_flags;
};
struct data_update { struct data_update {
enum bch_data_update_types type;
bool read_done;
u8 ptrs_held;
/* extent being updated: */ /* extent being updated: */
enum btree_id btree_id; enum btree_id btree_id;
struct bkey_buf k; struct bkey_buf k;
struct data_update_opts data_opts; struct data_update_opts opts;
bool read_done;
u8 ptrs_held;
/* associated with @ctxt */ /* associated with @ctxt */
struct list_head read_list; struct list_head read_list;
@ -72,6 +74,8 @@ struct promote_op {
struct bio_vec bi_inline_vecs[]; /* must be last */ struct bio_vec bi_inline_vecs[]; /* must be last */
}; };
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct bch_inode_opts *, struct data_update_opts *);
void bch2_data_update_to_text(struct printbuf *, struct data_update *); void bch2_data_update_to_text(struct printbuf *, struct data_update *);
void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
@ -79,22 +83,12 @@ int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *); void bch2_data_update_read_done(struct data_update *);
int bch2_extent_drop_ptrs(struct btree_trans *, void bch2_data_update_exit(struct data_update *, int);
struct btree_iter *,
struct bkey_s_c,
struct bch_inode_opts *,
struct data_update_opts *);
int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
struct bch_inode_opts *);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *, int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *, struct moving_context *,
struct data_update *, struct data_update *,
struct write_point_specifier, struct write_point_specifier,
struct bch_inode_opts *, struct data_update_opts, struct bch_inode_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c); enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
#endif /* _BCACHEFS_DATA_UPDATE_H */ #endif /* _BCACHEFS_DATA_UPDATE_H */

View File

@ -426,9 +426,11 @@ static int bch2_write_index_default(struct bch_write_op *op)
void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{ {
CLASS(printbuf, buf)();
CLASS(btree_trans, trans)(op->c); CLASS(btree_trans, trans)(op->c);
CLASS(printbuf, buf)();
bch2_log_msg_start(op->c, &buf);
struct bpos pos = op->pos; struct bpos pos = op->pos;
pos.offset = offset; pos.offset = offset;
@ -440,15 +442,17 @@ void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, .
va_start(args, fmt); va_start(args, fmt);
prt_vprintf(&buf, fmt, args); prt_vprintf(&buf, fmt, args);
va_end(args); va_end(args);
prt_newline(&buf);
if (op->flags & BCH_WRITE_move) { if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op); struct data_update *u = container_of(op, struct data_update, op);
prt_printf(&buf, "\n from internal move "); prt_printf(&buf, "from internal move ");
bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
prt_newline(&buf);
} }
bch_err_ratelimited(op->c, "%s", buf.buf); bch2_print_str_ratelimited(op->c, KERN_ERR, buf.buf);
} }
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,

View File

@ -797,37 +797,37 @@ TRACE_EVENT(bucket_invalidate,
/* Moving IO */ /* Moving IO */
DEFINE_EVENT(fs_str, io_move, DEFINE_EVENT(fs_str, data_update,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_read, DEFINE_EVENT(fs_str, data_update_no_io,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_write, DEFINE_EVENT(fs_str, data_update_fail,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_finish, DEFINE_EVENT(fs_str, data_update_key,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_fail, DEFINE_EVENT(fs_str, data_update_key_fail,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_write_fail, DEFINE_EVENT(fs_str, io_move_pred,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, io_move_start_fail, DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str) TP_ARGS(c, str)
); );
@ -1317,40 +1317,7 @@ TRACE_EVENT(write_buffer_maybe_flush,
TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
); );
DEFINE_EVENT(fs_str, rebalance_extent, /* BTREE ITER TRACEPOINTS */
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, data_update,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_pred,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_created_rebalance,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, extent_trim_atomic,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, btree_iter_peek_slot, DEFINE_EVENT(fs_str, btree_iter_peek_slot,
TP_PROTO(struct bch_fs *c, const char *str), TP_PROTO(struct bch_fs *c, const char *str),
@ -1372,6 +1339,11 @@ DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
TP_ARGS(c, str) TP_ARGS(c, str)
); );
DEFINE_EVENT(fs_str, extent_trim_atomic,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
TRACE_EVENT(update_by_path, TRACE_EVENT(update_by_path,

View File

@ -26,14 +26,14 @@ enum counters_flags {
x(io_read_narrow_crcs, 97, TYPE_COUNTER) \ x(io_read_narrow_crcs, 97, TYPE_COUNTER) \
x(io_read_narrow_crcs_fail, 98, TYPE_COUNTER) \ x(io_read_narrow_crcs_fail, 98, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \ x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \ x(data_update, 2, TYPE_SECTORS) \
x(data_update_no_io, 91, TYPE_COUNTER) \
x(data_update_fail, 82, TYPE_COUNTER) \
x(data_update_key, 37, TYPE_SECTORS) \
x(data_update_key_fail, 38, TYPE_COUNTER) \
x(io_move_read, 35, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \
x(io_move_write, 36, TYPE_SECTORS) \ x(io_move_write, 36, TYPE_SECTORS) \
x(io_move_finish, 37, TYPE_SECTORS) \
x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \
x(io_move_drop_only, 91, TYPE_COUNTER) \
x(io_move_noop, 92, TYPE_COUNTER) \ x(io_move_noop, 92, TYPE_COUNTER) \
x(io_move_created_rebalance, 83, TYPE_COUNTER) \ x(io_move_created_rebalance, 83, TYPE_COUNTER) \
x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \

View File

@ -187,12 +187,8 @@ static int check_subvol_child(struct btree_trans *trans,
le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
trans, subvol_children_bad, trans, subvol_children_bad,
"incorrect entry in subvolume_children btree %llu:%llu", "incorrect entry in subvolume_children btree %llu:%llu",
child_k.k->p.inode, child_k.k->p.offset)) { child_k.k->p.inode, child_k.k->p.offset))
ret = bch2_btree_delete_at(trans, child_iter, 0); try(bch2_btree_delete_at(trans, child_iter, 0));
if (ret)
goto err;
}
err:
fsck_err: fsck_err:
return ret; return ret;
} }