Update bcachefs sources to 9a0aad1cf404 bcachefs: adjust BCH_MEMBER_STATE_evacuating for new semantics

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2025-11-17 17:42:23 -05:00
parent 62c0874fa5
commit 4d1f0d5340
12 changed files with 105 additions and 132 deletions

View File

@ -1 +1 @@
e6f97f86f5fcb9f53c6fef2287af7d3f8acccac7 9a0aad1cf4047ff685a3f0f81af596f3c62ff70e

View File

@ -350,8 +350,8 @@ static int cmd_device_evacuate(int argc, char *argv[])
if (bcachefs_kernel_version() < bcachefs_metadata_version_reconcile) if (bcachefs_kernel_version() < bcachefs_metadata_version_reconcile)
return evacuate_v0(fs, dev_idx, dev_path); return evacuate_v0(fs, dev_idx, dev_path);
printf("Setting %s failed\n", dev_path); printf("Setting %s evacuating \n", dev_path);
bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED); bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_evacuating, BCH_FORCE_IF_DEGRADED);
while (true) { while (true) {
struct bch_ioctl_dev_usage_v2 *u = bchu_dev_usage(fs, dev_idx); struct bch_ioctl_dev_usage_v2 *u = bchu_dev_usage(fs, dev_idx);
@ -382,7 +382,7 @@ static void device_set_state_usage(void)
puts("bcachefs device set-state\n" puts("bcachefs device set-state\n"
"Usage: bcachefs device set-state <new-state> <device>|<devid> <path>\n" "Usage: bcachefs device set-state <new-state> <device>|<devid> <path>\n"
"\n" "\n"
"<new-state>: one of rw, ro, failed or spare\n" "<new-state>: one of rw, ro, evacuating or spare\n"
"<path>: path to mounted filesystem, optional unless specifying device by id\n" "<path>: path to mounted filesystem, optional unless specifying device by id\n"
"\n" "\n"
"Options:\n" "Options:\n"
@ -716,7 +716,7 @@ static int device_usage(void)
" online Re-add an existing member to a filesystem\n" " online Re-add an existing member to a filesystem\n"
" offline Take a device offline, without removing it\n" " offline Take a device offline, without removing it\n"
" evacuate Migrate data off a specific device\n" " evacuate Migrate data off a specific device\n"
" set-state Mark a device as failed\n" " set-state Change device state (rw, ro, evacuating, spare)\n"
" resize Resize filesystem on a device\n" " resize Resize filesystem on a device\n"
" resize-journal Resize journal on a device\n" " resize-journal Resize journal on a device\n"
"\n" "\n"

View File

@ -237,7 +237,7 @@ static struct durability_x_degraded replicas_durability(const struct bch_replica
unsigned durability = dev ? dev->durability : 1; unsigned durability = dev ? dev->durability : 1;
if (!dev || !dev->dev || dev->state == BCH_MEMBER_STATE_failed) if (!dev || !dev->dev || dev->state == BCH_MEMBER_STATE_evacuating)
degraded += durability; degraded += durability;
ret.durability += durability; ret.durability += durability;
} }

View File

@ -777,34 +777,27 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs,
for_each_cpu_replicas_entry(&c->replicas, i) { for_each_cpu_replicas_entry(&c->replicas, i) {
struct bch_replicas_entry_v1 *e = &i->e; struct bch_replicas_entry_v1 *e = &i->e;
unsigned nr_online = 0, nr_failed = 0, dflags = 0; unsigned nr_online = 0, nr_invalid = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user; bool metadata = e->data_type < BCH_DATA_user;
if (e->data_type == BCH_DATA_cached) if (e->data_type == BCH_DATA_cached)
continue; continue;
scoped_guard(rcu) for (unsigned i = 0; i < e->nr_devs; i++) {
for (unsigned i = 0; i < e->nr_devs; i++) { if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
if (e->devs[i] == BCH_SB_MEMBER_INVALID) { nr_invalid++;
nr_failed++; continue;
continue;
}
nr_online += test_bit(e->devs[i], devs.d);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
} }
if (nr_online + nr_failed == e->nr_devs) nr_online += test_bit(e->devs[i], devs.d);
continue; }
if (nr_online < e->nr_required) if (nr_online < e->nr_required)
dflags |= metadata dflags |= metadata
? BCH_FORCE_IF_METADATA_LOST ? BCH_FORCE_IF_METADATA_LOST
: BCH_FORCE_IF_DATA_LOST; : BCH_FORCE_IF_DATA_LOST;
if (nr_online < e->nr_devs) if (nr_online + nr_invalid < e->nr_devs)
dflags |= metadata dflags |= metadata
? BCH_FORCE_IF_METADATA_DEGRADED ? BCH_FORCE_IF_METADATA_DEGRADED
: BCH_FORCE_IF_DATA_DEGRADED; : BCH_FORCE_IF_DATA_DEGRADED;
@ -823,72 +816,67 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs,
return true; return true;
} }
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, bool bch2_can_write_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, struct printbuf *err, unsigned flags, struct printbuf *err)
bool write)
{ {
if (write) { unsigned nr_have[BCH_DATA_NR];
unsigned nr_have[BCH_DATA_NR]; memset(nr_have, 0, sizeof(nr_have));
memset(nr_have, 0, sizeof(nr_have));
unsigned nr_online[BCH_DATA_NR]; unsigned nr_online[BCH_DATA_NR];
memset(nr_online, 0, sizeof(nr_online)); memset(nr_online, 0, sizeof(nr_online));
scoped_guard(rcu) scoped_guard(rcu)
for_each_member_device_rcu(c, ca, &devs) { for_each_member_device_rcu(c, ca, &devs) {
if (!ca->mi.durability) if (!ca->mi.durability)
continue; continue;
bool online = ca->mi.state == BCH_MEMBER_STATE_rw && bool online = test_bit(ca->dev_idx, devs.d);
test_bit(ca->dev_idx, devs.d); for (unsigned i = 0; i < BCH_DATA_NR; i++) {
nr_have[i] += ca->mi.data_allowed & BIT(i) ? ca->mi.durability : 0;
for (unsigned i = 0; i < BCH_DATA_NR; i++) { if (online)
nr_have[i] += ca->mi.data_allowed & BIT(i) ? ca->mi.durability : 0; nr_online[i] += ca->mi.data_allowed & BIT(i) ? ca->mi.durability : 0;
if (online)
nr_online[i] += ca->mi.data_allowed & BIT(i) ? ca->mi.durability : 0;
}
} }
}
if (!nr_online[BCH_DATA_journal]) { if (!nr_online[BCH_DATA_journal]) {
prt_printf(err, "No rw journal devices online\n"); prt_printf(err, "No rw journal devices online\n");
return false;
}
if (!nr_online[BCH_DATA_btree]) {
prt_printf(err, "No rw btree devices online\n");
return false;
}
if (!nr_online[BCH_DATA_user]) {
prt_printf(err, "No rw user data devices online\n");
return false;
}
if (!(flags & BCH_FORCE_IF_METADATA_DEGRADED)) {
if (nr_online[BCH_DATA_journal] < nr_have[BCH_DATA_journal] &&
nr_online[BCH_DATA_journal] < c->opts.metadata_replicas) {
prt_printf(err, "Insufficient rw journal devices (%u) online\n",
nr_online[BCH_DATA_journal]);
return false; return false;
} }
if (!nr_online[BCH_DATA_btree]) { if (nr_online[BCH_DATA_btree] < nr_have[BCH_DATA_btree] &&
prt_printf(err, "No rw btree devices online\n"); nr_online[BCH_DATA_btree] < c->opts.metadata_replicas) {
prt_printf(err, "Insufficient rw btree devices (%u) online\n",
nr_online[BCH_DATA_btree]);
return false; return false;
} }
}
if (!nr_online[BCH_DATA_user]) { if (!(flags & BCH_FORCE_IF_DATA_DEGRADED)) {
prt_printf(err, "No rw user data devices online\n"); if (nr_online[BCH_DATA_user] < nr_have[BCH_DATA_user] &&
nr_online[BCH_DATA_user] < c->opts.data_replicas) {
prt_printf(err, "Insufficient rw user data devices (%u) online\n",
nr_online[BCH_DATA_user]);
return false; return false;
} }
if (!(flags & BCH_FORCE_IF_METADATA_DEGRADED)) {
if (nr_online[BCH_DATA_journal] < nr_have[BCH_DATA_journal] &&
nr_online[BCH_DATA_journal] < c->opts.metadata_replicas) {
prt_printf(err, "Insufficient rw journal devices (%u) online\n",
nr_online[BCH_DATA_journal]);
return false;
}
if (nr_online[BCH_DATA_btree] < nr_have[BCH_DATA_btree] &&
nr_online[BCH_DATA_btree] < c->opts.metadata_replicas) {
prt_printf(err, "Insufficient rw btree devices (%u) online\n",
nr_online[BCH_DATA_btree]);
return false;
}
}
if (!(flags & BCH_FORCE_IF_DATA_DEGRADED)) {
if (nr_online[BCH_DATA_user] < nr_have[BCH_DATA_user] &&
nr_online[BCH_DATA_user] < c->opts.data_replicas) {
prt_printf(err, "Insufficient rw user data devices (%u) online\n",
nr_online[BCH_DATA_user]);
return false;
}
}
} }
return bch2_can_read_fs_with_devs(c, devs, flags, err); return bch2_can_read_fs_with_devs(c, devs, flags, err);

View File

@ -36,8 +36,8 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask, bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, struct printbuf *); unsigned, struct printbuf *);
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, bool bch2_can_write_fs_with_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, struct printbuf *, bool); unsigned, struct printbuf *);
bool bch2_sb_has_journal(struct bch_sb *); bool bch2_sb_has_journal(struct bch_sb *);
unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);

View File

@ -2139,7 +2139,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
ptr->dev = BCH_SB_MEMBER_INVALID; ptr->dev = BCH_SB_MEMBER_INVALID;
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_evacuating;
} }
if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) { if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) {

View File

@ -150,7 +150,7 @@ static inline u64 dev_latency(struct bch_dev *ca)
static inline int dev_failed(struct bch_dev *ca) static inline int dev_failed(struct bch_dev *ca)
{ {
return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; return !ca || ca->mi.state == BCH_MEMBER_STATE_evacuating;
} }
/* /*
@ -254,7 +254,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
p.crc_retry_nr = f->failed_csum_nr; p.crc_retry_nr = f->failed_csum_nr;
p.has_ec &= ~f->failed_ec; p.has_ec &= ~f->failed_ec;
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { if (ca && ca->mi.state != BCH_MEMBER_STATE_evacuating) {
have_io_errors |= f->failed_io; have_io_errors |= f->failed_io;
have_io_errors |= f->failed_btree_validate; have_io_errors |= f->failed_btree_validate;
have_io_errors |= f->failed_ec; have_io_errors |= f->failed_ec;
@ -850,7 +850,7 @@ unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded
{ {
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->ptr.dev); struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->ptr.dev);
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) if (!ca || ca->mi.state == BCH_MEMBER_STATE_evacuating)
return 0; return 0;
return __extent_ptr_durability(ca, p); return __extent_ptr_durability(ca, p);

View File

@ -612,7 +612,7 @@ static bool bch2_bkey_needs_reconcile(struct bch_fs *c, struct bkey_s_c k,
r.ptrs_moving |= ptr_bit; r.ptrs_moving |= ptr_bit;
} }
if (ca->mi.state == BCH_MEMBER_STATE_failed) { if (ca->mi.state == BCH_MEMBER_STATE_evacuating) {
r.need_rb |= BIT(BCH_REBALANCE_data_replicas); r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
r.hipri = 1; r.hipri = 1;
r.ptrs_moving |= ptr_bit; r.ptrs_moving |= ptr_bit;
@ -622,7 +622,7 @@ static bool bch2_bkey_needs_reconcile(struct bch_fs *c, struct bkey_s_c k,
durability_acct += d; durability_acct += d;
if (ca->mi.state == BCH_MEMBER_STATE_failed) if (ca->mi.state == BCH_MEMBER_STATE_evacuating)
d = 0; d = 0;
durability += d; durability += d;

View File

@ -541,46 +541,17 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags, enum bch_member_state new_state, int flags,
struct printbuf *err) struct printbuf *err)
{ {
struct bch_devs_mask new_online_devs;
int nr_rw = 0, required;
lockdep_assert_held(&c->state_lock); lockdep_assert_held(&c->state_lock);
switch (new_state) { if (ca->mi.state == BCH_MEMBER_STATE_rw &&
case BCH_MEMBER_STATE_rw: new_state != BCH_MEMBER_STATE_rw) {
return true; struct bch_devs_mask new_rw_devs = c->rw_devs[0];
case BCH_MEMBER_STATE_ro: __clear_bit(ca->dev_idx, new_rw_devs.d);
if (ca->mi.state != BCH_MEMBER_STATE_rw)
return true;
/* do we have enough devices to write to? */ return bch2_can_write_fs_with_devs(c, new_rw_devs, flags, err);
for_each_member_device(c, ca2)
if (ca2 != ca)
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
: metadata_replicas_required(c),
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
: data_replicas_required(c));
return nr_rw >= required;
case BCH_MEMBER_STATE_failed:
case BCH_MEMBER_STATE_spare:
if (ca->mi.state != BCH_MEMBER_STATE_rw &&
ca->mi.state != BCH_MEMBER_STATE_ro)
return true;
/* do we have enough devices to read from? */
new_online_devs = c->online_devs;
__clear_bit(ca->dev_idx, new_online_devs.d);
return bch2_have_enough_devs(c, new_online_devs, flags, err,
test_bit(BCH_FS_rw, &c->flags));
default:
BUG();
} }
return true;
} }
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@ -602,7 +573,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bool do_reconcile_scan = bool do_reconcile_scan =
new_state == BCH_MEMBER_STATE_rw || new_state == BCH_MEMBER_STATE_rw ||
new_state == BCH_MEMBER_STATE_failed; new_state == BCH_MEMBER_STATE_evacuating;
struct reconcile_scan s = new_state == BCH_MEMBER_STATE_rw struct reconcile_scan s = new_state == BCH_MEMBER_STATE_rw
? (struct reconcile_scan) { .type = RECONCILE_SCAN_pending } ? (struct reconcile_scan) { .type = RECONCILE_SCAN_pending }
@ -653,7 +624,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
*/ */
bch2_dev_put(ca); bch2_dev_put(ca);
try(__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_failed, flags, err)); try(__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_evacuating, flags, err));
ret = fast_device_removal ret = fast_device_removal
? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags, err) ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags, err)
@ -980,6 +951,24 @@ int bch2_dev_online(struct bch_fs *c, const char *path, struct printbuf *err)
return 0; return 0;
} }
/*
 * Check whether @ca may be taken offline.
 *
 * Two candidate masks are built, each with @ca removed:
 *  - the online devices (c->online_devs), checked with
 *    bch2_can_read_fs_with_devs()
 *  - the rw devices (c->rw_devs[0]), checked with
 *    bch2_can_write_fs_with_devs(); this check is skipped when
 *    c->opts.read_only is set
 *
 * @flags is passed through unchanged to both checks.
 *
 * Returns 0 if both applicable checks pass. Otherwise appends
 * "Cannot offline required disk" to @err and returns the
 * device_state_not_allowed error.
 */
static int bch2_dev_may_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct printbuf *err)
{
	struct bch_devs_mask new_devs = c->online_devs;
	__clear_bit(ca->dev_idx, new_devs.d);

	struct bch_devs_mask new_rw_devs = c->rw_devs[0];
	/*
	 * The device being offlined must not be counted as writable: clear it
	 * from the rw mask (not from new_devs a second time).
	 */
	__clear_bit(ca->dev_idx, new_rw_devs.d);

	if (!bch2_can_read_fs_with_devs(c, new_devs, flags, err) ||
	    (!c->opts.read_only &&
	     !bch2_can_write_fs_with_devs(c, new_rw_devs, flags, err))) {
		prt_printf(err, "Cannot offline required disk\n");
		return bch_err_throw(c, device_state_not_allowed);
	}

	return 0;
}
int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct printbuf *err) int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct printbuf *err)
{ {
guard(rwsem_write)(&c->state_lock); guard(rwsem_write)(&c->state_lock);
@ -989,10 +978,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct pri
return 0; return 0;
} }
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, NULL)) { try(bch2_dev_may_offline(c, ca, flags, err));
prt_printf(err, "Cannot offline required disk\n");
return bch_err_throw(c, device_state_not_allowed);
}
__bch2_dev_offline(c, ca); __bch2_dev_offline(c, ca);
return 0; return 0;
@ -1150,10 +1136,7 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
__bch2_log_msg_start(ca->name, &buf); __bch2_log_msg_start(ca->name, &buf);
prt_printf(&buf, "offline from block layer\n"); prt_printf(&buf, "offline from block layer\n");
bool dev = bch2_dev_state_allowed(c, ca, bool dev = !bch2_dev_may_offline(c, ca, BCH_FORCE_IF_DEGRADED, &buf);
BCH_MEMBER_STATE_failed,
BCH_FORCE_IF_DEGRADED,
&buf);
if (!dev && sb) { if (!dev && sb) {
if (!surprise) if (!surprise)
sync_filesystem(sb); sync_filesystem(sb);

View File

@ -1277,7 +1277,7 @@ static int bch2_fs_may_start(struct bch_fs *c, struct printbuf *err)
bool missing = false; bool missing = false;
for_each_member_device(c, ca) for_each_member_device(c, ca)
if (!bch2_dev_is_online(ca) && if (!bch2_dev_is_online(ca) &&
(ca->mi.state != BCH_MEMBER_STATE_failed || (ca->mi.state != BCH_MEMBER_STATE_evacuating ||
bch2_dev_has_data(c, ca))) { bch2_dev_has_data(c, ca))) {
prt_printf(err, "Cannot mount without device %u\n", ca->dev_idx); prt_printf(err, "Cannot mount without device %u\n", ca->dev_idx);
guard(printbuf_indent)(err); guard(printbuf_indent)(err);
@ -1288,7 +1288,9 @@ static int bch2_fs_may_start(struct bch_fs *c, struct printbuf *err)
} }
} }
if (!bch2_have_enough_devs(c, c->online_devs, flags, err, !c->opts.read_only)) { if (!bch2_can_read_fs_with_devs(c, c->online_devs, flags, err) ||
(!c->opts.read_only &&
!bch2_can_write_fs_with_devs(c, c->rw_devs[0], flags, err))) {
prt_printf(err, "Missing devices\n"); prt_printf(err, "Missing devices\n");
for_each_member_device(c, ca) for_each_member_device(c, ca)
if (!bch2_dev_is_online(ca) && bch2_dev_has_data(c, ca)) { if (!bch2_dev_is_online(ca) && bch2_dev_has_data(c, ca)) {
@ -1307,8 +1309,6 @@ static int __bch2_fs_start(struct bch_fs *c, struct printbuf *err)
{ {
BUG_ON(test_bit(BCH_FS_started, &c->flags)); BUG_ON(test_bit(BCH_FS_started, &c->flags));
try(bch2_fs_may_start(c, err));
scoped_guard(rwsem_write, &c->state_lock) { scoped_guard(rwsem_write, &c->state_lock) {
scoped_guard(rcu) scoped_guard(rcu)
for_each_online_member_rcu(c, ca) for_each_online_member_rcu(c, ca)
@ -1318,6 +1318,8 @@ static int __bch2_fs_start(struct bch_fs *c, struct printbuf *err)
bch2_recalc_capacity(c); bch2_recalc_capacity(c);
} }
try(bch2_fs_may_start(c, err));
/* /*
* check mount options as early as possible; some can only be checked * check mount options as early as possible; some can only be checked
* after starting * after starting

View File

@ -63,7 +63,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
static inline bool bch2_dev_is_healthy(struct bch_dev *ca) static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
{ {
return bch2_dev_is_online(ca) && return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_failed; ca->mi.state != BCH_MEMBER_STATE_evacuating;
} }
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)

View File

@ -110,7 +110,7 @@ LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#define BCH_MEMBER_STATES() \ #define BCH_MEMBER_STATES() \
x(rw, 0) \ x(rw, 0) \
x(ro, 1) \ x(ro, 1) \
x(failed, 2) \ x(evacuating, 2) \
x(spare, 3) x(spare, 3)
enum bch_member_state { enum bch_member_state {