Kent Overstreet fa1882de61
Some checks failed
Nix Flake actions / nix-matrix (push) Has been cancelled
build / bcachefs-tools-msrv (push) Has been cancelled
.deb build orchestrator / source-only (push) Has been cancelled
.deb build orchestrator / publish (push) Has been cancelled
Nix Flake actions / ${{ matrix.name }} (${{ matrix.system }}) (push) Has been cancelled
.deb build orchestrator / obs (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:forky], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:forky], map[build-arch:amd64 host-arch:ppc64el machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:forky], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:trixie], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:trixie], map[build-arch:amd64 host-arch:ppc64el machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:trixie], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:unstable], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:unstable], map[build-arch:amd64 host-arch:ppc64el machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:debian version:unstable], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:plucky], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:plucky], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:questing], map[build-arch:amd64 host-arch:amd64 machine-arch:amd64 runs-on:ubuntu-24.04]) (push) Has been cancelled
.deb build orchestrator / buildd (map[name:ubuntu version:questing], map[build-arch:arm64 host-arch:arm64 machine-arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Has been cancelled
.deb build orchestrator / reprotest (push) Has been cancelled
Update bcachefs sources to e3e6e947d0c9 bcachefs: Clear recovery_passes_required when initializing
2025-10-23 22:22:03 -04:00

1110 lines
27 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc/accounting.h"
#include "alloc/background.h"
#include "alloc/backpointers.h"
#include "alloc/check.h"
#include "alloc/replicas.h"
#include "data/ec.h"
#include "data/migrate.h"
#include "data/rebalance.h"
#include "debug/sysfs.h"
#include "journal/init.h"
#include "journal/reclaim.h"
#include "init/dev.h"
#include "init/fs.h"
#include "sb/members.h"
/*
 * String tables naming each enumerated device io_ref, for diagnostics when
 * ref teardown stalls (see bch2_dev_io_ref_stop()).  The x() macro stringifies
 * each entry of the BCH_DEV_{READ,WRITE}_REFS() x-macro lists; the tables are
 * NULL terminated.
 */
#define x(n) #n,
const char * const bch2_dev_read_refs[] = {
	BCH_DEV_READ_REFS()
	NULL
};

const char * const bch2_dev_write_refs[] = {
	BCH_DEV_WRITE_REFS()
	NULL
};
#undef x
void bch2_devs_list_to_text(struct printbuf *out, struct bch_devs_list *d)
{
prt_char(out, '[');
darray_for_each(*d, i) {
if (i != d->data)
prt_char(out, ' ');
prt_printf(out, "%u", *i);
}
prt_char(out, ']');
}
/*
 * Check whether the device described by @sb is compatible with filesystem @c:
 * the block size must match exactly, and the bucket size must be large enough
 * to hold a btree node.  Returns 0 or a bch_err error code.
 */
static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

	if (le16_to_cpu(sb->block_size) != block_sectors(c))
		return bch_err_throw(c, mismatched_block_size);

	if (le16_to_cpu(m.bucket_size) <
	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
		return bch_err_throw(c, bucket_size_too_small);

	return 0;
}
/*
 * Look up the filesystem that has @dev open as a member device.
 *
 * Scans the global filesystem list under bch2_fs_list_lock; the member-device
 * walk is under RCU.  On success a closure ref on c->cl is taken for the
 * caller; returns NULL if no filesystem currently has this bdev.
 */
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
	guard(mutex)(&bch2_fs_list_lock);
	guard(rcu)();

	struct bch_fs *c;
	list_for_each_entry(c, &bch2_fs_list, list)
		for_each_member_device_rcu(c, ca, NULL)
			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
				closure_get(&c->cl);
				return c;
			}
	return NULL;
}
/*
 * Check whether superblock @sb belongs to the filesystem whose "best" (most
 * recent) superblock is @fs, and run split brain detection.
 *
 * Returns 0 if the device is a usable member, or a negative BCH_ERR_* code:
 * not a member, removed, mismatched block size, or split brain (unless
 * opts->no_splitbrain_check is set, in which case split brain is only
 * logged).
 */
int bch2_dev_in_fs(struct bch_sb_handle *fs,
		   struct bch_sb_handle *sb,
		   struct bch_opts *opts)
{
	if (fs == sb)
		return 0;

	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
		return -BCH_ERR_device_not_a_member_of_filesystem;

	if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
		return -BCH_ERR_device_has_been_removed;

	if (fs->sb->block_size != sb->sb->block_size)
		return -BCH_ERR_mismatched_block_size;

	/* Split brain detection needs the member seq fields: */
	if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
	    le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
		return 0;

	/*
	 * Same superblock sequence number but different write times: the two
	 * devices diverged - they were written independently of each other.
	 */
	if (fs->sb->seq == sb->sb->seq &&
	    fs->sb->write_time != sb->sb->write_time) {
		CLASS(printbuf, buf)();
		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);

		prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
		prt_newline(&buf);

		prt_bdevname(&buf, sb->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
		prt_newline(&buf);

		if (!opts->no_splitbrain_check)
			prt_printf(&buf, "Not using older sb");

		pr_err("%s", buf.buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	/*
	 * The filesystem's record of this member's seq is behind what the
	 * member itself claims: the member was written while this device was
	 * absent - also split brain.
	 */
	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
	u64 seq_from_fs		= le64_to_cpu(m.seq);
	u64 seq_from_member	= le64_to_cpu(sb->sb->seq);

	if (seq_from_fs && seq_from_fs < seq_from_member) {
		CLASS(printbuf, buf)();

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_str(&buf, " believes seq of ");
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " has %llu\n", seq_from_member);

		if (!opts->no_splitbrain_check) {
			prt_str(&buf, "Not using ");
			prt_bdevname(&buf, sb->bdev);
		}

		pr_err("%s", buf.buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	return 0;
}
/* Device startup/shutdown: */
/*
 * Shut down the read or write io_ref for @ca, blocking until all outstanding
 * refs are dropped.  For READ we also clear the device from the filesystem's
 * online devs mask.  If the ref is already at zero there's nothing to stop.
 * The ref-name table passed to enumerated_ref_stop() is used for reporting
 * which refs are still held if the wait stalls.
 */
void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
{
	if (rw == READ)
		clear_bit(ca->dev_idx, ca->fs->online_devs.d);

	if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
		enumerated_ref_stop(&ca->io_ref[rw],
				    rw == READ
				    ? bch2_dev_read_refs
				    : bch2_dev_write_refs);
}
/*
 * Transition @ca to read-only: stop writes, pull it out of the allocator,
 * recalculate filesystem capacity, then stop its journal.  Order matters -
 * see the comment below about the allocator.
 */
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	bch2_dev_io_ref_stop(ca, WRITE);

	/*
	 * The allocator thread itself allocates btree nodes, so stop it first:
	 */
	bch2_dev_allocator_remove(c, ca);
	bch2_recalc_capacity(c);
	bch2_dev_journal_stop(&c->journal, ca);
}
/*
 * Transition @ca to read-write: add it back to the allocator, recalculate
 * capacity, restart the write io_ref if it was stopped, and kick off
 * discards.  Caller must hold c->state_lock, and the member state must
 * already be rw.
 */
static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);

	bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
		enumerated_ref_start(&ca->io_ref[WRITE]);

	bch2_dev_do_discards(ca);
}
/*
 * Remove the sysfs symlinks between the device's bcachefs kobject and its
 * block device kobject ("bcachefs" under the bdev, "block" under the dev).
 */
void bch2_dev_unlink(struct bch_dev *ca)
{
	struct kobject *b;

	/*
	 * This is racy w.r.t. the underlying block device being hot-removed,
	 * which removes it from sysfs.
	 *
	 * It'd be lovely if we had a way to handle this race, but the sysfs
	 * code doesn't appear to provide a good method and block/holder.c is
	 * susceptible as well:
	 */
	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev &&
	    (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
		sysfs_remove_link(b, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}
}
/* kobject release callback: final free of the bch_dev once its last kobject
 * reference is dropped (see kobject_put() in bch2_dev_free()). */
static void bch2_dev_release(struct kobject *kobj)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

	kfree(ca);
}

/* Defines bch2_dev_ktype, wiring up bch2_dev_release and the sysfs ops: */
KTYPE(bch2_dev);
/*
 * Tear down a bch_dev: both io_refs must already be at zero.  Frees
 * everything allocated by __bch2_dev_alloc() (reverse order of init),
 * then drops the kobject ref - the struct itself is freed by
 * bch2_dev_release() once the last kobject ref goes away.
 */
void bch2_dev_free(struct bch_dev *ca)
{
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));

	cancel_work_sync(&ca->io_error_work);

	bch2_dev_unlink(ca);

	if (ca->kobj.state_in_sysfs)
		kobject_del(&ca->kobj);

	bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
	bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_allocator_background_exit(ca);
	bch2_dev_journal_exit(ca);

	free_percpu(ca->io_done);
	bch2_dev_buckets_free(ca);
	kfree(ca->sb_read_scratch);

	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

	enumerated_ref_exit(&ca->io_ref[WRITE]);
	enumerated_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_exit(&ca->ref);
#endif
	kobject_put(&ca->kobj);
}
/*
 * Take @ca offline: go read-only, stop the read io_ref, unlink from sysfs,
 * release the block device and exit the journal.  No-op if the device is
 * already offline (read io_ref at zero).  Caller holds c->state_lock.
 */
void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	if (enumerated_ref_is_zero(&ca->io_ref[READ]))
		return;

	__bch2_dev_read_only(c, ca);

	bch2_dev_io_ref_stop(ca, READ);
	bch2_dev_unlink(ca);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);
}
#ifndef CONFIG_BCACHEFS_DEBUG
/* percpu_ref release callback: wakes the waiter in bch2_dev_remove() once
 * the last ref on ca->ref is dropped after percpu_ref_kill(). */
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

	complete(&ca->ref_completion);
}
#endif
/*
 * Register @ca in sysfs under the filesystem's kobject ("dev-%u"), create its
 * per-device option files, and symlink it to its block device.  No-op if the
 * filesystem itself isn't in sysfs yet; each step is skipped if already done.
 */
int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		try(kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx));
		try(bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE));
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

		try(sysfs_create_link(block, &ca->kobj, "bcachefs"));
		try(sysfs_create_link(&ca->kobj, block, "block"));
	}

	return 0;
}
/*
 * Allocate and initialize a bch_dev from its superblock member entry.
 * Does not attach it to the filesystem (see bch2_dev_attach()) or open a
 * block device (see __bch2_dev_attach_bdev()).  Returns NULL on allocation
 * failure; partial allocations are cleaned up via bch2_dev_free().
 */
static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
					struct bch_member *member)
{
	struct bch_dev *ca;
	unsigned i;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		return NULL;

	kobject_init(&ca->kobj, &bch2_dev_ktype);
	init_completion(&ca->ref_completion);

	INIT_WORK(&ca->io_error_work, bch2_io_error_work);

	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

	ca->mi = bch2_mi_to_cpu(member);

	for (i = 0; i < ARRAY_SIZE(member->errors); i++)
		atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

	ca->uuid = member->uuid;

	/* Reserve enough buckets to hold BTREE_NODE_RESERVE btree nodes: */
	ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / btree_sectors(c));

	/* In debug builds ca->ref is a plain counter for easier leak tracking: */
#ifndef CONFIG_BCACHEFS_DEBUG
	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
		goto err;
#else
	atomic_long_set(&ca->ref, 1);
#endif

	mutex_init(&ca->bucket_backpointer_mismatch.lock);
	mutex_init(&ca->bucket_backpointer_empty.lock);

	bch2_dev_allocator_background_init(ca);

	if (enumerated_ref_init(&ca->io_ref[READ],  BCH_DEV_READ_REF_NR, NULL) ||
	    enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
	    !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
	    bch2_dev_buckets_alloc(c, ca) ||
	    !(ca->io_done	= alloc_percpu(*ca->io_done)))
		goto err;

	return ca;
err:
	bch2_dev_free(ca);
	return NULL;
}
/*
 * Attach an allocated bch_dev to filesystem @c at member slot @dev_idx:
 * records the index, gives the device a default name if it has none,
 * publishes it in c->devs[] (RCU), and registers it in sysfs (failure there
 * is only warned about).
 */
static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
			    unsigned dev_idx)
{
	ca->dev_idx = dev_idx;
	__set_bit(ca->dev_idx, ca->self.d);

	if (!ca->name[0])
		scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);

	if (bch2_dev_sysfs_online(c, ca))
		pr_warn("error creating sysfs objects");
}
/*
 * Allocate the bch_dev for member slot @dev_idx of @c's superblock and
 * attach it to the filesystem.  Returns 0 or an ENOMEM error (also injected
 * via the "dev_alloc" fault point).
 */
int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
	struct bch_dev *ca = NULL;

	if (bch2_fs_init_fault("dev_alloc"))
		return bch_err_throw(c, ENOMEM_dev_alloc);

	ca = __bch2_dev_alloc(c, &member);
	if (!ca)
		return bch_err_throw(c, ENOMEM_dev_alloc);

	/* ca->fs is set by bch2_dev_attach() before the device is published: */
	bch2_dev_attach(c, ca, dev_idx);
	return 0;
}
/*
 * Attach an opened block device (via @sb) to @ca.  Validates that the device
 * isn't already online and is large enough, initializes the journal from the
 * superblock, then takes ownership of the sb handle (@sb is zeroed) and
 * starts the read io_ref.  On error, ownership of @sb stays with the caller.
 */
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb,
				  struct printbuf *err)
{
	if (bch2_dev_is_online(ca)) {
		prt_printf(err, "already have device online in slot %u\n",
			   sb->sb->dev_idx);
		return bch_err_throw(ca->fs, device_already_online);
	}

	if (get_capacity(sb->bdev->bd_disk) <
	    ca->mi.bucket_size * ca->mi.nbuckets) {
		prt_printf(err, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)\n",
			   get_capacity(sb->bdev->bd_disk),
			   ca->mi.bucket_size * ca->mi.nbuckets,
			   ca->mi.nbuckets);
		return bch_err_throw(ca->fs, device_size_too_small);
	}

	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));

	try(bch2_dev_journal_init(ca, sb->sb));

	/* Use the block device name (e.g. "sda") as the device's name: */
	CLASS(printbuf, name)();
	prt_bdevname(&name, sb->bdev);
	strscpy(ca->name, name.buf, sizeof(ca->name));

	/* Commit: */
	ca->disk_sb = *sb;
	memset(sb, 0, sizeof(*sb));

	/*
	 * Stash pointer to the filesystem for blk_holder_ops - note that once
	 * attached to a filesystem, we will always close the block device
	 * before tearing down the filesystem object.
	 */
	ca->disk_sb.holder->c = ca->fs;

	ca->dev = ca->disk_sb.bdev->bd_dev;

	enumerated_ref_start(&ca->io_ref[READ]);

	return 0;
}
/*
 * Attach a block device to the member device it belongs to in @c.  If the
 * incoming superblock is newer than the filesystem's, the filesystem state is
 * updated from it first.  Marks the device online and wakes rebalance, which
 * may now have a new target.  Caller holds c->state_lock.
 */
int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb, struct printbuf *err)
{
	lockdep_assert_held(&c->state_lock);

	if (le64_to_cpu(sb->sb->seq) >
	    le64_to_cpu(c->disk_sb.sb->seq))
		bch2_sb_to_fs(c, sb->sb);

	BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));

	struct bch_dev *ca = bch2_dev_locked(c, sb->sb->dev_idx);

	try(__bch2_dev_attach_bdev(ca, sb, err));

	set_bit(ca->dev_idx, c->online_devs.d);

	bch2_dev_sysfs_online(c, ca);

	bch2_rebalance_wakeup(c);
	return 0;
}
/* Device management: */
/*
 * Note: this function is also used by the error paths - when a particular
 * device sees an error, we call it to determine whether we can just set the
 * device RO, or - if this function returns false - we'll set the whole
 * filesystem RO:
 *
 * XXX: maybe we should be more explicit about whether we're changing state
 * because we got an error or what have you?
 */
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
			    enum bch_member_state new_state, int flags,
			    struct printbuf *err)
{
	struct bch_devs_mask new_online_devs;
	int nr_rw = 0, required;

	lockdep_assert_held(&c->state_lock);

	switch (new_state) {
	case BCH_MEMBER_STATE_rw:
		/* Going rw never loses access to data: */
		return true;
	case BCH_MEMBER_STATE_ro:
		if (ca->mi.state != BCH_MEMBER_STATE_rw)
			return true;

		/* do we have enough devices to write to? */
		for_each_member_device(c, ca2)
			if (ca2 != ca)
				nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;

		/*
		 * With the FORCE flags, only the hard minimum replica counts
		 * are required instead of the configured ones:
		 */
		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
			       ? c->opts.metadata_replicas
			       : metadata_replicas_required(c),
			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
			       ? c->opts.data_replicas
			       : data_replicas_required(c));

		return nr_rw >= required;
	case BCH_MEMBER_STATE_failed:
	case BCH_MEMBER_STATE_spare:
		if (ca->mi.state != BCH_MEMBER_STATE_rw &&
		    ca->mi.state != BCH_MEMBER_STATE_ro)
			return true;

		/* do we have enough devices to read from? */
		new_online_devs = c->online_devs;
		__clear_bit(ca->dev_idx, new_online_devs.d);

		return bch2_have_enough_devs(c, new_online_devs, flags, err,
					     test_bit(BCH_FS_rw, &c->flags));
	default:
		BUG();
	}
}
/*
 * Change @ca's member state, after checking bch2_dev_state_allowed().
 * Leaving rw stops writes before the superblock update; entering rw starts
 * them after.  The new state is persisted to the superblock, and rebalance
 * is woken since writable targets may have changed.  Caller holds
 * c->state_lock (see bch2_dev_set_state()).
 */
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
			 enum bch_member_state new_state, int flags,
			 struct printbuf *err)
{
	if (ca->mi.state == new_state)
		return 0;

	if (!bch2_dev_state_allowed(c, ca, new_state, flags, err))
		return bch_err_throw(c, device_state_not_allowed);

	if (new_state != BCH_MEMBER_STATE_rw)
		__bch2_dev_read_only(c, ca);

	bch_notice(ca, "%s", bch2_member_states[new_state]);

	scoped_guard(mutex, &c->sb_lock) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
		SET_BCH_MEMBER_STATE(m, new_state);
		bch2_write_super(c);
	}

	if (new_state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	bch2_rebalance_wakeup(c);

	return 0;
}
/* Locked wrapper around __bch2_dev_set_state(): takes c->state_lock. */
int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
		       enum bch_member_state new_state, int flags,
		       struct printbuf *err)
{
	guard(rwsem_write)(&c->state_lock);
	return __bch2_dev_set_state(c, ca, new_state, flags, err);
}
/* Device add/removal: */
/*
 * Remove @ca from the filesystem: fail the device, migrate or drop its data,
 * verify it's empty, delete its alloc info, flush the journal of references
 * to it, take it offline and free its member slot.
 *
 * Consumes the caller's ref on ca->ref whether we succeed or fail.  On
 * failure the device is put back rw if it was rw and is still online.
 */
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
		    struct printbuf *err)
{
	unsigned dev_idx = ca->dev_idx, data;
	/*
	 * Fast removal walks backpointers instead of scanning all extents -
	 * only safe with no stale pointers, and requires the corresponding
	 * incompat feature bit:
	 */
	bool fast_device_removal = (c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs)) &&
		!bch2_request_incompat_feature(c,
					       bcachefs_metadata_version_fast_device_removal);
	int ret;

	guard(rwsem_write)(&c->state_lock);

	/*
	 * We consume a reference to ca->ref, regardless of whether we succeed
	 * or fail:
	 */
	bch2_dev_put(ca);

	try(__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_failed, flags, err));

	ret = fast_device_removal
		? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags, err)
		: (bch2_dev_data_drop(c, ca->dev_idx, flags, err) ?:
		   bch2_dev_remove_stripes(c, ca->dev_idx, flags, err));
	if (ret)
		goto err;

	/* Check if device still has data before blowing away alloc info */
	struct bch_dev_usage usage = bch2_dev_usage_read(ca);
	for (unsigned i = 0; i < BCH_DATA_NR; i++)
		if (!data_type_is_empty(i) &&
		    !data_type_is_hidden(i) &&
		    usage.buckets[i]) {
			prt_printf(err, "Remove failed: still has data (%s, %llu buckets)\n",
				   __bch2_data_types[i], usage.buckets[i]);
			ret = -EBUSY;
			goto err;
		}

	ret = bch2_dev_remove_alloc(c, ca);
	if (ret) {
		prt_printf(err, "bch2_dev_remove_alloc() error: %s\n", bch2_err_str(ret));
		goto err;
	}

	/*
	 * We need to flush the entire journal to get rid of keys that reference
	 * the device being removed before removing the superblock entry
	 */
	bch2_journal_flush_all_pins(&c->journal);

	/*
	 * this is really just needed for the bch2_replicas_gc_(start|end)
	 * calls, and could be cleaned up:
	 */
	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
	if (ret) {
		prt_printf(err, "bch2_journal_flush_device_pins() error: %s\n", bch2_err_str(ret));
		goto err;
	}

	ret = bch2_journal_flush(&c->journal);
	if (ret) {
		prt_printf(err, "bch2_journal_flush() error: %s\n", bch2_err_str(ret));
		goto err;
	}

	ret = bch2_replicas_gc2(c);
	if (ret) {
		prt_printf(err, "bch2_replicas_gc2() error: %s\n", bch2_err_str(ret));
		goto err;
	}

	/* Final check against the replicas table: */
	data = bch2_dev_has_data(c, ca);
	if (data) {
		prt_str(err, "Remove failed, still has data (");
		prt_bitflags(err, __bch2_data_types, data);
		prt_str(err, ")\n");
		ret = -EBUSY;
		goto err;
	}

	__bch2_dev_offline(c, ca);

	scoped_guard(mutex, &c->sb_lock)
		rcu_assign_pointer(c->devs[ca->dev_idx], NULL);

	/* Drop the last ref on ca->ref and wait for it to hit zero: */
#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_kill(&ca->ref);
#else
	ca->dying = true;
	bch2_dev_put(ca);
#endif
	wait_for_completion(&ca->ref_completion);

	bch2_dev_free(ca);

	/*
	 * Free this device's slot in the bch_member array - all pointers to
	 * this device must be gone:
	 */
	scoped_guard(mutex, &c->sb_lock) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);

		if (fast_device_removal)
			m->uuid = BCH_SB_MEMBER_DELETED_UUID;
		else
			memset(&m->uuid, 0, sizeof(m->uuid));

		bch2_write_super(c);
	}
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags) &&
	    ca->mi.state == BCH_MEMBER_STATE_rw &&
	    !enumerated_ref_is_zero(&ca->io_ref[READ]))
		__bch2_dev_read_write(c, ca);
	return ret;
}
/*
 * Add a brand new device to a running filesystem: read and validate its
 * superblock, allocate a member slot, attach it, initialize its usage and
 * (if the fs is started) mark its superblock, initialize freespace, and
 * allocate its journal.
 *
 * Fix vs. original: the "error creating new label" message was appended to
 * @err unconditionally - it's now only printed when __bch2_dev_group_set()
 * actually fails.
 */
int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = {};
	struct bch_dev *ca = NULL;
	CLASS(printbuf, label)();
	int ret = 0;

	ret = bch2_read_super(path, &opts, &sb);
	if (ret) {
		prt_printf(err, "error reading superblock: %s\n", bch2_err_str(ret));
		goto err;
	}

	struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

	/* Resolve the new device's label, if it has one: */
	if (BCH_MEMBER_GROUP(&dev_mi)) {
		bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
		if (label.allocation_failure) {
			ret = -ENOMEM;
			goto err;
		}
	}

	/*
	 * A single-device filesystem isn't on bch2_fs_list; going multi-device
	 * means registering there, which requires the UUID be unique:
	 */
	if (list_empty(&c->list)) {
		scoped_guard(mutex, &bch2_fs_list_lock) {
			if (__bch2_uuid_to_fs(c->sb.uuid))
				ret = bch_err_throw(c, filesystem_uuid_already_open);
			else
				list_add(&c->list, &bch2_fs_list);
		}

		if (ret) {
			prt_printf(err, "cannot go multidevice: filesystem UUID already open\n");
			goto err;
		}
	}

	ret = bch2_dev_may_add(sb.sb, c);
	if (ret)
		goto err;

	ca = __bch2_dev_alloc(c, &dev_mi);
	if (!ca) {
		ret = -ENOMEM;
		goto err;
	}

	ret = __bch2_dev_attach_bdev(ca, &sb, err);
	if (ret)
		goto err;

	scoped_guard(rwsem_write, &c->state_lock) {
		scoped_guard(mutex, &c->sb_lock) {
			SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);

			ret = bch2_sb_from_fs(c, ca);
			if (ret) {
				prt_printf(err, "error setting up new superblock: %s\n", bch2_err_str(ret));
				goto err;
			}

			/* NOTE(review): this fault path leaves ret == 0, so the
			 * injected failure reports success to the caller -
			 * confirm intended: */
			if (dynamic_fault("bcachefs:add:no_slot"))
				goto err;

			ret = bch2_sb_member_alloc(c);
			if (ret < 0) {
				prt_printf(err, "error allocating superblock member slot: %s\n", bch2_err_str(ret));
				goto err;
			}
			unsigned dev_idx = ret;
			ret = 0;

			/* success: */

			dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
			*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
			ca->disk_sb.sb->dev_idx = dev_idx;
			bch2_dev_attach(c, ca, dev_idx);

			set_bit(ca->dev_idx, c->online_devs.d);

			if (BCH_MEMBER_GROUP(&dev_mi)) {
				ret = __bch2_dev_group_set(c, ca, label.buf);
				if (ret) {
					prt_printf(err, "error creating new label: %s\n",
						   bch2_err_str(ret));
					goto err_late;
				}
			}

			bch2_write_super(c);
		}

		ret = bch2_dev_usage_init(ca, false);
		if (ret)
			goto err_late;

		if (test_bit(BCH_FS_started, &c->flags)) {
			ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
			if (ret) {
				prt_printf(err, "error marking new superblock: %s\n", bch2_err_str(ret));
				goto err_late;
			}

			ret = bch2_fs_freespace_init(c);
			if (ret) {
				prt_printf(err, "error initializing free space: %s\n", bch2_err_str(ret));
				goto err_late;
			}

			if (ca->mi.state == BCH_MEMBER_STATE_rw)
				__bch2_dev_read_write(c, ca);

			ret = bch2_dev_journal_alloc(ca, false);
			if (ret) {
				prt_printf(err, "error allocating journal: %s\n", bch2_err_str(ret));
				goto err_late;
			}
		}

		/*
		 * We just changed the superblock UUID, invalidate cache and send a
		 * uevent to update /dev/disk/by-uuid
		 */
		invalidate_bdev(ca->disk_sb.bdev);

		char uuid_str[37];
		snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);

		char *envp[] = {
			"CHANGE=uuid",
			uuid_str,
			NULL,
		};
		kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
	}
out:
	bch_err_fn(c, ret);
	return ret;
err:
	if (ca)
		bch2_dev_free(ca);
	bch2_free_super(&sb);
	goto out;
err_late:
	/*
	 * Once attached, the device is owned by the filesystem and will be
	 * freed on fs shutdown - don't free it here:
	 */
	ca = NULL;
	goto err;
}
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path, struct printbuf *err)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	guard(rwsem_write)(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);
	if (ret) {
		prt_printf(err, "error reading superblock: %s\n", bch2_err_str(ret));
		return ret;
	}

	dev_idx = sb.sb->dev_idx;

	/* Membership + split brain checks against the fs's superblock: */
	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
	if (ret) {
		prt_printf(err, "device not a member of fs: %s\n", bch2_err_str(ret));
		goto err;
	}

	/* On success this consumes sb (the handle is zeroed): */
	ret = bch2_dev_attach_bdev(c, &sb, err);
	if (ret)
		goto err;

	ca = bch2_dev_locked(c, dev_idx);

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	if (ret) {
		prt_printf(err, "bch2_trans_mark_dev_sb() error: %s\n", bch2_err_str(ret));
		goto err;
	}

	if (ca->mi.state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	if (!ca->mi.freespace_initialized) {
		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		if (ret) {
			prt_printf(err, "bch2_dev_freespace_init() error: %s\n", bch2_err_str(ret));
			goto err;
		}
	}

	if (!ca->journal.nr) {
		ret = bch2_dev_journal_alloc(ca, false);
		if (ret) {
			prt_printf(err, "bch2_dev_journal_alloc() error: %s\n", bch2_err_str(ret));
			goto err;
		}
	}

	scoped_guard(mutex, &c->sb_lock) {
		bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
			cpu_to_le64(ktime_get_real_seconds());
		bch2_write_super(c);
	}
	return 0;
err:
	bch2_free_super(&sb);
	return ret;
}
/*
 * Take @ca offline, if allowed: refuses (device_state_not_allowed) when the
 * remaining devices wouldn't be sufficient to keep the filesystem readable.
 * Already-offline is treated as success.
 */
int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct printbuf *err)
{
	guard(rwsem_write)(&c->state_lock);

	if (!bch2_dev_is_online(ca)) {
		prt_printf(err, "Already offline\n");
		return 0;
	}

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, NULL)) {
		prt_printf(err, "Cannot offline required disk\n");
		return bch_err_throw(c, device_state_not_allowed);
	}

	__bch2_dev_offline(c, ca);
	return 0;
}
/*
 * Resize @ca to @nbuckets (grow only).  Validates the new size against
 * BCH_MEMBER_NBUCKETS_MAX and, if online, against the block device capacity;
 * then resizes the in-memory bucket arrays, re-marks the superblock, persists
 * the new nbuckets, initializes freespace for the added range, and
 * recalculates filesystem capacity.
 */
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct printbuf *err)
{
	u64 old_nbuckets;
	int ret = 0;

	guard(rwsem_write)(&c->state_lock);
	old_nbuckets = ca->mi.nbuckets;

	if (nbuckets < ca->mi.nbuckets) {
		prt_printf(err, "Cannot shrink yet\n");
		return -EINVAL;
	}

	if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
		prt_printf(err, "New device size too big (%llu greater than max %u)\n",
			   nbuckets, BCH_MEMBER_NBUCKETS_MAX);
		return bch_err_throw(c, device_size_too_big);
	}

	if (bch2_dev_is_online(ca) &&
	    get_capacity(ca->disk_sb.bdev->bd_disk) <
	    ca->mi.bucket_size * nbuckets) {
		prt_printf(err, "New size %llu larger than device size %llu\n",
			   ca->mi.bucket_size * nbuckets,
			   get_capacity(ca->disk_sb.bdev->bd_disk));
		return bch_err_throw(c, device_size_too_small);
	}

	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
	if (ret) {
		prt_printf(err, "bch2_dev_buckets_resize() error: %s\n", bch2_err_str(ret));
		return ret;
	}

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	if (ret) {
		prt_printf(err, "bch2_trans_mark_dev_sb() error: %s\n", bch2_err_str(ret));
		return ret;
	}

	scoped_guard(mutex, &c->sb_lock) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
		m->nbuckets = cpu_to_le64(nbuckets);
		bch2_write_super(c);
	}

	/* Freespace for the new buckets - skipped if init is still pending: */
	if (ca->mi.freespace_initialized) {
		ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
		if (ret) {
			prt_printf(err, "__bch2_dev_resize_alloc() error: %s\n", bch2_err_str(ret));
			return ret;
		}
	}

	bch2_recalc_capacity(c);
	return 0;
}
/* Resize on mount */
/*
 * After growing a device: credit the new buckets to the free-bucket
 * accounting counter, then initialize freespace btree entries for the
 * added bucket range [old_nbuckets, new_nbuckets).
 */
int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
{
	struct bch_fs *c = ca->fs;
	u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };

	int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
			bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
						  .dev = ca->dev_idx,
						  .data_type = BCH_DATA_free));
	if (ret)
		return ret;

	return bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
}
/*
 * Look up a member device by name, with or without a leading "/dev/".
 * Returns with a ref on ca->ref held, or ERR_PTR(-BCH_ERR_ENOENT_dev_not_found).
 */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
	static const char prefix[] = "/dev/";

	if (!strncmp(name, prefix, sizeof(prefix) - 1))
		name += sizeof(prefix) - 1;

	for_each_member_device(c, ca)
		if (!strcmp(ca->name, name))
			return ca;

	return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
/* blk_holder_ops: */

/*
 * Resolve a block device's holder to its bch_fs, taking a ro ref on it.
 * Called with and releases bdev->bd_holder_lock (hence __releases).  Waits
 * for the filesystem to finish starting, so callers see a consistent state.
 * Returns NULL if there's no filesystem or it's going away.
 */
static struct bch_fs *bdev_get_fs(struct block_device *bdev)
	__releases(&bdev->bd_holder_lock)
{
	struct bch_sb_handle_holder *holder = bdev->bd_holder;
	struct bch_fs *c = holder->c;

	if (c && !bch2_ro_ref_tryget(c))
		c = NULL;

	mutex_unlock(&bdev->bd_holder_lock);

	if (c)
		wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
	return c;
}

/* Scope guard: puts the ro ref from bdev_get_fs() when it goes out of scope: */
DEFINE_CLASS(bdev_get_fs, struct bch_fs *,
	     bch2_ro_ref_put(_T), bdev_get_fs(bdev),
	     struct block_device *bdev);
/* returns with ref on ca->ref */
static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
{
	for_each_member_device(c, ca)
		if (ca->disk_sb.bdev == bdev)
			return ca;
	return NULL;
}
/*
 * blk_holder_ops->mark_dead: the block layer tells us a member device is
 * gone.  If the remaining devices suffice (per bch2_dev_state_allowed with
 * BCH_FORCE_IF_DEGRADED), just offline the dead device; otherwise sync/evict
 * what we can and go emergency read-only.  @surprise means the device
 * vanished without warning, in which case we don't attempt a sync through it.
 */
static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	CLASS(bdev_get_fs, c)(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
	}

	guard(rwsem_write)(&c->state_lock);
	struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
	if (ca) {
		CLASS(printbuf, buf)();
		__bch2_log_msg_start(ca->name, &buf);

		prt_printf(&buf, "offline from block layer\n");

		/* Can we keep running without this device? */
		bool dev = bch2_dev_state_allowed(c, ca,
						  BCH_MEMBER_STATE_failed,
						  BCH_FORCE_IF_DEGRADED,
						  &buf);

		/* Going read-only: flush the VFS while we still can */
		if (!dev && sb) {
			if (!surprise)
				sync_filesystem(sb);
			shrink_dcache_sb(sb);
			evict_inodes(sb);
		}

		if (dev) {
			__bch2_dev_offline(c, ca);
		} else {
			bch2_journal_flush(&c->journal);
			bch2_fs_emergency_read_only2(c, &buf);
		}

		bch2_print_str(c, KERN_ERR, buf.buf);

		bch2_dev_put(ca);
	}

	if (sb)
		up_read(&sb->s_umount);
}
/*
 * blk_holder_ops->sync: sync the filesystem mounted on this block device,
 * if there's a VFS superblock.
 */
static void bch2_fs_bdev_sync(struct block_device *bdev)
{
	CLASS(bdev_get_fs, c)(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		guard(rwsem_read)(&sb->s_umount);
		sync_filesystem(sb);
	}
}
/* Holder ops registered when opening member block devices: */
const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
	.mark_dead		= bch2_fs_bdev_mark_dead,
	.sync			= bch2_fs_bdev_sync,
};