Update bcachefs sources to b4927db2cdc7 bcachefs: bcachefs_metadata_version_fast_device_removal

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-05-04 15:38:06 -04:00
parent 6e4bda5ad5
commit 401a20ed98
21 changed files with 530 additions and 249 deletions

View File

@ -1 +1 @@
5a0455ae19afb354634b3c5c9bf55d2171005a2f
b4927db2cdc7f124f968f9eaa1d785298ae31c1a

View File

@ -1255,6 +1255,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (unlikely(ret))
return ret;
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
req->nr_replicas = nr_replicas;
req->target = target;
req->ec = erasure_code;
@ -1262,9 +1265,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
req->flags = flags;
req->devs_have = devs_have;
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
req->ptrs.nr = 0;
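
Side note: the two hunks above reorder the CONFIG_BCACHEFS_ERASURE_CODING
gate so that erasure_code is clamped before it is copied into req->ec.
A minimal userspace sketch of the ordering hazard being fixed (names are
illustrative, not the commit's code):

#include <assert.h>
#include <stdbool.h>

struct req { bool ec; };

static void start(struct req *req, bool erasure_code, bool ec_enabled)
{
	if (!ec_enabled)	/* must run before the copy below... */
		erasure_code = false;
	req->ec = erasure_code;	/* ...or req->ec keeps a stale true */
}

int main(void)
{
	struct req r;
	start(&r, true, false);
	assert(!r.ec);
	return 0;
}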

View File

@ -696,7 +696,8 @@ struct bch_sb_field_ext {
x(stripe_lru, BCH_VERSION(1, 23)) \
x(casefolding, BCH_VERSION(1, 24)) \
x(extent_flags, BCH_VERSION(1, 25)) \
x(snapshot_deletion_v2, BCH_VERSION(1, 26))
x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
x(fast_device_removal, BCH_VERSION(1, 27))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
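
Side note: BCH_METADATA_VERSIONS is an x-macro list, so the new
fast_device_removal version is a single added entry that expands into the
enum below. A compilable sketch of the pattern (simplified names, assuming
BCH_VERSION packs major/minor as major << 10 | minor):

#include <stdio.h>

#define VERSION(major, minor) (((major) << 10) | (minor))

#define METADATA_VERSIONS()				\
	x(snapshot_deletion_v2, VERSION(1, 26))		\
	x(fast_device_removal,  VERSION(1, 27))

enum metadata_version {
#define x(t, n) metadata_version_##t = n,
	METADATA_VERSIONS()
#undef x
};

int main(void)
{
	printf("fast_device_removal = %d.%d\n",
	       metadata_version_fast_device_removal >> 10,
	       metadata_version_fast_device_removal & 1023);
	return 0;
}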

View File

@ -1079,6 +1079,10 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
bch2_sb_members_clean_deleted(c);
bch_err_fn(c, ret);
return ret;
}

View File

@ -212,17 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
rcu_read_lock();
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
const struct bch_devs_mask *m =
g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
bool ret = m ? test_bit(dev, m->d) : false;
rcu_read_unlock();
return ret;
return m ? test_bit(dev, m->d) : false;
}
default:
BUG();
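
Side note: this hunk removes the rcu_read_lock()/rcu_read_unlock() pair
from bch2_dev_in_target(), making it the caller's job to hold the RCU read
lock (a later hunk in this commit adds the caller-side locking around the
rebalance loops). A stubbed userspace model of that design change, with
read_lock()/read_unlock() standing in for RCU:

#include <assert.h>
#include <stdbool.h>

static int read_lock_held;
static void read_lock(void)   { read_lock_held++; }
static void read_unlock(void) { read_lock_held--; }

/* helper no longer locks; it assumes the caller already did */
static bool dev_in_target(void)
{
	assert(read_lock_held);
	return true;
}

/* caller brackets a whole batch of lookups with one critical section */
static bool any_ptr_in_target(unsigned nr_ptrs)
{
	bool ret = false;

	read_lock();
	for (unsigned i = 0; i < nr_ptrs; i++)
		ret |= dev_in_target();
	read_unlock();
	return ret;
}

int main(void)
{
	return any_ptr_in_target(3) ? 0 : 1;
}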

View File

@ -1121,8 +1121,9 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
struct bch_extent_ptr *ptr)
{
if (!opts->promote_target ||
!bch2_dev_in_target(c, ptr->dev, opts->promote_target))
unsigned target = opts->promote_target ?: opts->foreground_target;
if (target && !bch2_dev_in_target(c, ptr->dev, target))
return false;
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
@ -1135,33 +1136,43 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
struct bkey_s k,
struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bool have_cached_ptr;
rcu_read_lock();
if (!want_cached_ptr(c, opts, ptr)) {
bch2_bkey_drop_ptr_noerror(k, ptr);
goto out;
restart_drop_ptrs:
ptrs = bch2_bkey_ptrs(k);
have_cached_ptr = false;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
* Check if it's erasure coded - stripes can't contain cached
* data. Possibly something we can fix in the future?
*/
if (&entry->ptr == ptr && p.has_ec)
goto drop;
if (p.ptr.cached) {
if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
goto restart_drop_ptrs;
}
have_cached_ptr = true;
}
}
/*
* Stripes can't contain cached data, for - reasons.
*
* Possibly something we can fix in the future?
*/
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (&entry->ptr == ptr) {
if (p.has_ec)
bch2_bkey_drop_ptr_noerror(k, ptr);
else
ptr->cached = true;
goto out;
}
if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
goto drop;
BUG();
out:
ptr->cached = true;
rcu_read_unlock();
return;
drop:
rcu_read_unlock();
bch2_bkey_drop_ptr_noerror(k, ptr);
}
/*
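
Side note: the rewritten bch2_extent_ptr_set_cached() uses a
restart-the-scan idiom: dropping a pointer shifts the array, so instead of
continuing with stale cursors it jumps back to restart_drop_ptrs and
rescans. A self-contained sketch of the same idiom (illustrative code, not
the commit's):

#include <stdbool.h>
#include <stdio.h>

struct ptr { bool cached; };

static void drop(struct ptr *v, unsigned *nr, unsigned i)
{
	for (; i + 1 < *nr; i++)
		v[i] = v[i + 1];
	--*nr;
}

static void keep_one_cached(struct ptr *v, unsigned *nr)
{
restart:
	for (unsigned i = 0, have_cached = 0; i < *nr; i++)
		if (v[i].cached) {
			if (have_cached) {
				drop(v, nr, i);
				goto restart;	/* indices are stale now */
			}
			have_cached = 1;
		}
}

int main(void)
{
	struct ptr v[] = { { true }, { false }, { true }, { true } };
	unsigned nr = 4;

	keep_one_cached(v, &nr);
	printf("%u ptrs remain\n", nr);	/* 2: one cached, one dirty */
	return 0;
}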

View File

@ -790,6 +790,7 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry {
struct bch_inode_unpacked inode;
bool whiteout;
u64 count;
u64 i_size;
};
@ -818,12 +819,20 @@ static struct inode_walker inode_walker_init(void)
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
struct bch_inode_unpacked u;
return bch2_inode_unpack(inode, &u) ?:
darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
.whiteout = !bkey_is_inode(inode.k),
}));
if (ret)
return ret;
struct inode_walker_entry *n = &darray_last(w->inodes);
if (!n->whiteout) {
return bch2_inode_unpack(inode, &n->inode);
} else {
n->inode.bi_inum = inode.k->p.inode;
n->inode.bi_snapshot = inode.k->p.snapshot;
return 0;
}
}
static int get_inodes_all_snapshots(struct btree_trans *trans,
@ -843,13 +852,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
w->recalculate_sums = false;
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
for_each_btree_key_max_norestart(trans, iter,
BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
BTREE_ITER_all_snapshots, k, ret) {
ret = add_inode(c, w, k);
if (ret)
break;
if (bkey_is_inode(k.k))
add_inode(c, w, k);
}
bch2_trans_iter_exit(trans, &iter);
@ -861,63 +869,6 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return 0;
}
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
goto found;
return NULL;
found:
BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
struct printbuf buf = PRINTBUF;
int ret = 0;
if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
trans, snapshot_key_missing_inode_snapshot,
"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
struct bch_inode_unpacked new = i->inode;
new.bi_snapshot = k.k->p.snapshot;
ret = __bch2_fsck_write_inode(trans, &new) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto fsck_err;
}
printbuf_exit(&buf);
return i;
fsck_err:
printbuf_exit(&buf);
return ERR_PTR(ret);
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
struct inode_walker *w,
struct bkey_s_c k)
{
if (w->last_pos.inode != k.k->p.inode) {
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
}
w->last_pos = k.k->p;
return lookup_inode_for_snapshot(trans, w, k);
}
static int get_visible_inodes(struct btree_trans *trans,
struct inode_walker *w,
struct snapshots_seen *s,
@ -953,6 +904,80 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
goto found;
return NULL;
found:
BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
struct printbuf buf = PRINTBUF;
int ret = 0;
if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
trans, snapshot_key_missing_inode_snapshot,
"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
struct bch_inode_unpacked new = i->inode;
new.bi_snapshot = k.k->p.snapshot;
ret = __bch2_fsck_write_inode(trans, &new) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
if (ret)
goto fsck_err;
struct inode_walker_entry new_entry = *i;
new_entry.inode.bi_snapshot = k.k->p.snapshot;
new_entry.count = 0;
new_entry.i_size = 0;
while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
--i;
size_t pos = i - w->inodes.data;
ret = darray_insert_item(&w->inodes, pos, new_entry);
if (ret)
goto fsck_err;
ret = -BCH_ERR_transaction_restart_nested;
goto fsck_err;
}
printbuf_exit(&buf);
return i;
fsck_err:
printbuf_exit(&buf);
return ERR_PTR(ret);
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
struct inode_walker *w,
struct bkey_s_c k)
{
if (w->last_pos.inode != k.k->p.inode) {
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
}
w->last_pos = k.k->p;
return lookup_inode_for_snapshot(trans, w, k);
}
/*
* Prefer to delete the first one, since that will be the one at the wrong
* offset:
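
Side note: the relocated lookup_inode_for_snapshot() above now repairs a
missing snapshot entry by synthesizing one and inserting it at the right
slot, keeping w->inodes sorted by snapshot id (the while loop slides the
insert position left past larger ids). A compilable sketch of that sorted
insert (illustrative, not the darray API):

#include <stdio.h>
#include <string.h>

struct entry { unsigned snapshot; };

static void insert_sorted(struct entry *v, unsigned *nr, struct entry n)
{
	unsigned pos = *nr;

	while (pos && v[pos - 1].snapshot > n.snapshot)
		--pos;
	memmove(&v[pos + 1], &v[pos], (*nr - pos) * sizeof(*v));
	v[pos] = n;
	++*nr;
}

int main(void)
{
	struct entry v[8] = { { 2 }, { 5 }, { 9 } };
	unsigned nr = 3;

	insert_sorted(v, &nr, (struct entry) { 4 });
	for (unsigned i = 0; i < nr; i++)
		printf("%u ", v[i].snapshot);	/* 2 4 5 9 */
	printf("\n");
	return 0;
}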

View File

@ -240,6 +240,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
u64 v[2];
unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@ -284,13 +285,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
unpacked->bi_snapshot = k.k->p.snapshot;
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= 0;
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
@ -309,6 +309,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@ -326,8 +327,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
unpacked->bi_snapshot = k.k->p.snapshot;
return likely(k.k->type == KEY_TYPE_inode_v3)
? bch2_inode_unpack_v3(k, unpacked)
: bch2_inode_unpack_slowpath(k, unpacked);
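
Side note: the hunks above move the bi_snapshot assignment out of
bch2_inode_unpack() and into each unpack path, since the slowpath
memset()s the whole struct and would wipe an assignment made by the
wrapper beforehand. A tiny sketch of that init-order bug (illustrative):

#include <assert.h>
#include <string.h>

struct unpacked { unsigned snapshot; unsigned flags; };

static void unpack_old(struct unpacked *u, unsigned snapshot)
{
	u->snapshot = snapshot;		/* set first... */
	memset(u, 0, sizeof(*u));	/* ...then wiped: the bug */
}

static void unpack_new(struct unpacked *u, unsigned snapshot)
{
	memset(u, 0, sizeof(*u));
	u->snapshot = snapshot;		/* survives */
}

int main(void)
{
	struct unpacked u;

	unpack_old(&u, 7);
	assert(u.snapshot == 0);
	unpack_new(&u, 7);
	assert(u.snapshot == 7);
	return 0;
}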

View File

@ -1465,6 +1465,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
rcu_read_lock();
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
@ -1486,6 +1487,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
}
}
rcu_read_unlock();
}
static void __journal_write_alloc(struct journal *j,
@ -1498,7 +1500,8 @@ static void __journal_write_alloc(struct journal *j,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
BCH_DEV_WRITE_REF_journal_write);
if (!ca)
continue;
@ -1512,8 +1515,10 @@ static void __journal_write_alloc(struct journal *j,
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
sectors > ja->sectors_free)
sectors > ja->sectors_free) {
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
continue;
}
bch2_dev_stripe_increment(ca, &j->wp.stripe);
@ -1536,15 +1541,8 @@ static void __journal_write_alloc(struct journal *j,
}
}
/**
* journal_write_alloc - decide where to write next journal entry
*
* @j: journal object
* @w: journal buf (entry to be written)
*
* Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure
*/
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned *replicas)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
@ -1552,29 +1550,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
unsigned replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_need = min_t(unsigned, replicas_want,
READ_ONCE(c->opts.metadata_replicas_required));
bool advance_done = false;
rcu_read_lock();
/* We might run more than once if we have to stop and do discards: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
bkey_for_each_ptr(ptrs, p) {
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
if (ca)
replicas += ca->mi.durability;
}
retry_target:
devs = target_rw_devs(c, BCH_DATA_journal, target);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
retry_alloc:
__journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
__journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
if (likely(replicas >= replicas_want))
if (likely(*replicas >= replicas_want))
goto done;
if (!advance_done) {
@ -1583,18 +1570,16 @@ retry_alloc:
goto retry_alloc;
}
if (replicas < replicas_want && target) {
if (*replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
advance_done = false;
goto retry_target;
}
done:
rcu_read_unlock();
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@ -1780,13 +1765,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
BCH_DEV_WRITE_REF_journal_write);
if (!ca) {
/* XXX: fix this */
bch_err(c, "missing device %u for journal write", ptr->dev);
continue;
}
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
@ -2066,57 +2045,45 @@ CLOSURE_CALLBACK(bch2_journal_write)
j->write_start_time = local_clock();
mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
if (unlikely(ret))
goto err;
spin_lock(&j->lock);
if (nr_rw_members > 1)
w->separate_flush = true;
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
if (unlikely(ret))
goto err;
mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
if (ret)
goto err;
j->entry_bytes_written += vstruct_bytes(w->data);
unsigned replicas_allocated = 0;
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
ret = journal_write_alloc(j, w, &replicas_allocated);
if (!ret || !j->can_discard)
break;
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
if (ret && !bch2_journal_error(j)) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
__bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
le64_to_cpu(w->data->seq),
vstruct_sectors(w->data, c->block_bits),
bch2_err_str(ret));
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
if (ret)
goto err;
if (unlikely(ret))
goto err_allocate_write;
spin_lock(&j->lock);
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
*/
w->sectors = 0;
w->write_allocated = true;
j->entry_bytes_written += vstruct_bytes(w->data);
/*
* journal entry has been compacted and allocated, recalculate space
@ -2128,9 +2095,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
goto no_io;
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@ -2141,15 +2105,33 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
if (c->opts.nochanges)
goto no_io;
if (!JSET_NO_FLUSH(w->data))
continue_at(cl, journal_write_preflush, j->wq);
else
continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, j->wq);
return;
err_allocate_write:
if (!bch2_journal_error(j)) {
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
le64_to_cpu(w->data->seq),
vstruct_sectors(w->data, c->block_bits),
bch2_err_str(ret));
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
err:
bch2_fatal_error(c);
no_io:
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
}
continue_at(cl, journal_write_done, j->wq);
}
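
Side note: journal_write_alloc() now takes the replica count as an
out-parameter owned by bch2_journal_write(), so replicas already allocated
survive across the do-discards retry loop instead of being recounted from
w->key on every pass (the removed bkey_for_each_ptr recount). A minimal
sketch of the accumulate-across-retries shape (illustrative names):

#include <stdio.h>

/* each attempt adds whatever it managed to allocate to *replicas */
static int try_alloc(unsigned *replicas, unsigned want, unsigned got_now)
{
	*replicas += got_now;
	return *replicas >= want ? 0 : -1;
}

int main(void)
{
	unsigned replicas = 0, want = 2;

	if (try_alloc(&replicas, want, 1)) {
		/* caller discards to free space, then retries; the first
		 * replica is still accounted for in *replicas */
		int ret = try_alloc(&replicas, want, 1);
		printf("ret=%d replicas=%u\n", ret, replicas);	/* 0, 2 */
	}
	return 0;
}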

View File

@ -4,9 +4,11 @@
*/
#include "bcachefs.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "extents.h"
@ -20,7 +22,7 @@
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
unsigned dev_idx, int flags, bool metadata)
unsigned dev_idx, unsigned flags, bool metadata)
{
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
@ -37,11 +39,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
return 0;
}
static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
struct btree *b, unsigned dev_idx, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_buf k;
bch2_bkey_buf_init(&k);
bch2_bkey_buf_copy(&k, c, &b->key);
int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
bch_err_fn(c, ret);
bch2_bkey_buf_exit(&k, c);
return ret;
}
static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
unsigned dev_idx,
int flags)
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@ -77,9 +96,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}
static int bch2_dev_btree_drop_key(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
unsigned dev_idx,
struct bkey_buf *last_flushed,
unsigned flags)
{
struct btree_iter iter;
struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_dev_usrdata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
unsigned dev_idx, int flags)
unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@ -106,7 +143,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
static int bch2_dev_metadata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
unsigned dev_idx, int flags)
unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@ -137,20 +174,12 @@ retry:
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
bch2_bkey_buf_copy(&k, c, &b->key);
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
if (ret)
break;
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
bch_err_msg(c, ret, "updating btree node key");
if (ret)
break;
next:
@ -176,7 +205,57 @@ err:
return ret;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
unsigned flags)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
int ret = bkey_err(k);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
return 0;
if (ret)
return ret;
if (!bch2_bkey_has_device_c(k, dev_idx))
goto out;
ret = bkey_is_btree_ptr(k.k)
? bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags)
: bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
POS(dev_idx, 0),
POS(dev_idx, U64_MAX), 0, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
if (k.k->type != KEY_TYPE_backpointer)
continue;
data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
&last_flushed, flags);
}));
bch2_bkey_buf_exit(&last_flushed, trans->c);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
struct progress_indicator_state progress;
bch2_progress_init(&progress, c,

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_MIGRATE_H */

View File

@ -80,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
unsigned ptr_bit = 1;
unsigned rewrite_ptrs = 0;
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
rcu_read_unlock();
return rewrite_ptrs;
}
@ -132,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
}
incompressible:
if (opts->background_target)
if (opts->background_target) {
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
if (!p.ptr.cached &&
!bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
sectors += p.crc.compressed_size;
rcu_read_unlock();
}
return sectors;
}

View File

@ -737,11 +737,6 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.read_only = true;
}
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_info(c, "filesystem is an unresized image file, mounting ro");
c->opts.read_only = true;
}
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
@ -895,6 +890,17 @@ use_clean:
if (ret)
goto err;
ret = bch2_fs_resize_on_mount(c);
if (ret) {
up_write(&c->state_lock);
goto err;
}
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_info(c, "filesystem is an unresized image file, mounting ro");
c->opts.read_only = true;
}
if (!c->opts.read_only &&
(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");

View File

@ -525,6 +525,7 @@ int bch2_sb_member_alloc(struct bch_fs *c)
unsigned u64s;
int best = -1;
u64 best_last_mount = 0;
unsigned nr_deleted = 0;
if (dev_idx < BCH_SB_MEMBERS_MAX)
goto have_slot;
@ -535,7 +536,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
continue;
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
if (bch2_member_alive(&m))
nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
continue;
u64 last_mount = le64_to_cpu(m.last_mount);
@ -549,6 +553,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
goto have_slot;
}
if (nr_deleted)
bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
nr_deleted);
return -BCH_ERR_ENOSPC_sb_members;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
@ -564,3 +572,22 @@ have_slot:
c->disk_sb.sb->nr_devices = nr_devices;
return dev_idx;
}
void bch2_sb_members_clean_deleted(struct bch_fs *c)
{
mutex_lock(&c->sb_lock);
bool write_sb = false;
for (unsigned i = 0; i < c->sb.nr_devices; i++) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
memset(&m->uuid, 0, sizeof(m->uuid));
write_sb = true;
}
}
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}

View File

@ -320,7 +320,8 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
static inline bool bch2_member_alive(struct bch_member *m)
{
return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
!uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
}
static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
@ -381,5 +382,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
int bch2_sb_member_alloc(struct bch_fs *);
void bch2_sb_members_clean_deleted(struct bch_fs *);
#endif /* _BCACHEFS_SB_MEMBERS_H */

View File

@ -13,6 +13,10 @@
*/
#define BCH_SB_MEMBER_INVALID 255
#define BCH_SB_MEMBER_DELETED_UUID \
UUID_INIT(0xffffffff, 0xffff, 0xffff, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
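
Side note: BCH_SB_MEMBER_DELETED_UUID is a tombstone: fast device removal
stamps the removed slot with this uuid instead of zeroing it,
bch2_member_alive() treats it as dead, and bch2_sb_members_clean_deleted()
zeroes it for reuse once gc completes cleanly. A compilable model of that
slot lifecycle (illustrative, not the kernel uuid API):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct { unsigned char b[16]; } uuid;

static const uuid deleted_uuid = {
	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef }
};
static const uuid zero_uuid;

static bool uuid_eq(const uuid *a, const uuid *b)
{
	return !memcmp(a, b, sizeof(*a));
}

static bool member_alive(const uuid *u)
{
	return !uuid_eq(u, &zero_uuid) && !uuid_eq(u, &deleted_uuid);
}

int main(void)
{
	uuid m = { { 1, 2, 3 } };			/* live member */

	printf("alive: %d\n", member_alive(&m));	/* 1 */
	m = deleted_uuid;				/* fast removal */
	printf("alive: %d\n", member_alive(&m));	/* 0 */
	memset(&m, 0, sizeof(m));			/* clean_deleted */
	printf("alive: %d\n", member_alive(&m));	/* 0, slot reusable */
	return 0;
}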

View File

@ -1427,6 +1427,12 @@ static unsigned live_child(struct bch_fs *c, u32 id)
return ret;
}
static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
{
return snapshot_list_has_id(&d->delete_leaves, id) ||
interior_delete_has_id(&d->delete_interior, id) != 0;
}
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
@ -1468,11 +1474,20 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans,
return 0;
}
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter)
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
u64 inum = iter->btree_id != BTREE_ID_inodes
? iter->pos.inode
: iter->pos.offset;
if (*prev_inum == inum)
return false;
*prev_inum = inum;
bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
bch2_snapshot_tree(c, iter->pos.snapshot));
if (unlikely(ret)) {
@ -1486,6 +1501,129 @@ static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree
return ret;
}
static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
struct disk_reservation res = { 0 };
u64 prev_inum = 0;
d->pos.pos = POS_MIN;
if (!btree_type_has_snapshots(d->pos.btree))
continue;
int ret = for_each_btree_key_commit(trans, iter,
d->pos.btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
if (ret)
return ret;
}
return 0;
}
static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
struct bpos start, struct bpos end)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
struct disk_reservation res = { 0 };
d->pos.btree = btree;
d->pos.pos = POS_MIN;
int ret = for_each_btree_key_max_commit(trans, iter,
btree, start, end,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
return ret;
}
static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
struct disk_reservation res = { 0 };
u64 prev_inum = 0;
int ret = 0;
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
while (1) {
struct bkey_s_c k;
ret = lockrestart_do(trans,
bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
if (ret)
break;
if (!k.k)
break;
d->pos.btree = iter.btree_id;
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
continue;
if (snapshot_id_dying(d, k.k->p.snapshot)) {
struct bpos start = POS(k.k->p.offset, 0);
struct bpos end = POS(k.k->p.offset, U64_MAX);
ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
if (ret)
break;
bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
} else {
bch2_btree_iter_advance(trans, &iter);
}
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
goto err;
prev_inum = 0;
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.btree = iter.btree_id;
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
err:
bch2_disk_reservation_put(c, &res);
return ret;
}
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
@ -1500,6 +1638,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
struct snapshot_delete *d = &c->snapshot_delete;
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
unsigned live_children = 0;
int ret = 0;
if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
@ -1507,6 +1646,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
if (BCH_SNAPSHOT_DELETED(s.v))
return 0;
mutex_lock(&d->lock);
for (unsigned i = 0; i < 2; i++) {
u32 child = le32_to_cpu(s.v->children[i]);
@ -1517,7 +1657,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
if (live_children == 0) {
return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
} else if (live_children == 1) {
struct snapshot_interior_delete n = {
@ -1527,14 +1667,15 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
if (!n.live_child) {
bch_err(c, "error finding live child of snapshot %u", n.id);
return -EINVAL;
ret = -EINVAL;
} else {
ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
darray_push(&d->delete_interior, n);
}
return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
darray_push(&d->delete_interior, n);
} else {
return 0;
}
mutex_unlock(&d->lock);
return ret;
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
@ -1641,13 +1782,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
mutex_lock(&d->lock);
d->running = true;
d->pos = BBPOS_MIN;
ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
check_should_delete_snapshot(trans, k));
mutex_unlock(&d->lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "walking snapshots");
if (ret)
@ -1666,33 +1805,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
struct disk_reservation res = { 0 };
d->pos.pos = POS_MIN;
if (!btree_type_has_snapshots(d->pos.btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
d->pos.btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
}
ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
? delete_dead_snapshot_keys_v2(trans)
: delete_dead_snapshot_keys_v1(trans);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
darray_for_each(d->delete_leaves, i) {
ret = commit_do(trans, NULL, NULL, 0,
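
Side note: delete_dead_snapshot_keys_v2() above inverts the v1 strategy:
rather than scanning every key in every snapshot-aware btree, it walks the
inodes btree and, for each inode in a dying snapshot, deletes only the
extents/dirents/xattrs ranges belonging to that inode number. A rough
compilable model of the per-inode range deletion (illustrative):

#include <stdbool.h>
#include <stdio.h>

struct key { unsigned long inum; unsigned snapshot; };

static bool snapshot_dying(unsigned snap) { return snap == 3; }

/* stands in for a range-limited btree walk over one inode's keys */
static unsigned delete_range(const struct key *keys, unsigned nr,
			     unsigned long inum)
{
	unsigned deleted = 0;

	for (unsigned i = 0; i < nr; i++)
		deleted += keys[i].inum == inum &&
			   snapshot_dying(keys[i].snapshot);
	return deleted;
}

int main(void)
{
	struct key inodes[]  = { { 10, 1 }, { 10, 3 }, { 11, 2 } };
	struct key extents[] = { { 10, 3 }, { 10, 3 }, { 11, 2 } };
	unsigned deleted = 0;

	for (unsigned i = 0; i < 3; i++)
		if (snapshot_dying(inodes[i].snapshot))
			deleted += delete_range(extents, 3, inodes[i].inum);

	printf("deleted %u extent keys\n", deleted);	/* 2 */
	return 0;
}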

View File

@ -87,7 +87,8 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
struct printbuf buf = PRINTBUF;
prt_str(&buf, "requested incompat feature ");
bch2_version_to_text(&buf, version);
prt_str(&buf, " currently not enabled");
prt_str(&buf, " currently not enabled, allowed up to ");
bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_printf(&buf, "\n set version_upgrade=incompat to enable");
bch_notice(c, "%s", buf.buf);

View File

@ -214,7 +214,6 @@ static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
static int bch2_fs_init_rw(struct bch_fs *);
static int bch2_fs_resize_on_mount(struct bch_fs *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
@ -1150,15 +1149,11 @@ int bch2_fs_start(struct bch_fs *c)
cpu_to_le64(now);
rcu_read_unlock();
bch2_write_super(c);
/*
* Don't write superblock yet: recovery might have to downgrade
*/
mutex_unlock(&c->sb_lock);
ret = bch2_fs_resize_on_mount(c);
if (ret) {
up_write(&c->state_lock);
goto err;
}
rcu_read_lock();
for_each_online_member_rcu(c, ca)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
@ -1724,6 +1719,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
unsigned dev_idx = ca->dev_idx, data;
bool fast_device_removal = !bch2_request_incompat_feature(c,
bcachefs_metadata_version_fast_device_removal);
int ret;
down_write(&c->state_lock);
@ -1742,11 +1739,24 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
bch_err_msg(ca, ret, "bch2_dev_data_drop()");
ret = fast_device_removal
? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
: bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret)
goto err;
/* Check if device still has data */
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (!data_type_is_empty(i) &&
!data_type_is_hidden(i) &&
usage.buckets[i]) {
bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
__bch2_data_types[i], usage.buckets[i]);
ret = -EBUSY;
goto err;
}
ret = bch2_dev_remove_alloc(c, ca);
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
@ -1810,7 +1820,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
memset(&m->uuid, 0, sizeof(m->uuid));
if (fast_device_removal)
m->uuid = BCH_SB_MEMBER_DELETED_UUID;
else
memset(&m->uuid, 0, sizeof(m->uuid));
bch2_write_super(c);
@ -2120,7 +2134,7 @@ err:
return ret;
}
static int bch2_fs_resize_on_mount(struct bch_fs *c)
int bch2_fs_resize_on_mount(struct bch_fs *c)
{
for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
u64 old_nbuckets = ca->mi.nbuckets;
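
Side note: the bch2_dev_remove() hunks earlier in this file now pick
between the backpointer-based fast path and the full scan, then verify the
device is actually empty before releasing its member slot. A compilable
sketch of that verification step (data types and names are illustrative):

#include <stdbool.h>
#include <stdio.h>

enum data_type { DT_FREE, DT_SB, DT_JOURNAL, DT_BTREE, DT_USER, DT_NR };

/* superblock/journal buckets legitimately remain until the very end */
static bool type_is_hidden(enum data_type t)
{
	return t == DT_SB || t == DT_JOURNAL;
}

static int check_empty(const unsigned long buckets[DT_NR])
{
	for (unsigned t = 0; t < DT_NR; t++)
		if (t != DT_FREE && !type_is_hidden(t) && buckets[t]) {
			fprintf(stderr, "still has data: type %u, %lu buckets\n",
				t, buckets[t]);
			return -1;	/* -EBUSY in the real code */
		}
	return 0;
}

int main(void)
{
	unsigned long buckets[DT_NR] = { [DT_FREE] = 100, [DT_USER] = 3 };

	return check_empty(buckets) ? 1 : 0;
}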

View File

@ -38,6 +38,8 @@ void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
int bch2_fs_resize_on_mount(struct bch_fs *);
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);

View File

@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki
struct stdio_buf *buf = &stdio->output;
unsigned long flags;
ssize_t ret;
again:
if (stdio->done)
return -EPIPE;
spin_lock_irqsave(&buf->lock, flags);
ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
spin_unlock_irqrestore(&buf->lock, flags);