Mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-03-31 00:00:03 +03:00)
Update bcachefs sources to f81dc88f0c80 bcachefs: bch2_btree_insert() - add btree iter flags
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent d456f9e97a
commit 7d79fba1af
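The headline API change in this sync: bch2_btree_insert() now takes btree iter flags, so every caller gains a trailing argument. A minimal before/after sketch of the caller-side change, taken from the update_inode() hunk below (the authoritative prototype lives in libbcachefs/btree_update.h; the meaning of the extra argument as iter flags comes from the commit subject):

	/* before: commit flags only */
	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
				NULL, 0);

	/* after: one more argument for btree iter flags (0 = defaults) */
	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
				NULL, 0, 0);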
Changed paths:
.bcachefs_revision
bch_bindgen/src
c_src
include/linux
libbcachefs: alloc_background.c, alloc_background.h, alloc_background_format.h, alloc_foreground.c, backpointers.c, bcachefs.h, bcachefs_format.h, bcachefs_ioctl.h, bkey_methods.c, btree_cache.c, btree_gc.c, btree_gc.h, btree_io.c, btree_io.h, btree_iter.c, btree_iter.h, btree_journal_iter.c, btree_journal_iter.h, btree_locking.c, btree_trans_commit.c, btree_types.h, btree_update.c, btree_update.h, btree_update_interior.c, btree_update_interior.h, btree_write_buffer.c, btree_write_buffer.h, btree_write_buffer_types.h, buckets.c, buckets.h, buckets_types.h, chardev.c, dirent.c, disk_accounting.c, disk_accounting.h, disk_accounting_format.h, disk_accounting_types.h, disk_groups.c, ec.c, errcode.h, error.c, error.h, fs-io-direct.c, fs-io.c, fs-ioctl.c, fs.c, fsck.c, inode.c, io_write.c, journal.c, journal.h, journal_io.c, lru.c, move.c, opts.c, opts.h, printbuf.c, printbuf.h, recovery.c, recovery_passes.c, recovery_passes_types.h, reflink.c, replicas.c, replicas.h, replicas_types.h, sb-clean.c, sb-downgrade.c, sb-downgrade.h, sb-errors_format.h, snapshot.c, str_hash.h, subvolume.c, super-io.c, super.c, sysfs.c, tests.c, thread_with_file.c, thread_with_file.h, thread_with_file_types.h, trace.h, two_state_shared_lock.h, util.h
.bcachefs_revision
@@ -1 +1 @@
-8528bde1b66bab9a0abc2f521523abd00049c81b
+f81dc88f0c80be08f1703852be83a9b75e626228
@@ -49,6 +49,7 @@ pub enum BkeyValC<'a> {
     snapshot_tree(&'a c::bch_snapshot_tree),
     logged_op_truncate(&'a c::bch_logged_op_truncate),
     logged_op_finsert(&'a c::bch_logged_op_finsert),
+    accounting(&'a c::bch_accounting),
 }
 
 impl<'a, 'b> BkeySC<'a> {
@@ -104,6 +105,7 @@ impl<'a, 'b> BkeySC<'a> {
             KEY_TYPE_snapshot_tree => snapshot_tree(transmute(self.v)),
             KEY_TYPE_logged_op_truncate => logged_op_truncate(transmute(self.v)),
             KEY_TYPE_logged_op_finsert => logged_op_finsert(transmute(self.v)),
+            KEY_TYPE_accounting => accounting(transmute(self.v)),
             KEY_TYPE_MAX => unreachable!(),
         }
     }
@@ -153,7 +153,7 @@ int cmd_dump(int argc, char *argv[])
 	if (IS_ERR(c))
 		die("error opening devices: %s", bch2_err_str(PTR_ERR(c)));
 
-	down_read(&c->gc_lock);
+	down_read(&c->state_lock);
 
 	for_each_online_member(c, ca)
 		nr_devices++;
@@ -176,7 +176,7 @@ int cmd_dump(int argc, char *argv[])
 		close(fd);
 	}
 
-	up_read(&c->gc_lock);
+	up_read(&c->state_lock);
 
 	bch2_fs_stop(c);
 	return 0;
@@ -236,6 +236,7 @@ int cmd_fsck(int argc, char *argv[])
 		kernel_probed = should_use_kernel_fsck(devs);
 
 	struct bch_opts opts = bch2_opts_empty();
+	struct printbuf parse_later = PRINTBUF;
 
 	if (kernel_probed) {
 		struct bch_ioctl_fsck_offline *fsck = calloc(sizeof(*fsck) +
@@ -259,7 +260,7 @@ int cmd_fsck(int argc, char *argv[])
 		ret = splice_fd_to_stdinout(fsck_fd);
 	} else {
 userland_fsck:
-		ret = bch2_parse_mount_opts(NULL, &opts, opts_str.buf);
+		ret = bch2_parse_mount_opts(NULL, &opts, &parse_later, opts_str.buf);
 		if (ret)
 			return ret;
 
@@ -126,7 +126,7 @@ static void update_inode(struct bch_fs *c,
 	bch2_inode_pack(&packed, inode);
 	packed.inode.k.p.snapshot = U32_MAX;
 	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
-				NULL, 0);
+				NULL, 0, 0);
 	if (ret)
 		die("error updating inode: %s", bch2_err_str(ret));
 }
@@ -340,7 +340,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 			die("error reserving space in new filesystem: %s",
 			    bch2_err_str(ret));
 
-		ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0);
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0, 0);
 		if (ret)
 			die("btree insert error %s", bch2_err_str(ret));
 
@@ -41,6 +41,17 @@ typedef struct {
 #define cmpxchg_acquire(p, old, new)	uatomic_cmpxchg(p, old, new)
 #define cmpxchg_release(p, old, new)	uatomic_cmpxchg(p, old, new)
 
+#define try_cmpxchg(p, _old, _new)				\
+({								\
+	typeof(*(_old)) _v = cmpxchg(p, *(_old), _new);		\
+	bool _ret = _v == *(_old);				\
+	*(_old) = _v;						\
+	_ret;							\
+})
+
+#define try_cmpxchg_acquire(p, _old, _new)			\
+	try_cmpxchg(p, _old, _new)
+
 #define smp_mb__before_atomic()	cmm_smp_mb__before_uatomic_add()
 #define smp_mb__after_atomic()	cmm_smp_mb__after_uatomic_add()
 #define smp_wmb()		cmm_smp_wmb()
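The try_cmpxchg() shim added above follows the kernel convention: on failure it writes the observed value back through the expected-value pointer, so retry loops need no explicit re-read. A small illustrative loop, assuming only the shim above (bump_counter() is hypothetical, not from this tree):

	static void bump_counter(unsigned long *p)
	{
		unsigned long old = *p;	/* racy read is fine */

		/* on failure, try_cmpxchg() refreshes @old with the current
		 * value, so the loop retries with the new expectation */
		while (!try_cmpxchg(p, &old, old + 1))
			;
	}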
@@ -65,6 +76,11 @@ typedef struct {
 #define xchg(p, v)		__atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
 #define xchg_acquire(p, v)	__atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
 
+#define try_cmpxchg(p, old, new)				\
+	__atomic_compare_exchange_n((p), __old, new, false,	\
+				    __ATOMIC_SEQ_CST,		\
+				    __ATOMIC_SEQ_CST)
+
 #define cmpxchg(p, old, new)					\
 ({								\
 	typeof(*(p)) __old = (old);				\
@@ -281,6 +297,11 @@ static inline i_type a_type##_cmpxchg(a_type##_t *v, i_type old, i_type new)\
 	return cmpxchg(&v->counter, old, new);			\
 }								\
 								\
+static inline bool a_type##_try_cmpxchg(a_type##_t *v, i_type *old, i_type new)\
+{								\
+	return try_cmpxchg(&v->counter, old, new);		\
+}								\
+								\
 static inline i_type a_type##_cmpxchg_acquire(a_type##_t *v, i_type old, i_type new)\
 {								\
 	return cmpxchg_acquire(&v->counter, old, new);		\
@@ -14,6 +14,7 @@
 #include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
+#include "disk_accounting.h"
 #include "ec.h"
 #include "error.h"
 #include "lru.h"
@@ -259,27 +260,41 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
 			 "invalid data type (got %u should be %u)",
 			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
 
+	unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
+		offsetof(struct bch_alloc_v4, stripe_sectors)
+		? a.v->stripe_sectors
+		: 0;
+
 	switch (a.v->data_type) {
 	case BCH_DATA_free:
 	case BCH_DATA_need_gc_gens:
 	case BCH_DATA_need_discard:
-		bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
+		bkey_fsck_err_on(stripe_sectors ||
+				 a.v->dirty_sectors ||
+				 a.v->cached_sectors ||
+				 a.v->stripe,
 				 c, err, alloc_key_empty_but_have_data,
-				 "empty data type free but have data");
+				 "empty data type free but have data %u.%u.%u %u",
+				 stripe_sectors,
+				 a.v->dirty_sectors,
+				 a.v->cached_sectors,
+				 a.v->stripe);
 		break;
 	case BCH_DATA_sb:
 	case BCH_DATA_journal:
 	case BCH_DATA_btree:
 	case BCH_DATA_user:
 	case BCH_DATA_parity:
-		bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
+		bkey_fsck_err_on(!a.v->dirty_sectors &&
+				 !stripe_sectors,
 				 c, err, alloc_key_dirty_sectors_0,
 				 "data_type %s but dirty_sectors==0",
 				 bch2_data_type_str(a.v->data_type));
 		break;
 	case BCH_DATA_cached:
 		bkey_fsck_err_on(!a.v->cached_sectors ||
-				 bch2_bucket_sectors_dirty(*a.v) ||
+				 a.v->dirty_sectors ||
+				 stripe_sectors ||
 				 a.v->stripe,
 				 c, err, alloc_key_cached_inconsistency,
 				 "data type inconsistency");
@@ -310,6 +325,7 @@ void bch2_alloc_v4_swab(struct bkey_s k)
 	a->stripe		= swab32(a->stripe);
 	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
 	a->fragmentation_lru	= swab64(a->fragmentation_lru);
+	a->stripe_sectors	= swab32(a->stripe_sectors);
 
 	bps = alloc_v4_backpointers(a);
 	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
@@ -334,6 +350,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 	prt_printf(out, "need_discard %llu\n",	BCH_ALLOC_V4_NEED_DISCARD(a));
 	prt_printf(out, "need_inc_gen %llu\n",	BCH_ALLOC_V4_NEED_INC_GEN(a));
 	prt_printf(out, "dirty_sectors %u\n",	a->dirty_sectors);
+	prt_printf(out, "stripe_sectors %u\n",	a->stripe_sectors);
 	prt_printf(out, "cached_sectors %u\n",	a->cached_sectors);
 	prt_printf(out, "stripe %u\n",		a->stripe);
 	prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
@@ -570,8 +587,6 @@ int bch2_alloc_read(struct bch_fs *c)
 	struct bch_dev *ca = NULL;
 	int ret;
 
-	down_read(&c->gc_lock);
-
 	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
 		ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
 					 BTREE_ITER_prefetch, k, ({
@@ -620,7 +635,6 @@ int bch2_alloc_read(struct bch_fs *c)
 
 	bch2_dev_put(ca);
 	bch2_trans_put(trans);
-	up_read(&c->gc_lock);
 
 	bch_err_fn(c, ret);
 	return ret;
@@ -735,6 +749,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
 	return ret;
 }
 
+static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
+						    enum bch_data_type data_type,
+						    s64 delta_buckets,
+						    s64 delta_sectors,
+						    s64 delta_fragmented, unsigned flags)
+{
+	struct disk_accounting_pos acc = {
+		.type = BCH_DISK_ACCOUNTING_dev_data_type,
+		.dev_data_type.dev = ca->dev_idx,
+		.dev_data_type.data_type = data_type,
+	};
+	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
+
+	return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+}
+
+int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
+				   const struct bch_alloc_v4 *old,
+				   const struct bch_alloc_v4 *new,
+				   unsigned flags)
+{
+	s64 old_sectors = bch2_bucket_sectors(*old);
+	s64 new_sectors = bch2_bucket_sectors(*new);
+	if (old->data_type != new->data_type) {
+		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+				 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
+			  bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
+				-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
+		if (ret)
+			return ret;
+	} else if (old_sectors != new_sectors) {
+		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+					 0,
+					 new_sectors - old_sectors,
+					 bch2_bucket_sectors_fragmented(ca, *new) -
+					 bch2_bucket_sectors_fragmented(ca, *old), flags);
+		if (ret)
+			return ret;
+	}
+
+	s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
+	s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
+	if (old_unstriped != new_unstriped) {
+		int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
+					 !!new_unstriped - !!old_unstriped,
+					 new_unstriped - old_unstriped,
+					 0,
+					 flags);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int bch2_trigger_alloc(struct btree_trans *trans,
 		       enum btree_id btree, unsigned level,
 		       struct bkey_s_c old, struct bkey_s new,
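The helpers just added show the disk-accounting-v2 idiom: a counter is addressed by a struct disk_accounting_pos key and updated with an array of s64 deltas. A compact sketch of the same idiom with made-up values (it mirrors bch2_dev_data_type_accounting_mod() above and is not a call site from this tree):

	/* "one more BCH_DATA_user bucket holding 512 sectors, 0 fragmented" */
	struct disk_accounting_pos acc = {
		.type			 = BCH_DISK_ACCOUNTING_dev_data_type,
		.dev_data_type.dev	 = ca->dev_idx,
		.dev_data_type.data_type = BCH_DATA_user,
	};
	s64 d[3] = { 1, 512, 0 };	/* buckets, sectors, fragmented */

	int ret = bch2_disk_accounting_mod(trans, &acc, d, 3, false);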
@@ -749,10 +818,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 
 	struct bch_alloc_v4 old_a_convert;
 	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
+	struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
 
 	if (flags & BTREE_TRIGGER_transactional) {
-		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
-
 		alloc_data_type_set(new_a, new_a->data_type);
 
 		if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
@@ -808,22 +876,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 				goto err;
 		}
 
-		/*
-		 * need to know if we're getting called from the invalidate path or
-		 * not:
-		 */
-
 		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
 		    old_a->cached_sectors) {
-			ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
-							      -((s64) old_a->cached_sectors));
+			ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
+							  -((s64) old_a->cached_sectors),
+							  flags & BTREE_TRIGGER_gc);
 			if (ret)
 				goto err;
 		}
+
+		ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
+		if (ret)
+			goto err;
 	}
 
 	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
-		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
 		u64 journal_seq = trans->journal_res.seq;
 		u64 bucket_journal_seq = new_a->journal_seq;
 
@@ -852,19 +919,16 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 					c->journal.flushed_seq_ondisk,
 					new.k->p.inode, new.k->p.offset,
 					bucket_journal_seq);
-			if (ret) {
-				bch2_fs_fatal_error(c,
-					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
+			if (bch2_fs_fatal_err_on(ret, c,
+					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
 				goto err;
-			}
 		}
 
-		percpu_down_read(&c->mark_lock);
-		if (new_a->gen != old_a->gen)
+		if (new_a->gen != old_a->gen) {
+			percpu_down_read(&c->mark_lock);
 			*bucket_gen(ca, new.k->p.offset) = new_a->gen;
-
-		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
-		percpu_up_read(&c->mark_lock);
+			percpu_up_read(&c->mark_lock);
+		}
 
 #define eval_state(_a, expr)	({ const struct bch_alloc_v4 *a = _a; expr; })
 #define statechange(expr)	!eval_state(old_a, expr) && eval_state(new_a, expr)
@@ -888,26 +952,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 			bch2_gc_gens_async(c);
 	}
 
-	if ((flags & BTREE_TRIGGER_gc) &&
-	    (flags & BTREE_TRIGGER_bucket_invalidate)) {
-		struct bch_alloc_v4 new_a_convert;
-		const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
-
-		percpu_down_read(&c->mark_lock);
+	if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
+		rcu_read_lock();
 		struct bucket *g = gc_bucket(ca, new.k->p.offset);
-
-		bucket_lock(g);
-
-		g->gen_valid		= 1;
-		g->gen			= new_a->gen;
-		g->data_type		= new_a->data_type;
-		g->stripe		= new_a->stripe;
-		g->stripe_redundancy	= new_a->stripe_redundancy;
-		g->dirty_sectors	= new_a->dirty_sectors;
-		g->cached_sectors	= new_a->cached_sectors;
-
-		bucket_unlock(g);
-		percpu_up_read(&c->mark_lock);
+		g->gen_valid	= 1;
+		g->gen		= new_a->gen;
+		rcu_read_unlock();
 	}
 err:
 	bch2_dev_put(ca);
@@ -1035,7 +1085,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 
 	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
 	if (fsck_err_on(!ca,
-			c, alloc_key_to_missing_dev_bucket,
+			trans, alloc_key_to_missing_dev_bucket,
 			"alloc key for invalid device:bucket %llu:%llu",
 			alloc_k.k->p.inode, alloc_k.k->p.offset))
 		ret = bch2_btree_delete_at(trans, alloc_iter, 0);
@@ -1055,7 +1105,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 		goto err;
 
 	if (fsck_err_on(k.k->type != discard_key_type,
-			c, need_discard_key_wrong,
+			trans, need_discard_key_wrong,
 			"incorrect key in need_discard btree (got %s should be %s)\n"
 			"  %s",
 			bch2_bkey_types[k.k->type],
@@ -1085,7 +1135,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 		goto err;
 
 	if (fsck_err_on(k.k->type != freespace_key_type,
-			c, freespace_key_wrong,
+			trans, freespace_key_wrong,
 			"incorrect key in freespace btree (got %s should be %s)\n"
 			"  %s",
 			bch2_bkey_types[k.k->type],
@@ -1116,7 +1166,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 		goto err;
 
 	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
-			c, bucket_gens_key_wrong,
+			trans, bucket_gens_key_wrong,
 			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
 			"  %s",
 			alloc_gen(k, gens_offset), a->gen,
@@ -1157,7 +1207,6 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
 				    struct bpos *end,
 				    struct btree_iter *freespace_iter)
 {
-	struct bch_fs *c = trans->c;
 	struct bkey_s_c k;
 	struct printbuf buf = PRINTBUF;
 	int ret;
@@ -1175,7 +1224,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
 	*end = bkey_min(k.k->p, *end);
 
 	if (fsck_err_on(k.k->type != KEY_TYPE_set,
-			c, freespace_hole_missing,
+			trans, freespace_hole_missing,
 			"hole in alloc btree missing in freespace btree\n"
 			"  device %llu buckets %llu-%llu",
 			freespace_iter->pos.inode,
@@ -1211,7 +1260,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
 				      struct bpos *end,
 				      struct btree_iter *bucket_gens_iter)
 {
-	struct bch_fs *c = trans->c;
 	struct bkey_s_c k;
 	struct printbuf buf = PRINTBUF;
 	unsigned i, gens_offset, gens_end_offset;
@@ -1235,7 +1283,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
 		bkey_reassemble(&g.k_i, k);
 
 		for (i = gens_offset; i < gens_end_offset; i++) {
-			if (fsck_err_on(g.v.gens[i], c,
+			if (fsck_err_on(g.v.gens[i], trans,
 					bucket_gens_hole_wrong,
 					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
 					bucket_gens_pos_to_alloc(k.k->p, i).inode,
@@ -1293,8 +1341,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
 	if (ret)
 		return ret;
 
-	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
-			need_discard_freespace_key_to_invalid_dev_bucket,
+	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
+			trans, need_discard_freespace_key_to_invalid_dev_bucket,
 			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
 			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
 		goto delete;
@@ -1303,8 +1351,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
 
 	if (fsck_err_on(a->data_type != state ||
 			(state == BCH_DATA_free &&
-			 genbits != alloc_freespace_genbits(*a)), c,
-			need_discard_freespace_key_bad,
+			 genbits != alloc_freespace_genbits(*a)),
+			trans, need_discard_freespace_key_bad,
 			"%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
 			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
 			bch2_btree_id_str(iter->btree_id),
@@ -1351,7 +1399,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 
 	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
 	if (!ca) {
-		if (fsck_err(c, bucket_gens_to_invalid_dev,
+		if (fsck_err(trans, bucket_gens_to_invalid_dev,
 			     "bucket_gens key for invalid device:\n  %s",
 			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
 			ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1359,8 +1407,8 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 	}
 
 	if (fsck_err_on(end <= ca->mi.first_bucket ||
-			start >= ca->mi.nbuckets, c,
-			bucket_gens_to_invalid_buckets,
+			start >= ca->mi.nbuckets,
+			trans, bucket_gens_to_invalid_buckets,
 			"bucket_gens key for invalid buckets:\n  %s",
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1368,16 +1416,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 	}
 
 	for (b = start; b < ca->mi.first_bucket; b++)
-		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
-				bucket_gens_nonzero_for_invalid_buckets,
+		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+				trans, bucket_gens_nonzero_for_invalid_buckets,
 				"bucket_gens key has nonzero gen for invalid bucket")) {
 			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
 			need_update = true;
 		}
 
 	for (b = ca->mi.nbuckets; b < end; b++)
-		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
-				bucket_gens_nonzero_for_invalid_buckets,
+		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+				trans, bucket_gens_nonzero_for_invalid_buckets,
 				"bucket_gens key has nonzero gen for invalid bucket")) {
 			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
 			need_update = true;
@@ -1549,8 +1597,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 	if (a->data_type != BCH_DATA_cached)
 		return 0;
 
-	if (fsck_err_on(!a->io_time[READ], c,
-			alloc_key_cached_but_read_time_zero,
+	if (fsck_err_on(!a->io_time[READ],
+			trans, alloc_key_cached_but_read_time_zero,
 			"cached bucket with read_time 0\n"
 			"  %s",
 		(printbuf_reset(&buf),
@@ -1578,8 +1626,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
-			alloc_key_to_missing_lru_entry,
+	if (fsck_err_on(lru_k.k->type != KEY_TYPE_set,
+			trans, alloc_key_to_missing_lru_entry,
 			"missing lru entry\n"
 			"  %s",
 			(printbuf_reset(&buf),
@@ -1616,7 +1664,7 @@ static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
 	mutex_lock(&c->discard_buckets_in_flight_lock);
 	darray_for_each(c->discard_buckets_in_flight, i)
 		if (bkey_eq(*i, bucket)) {
-			ret = -EEXIST;
+			ret = -BCH_ERR_EEXIST_discard_in_flight_add;
 			goto out;
 		}
 
@@ -1956,6 +2004,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 	a->v.gen++;
 	a->v.data_type		= 0;
 	a->v.dirty_sectors	= 0;
+	a->v.stripe_sectors	= 0;
 	a->v.cached_sectors	= 0;
 	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
 	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
@@ -41,6 +41,7 @@ static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
 {
 	dst->gen		= src.gen;
 	dst->data_type		= src.data_type;
+	dst->stripe_sectors	= src.stripe_sectors;
 	dst->dirty_sectors	= src.dirty_sectors;
 	dst->cached_sectors	= src.cached_sectors;
 	dst->stripe		= src.stripe;
@@ -50,6 +51,7 @@ static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket s
 {
 	dst->gen		= src.gen;
 	dst->data_type		= src.data_type;
+	dst->stripe_sectors	= src.stripe_sectors;
 	dst->dirty_sectors	= src.dirty_sectors;
 	dst->cached_sectors	= src.cached_sectors;
 	dst->stripe		= src.stripe;
@@ -80,30 +82,49 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
 		bucket_data_type(bucket) != bucket_data_type(ptr);
 }
 
-static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
 {
-	return a.dirty_sectors + a.cached_sectors;
+	return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
 }
 
-static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
 {
-	return a.dirty_sectors;
+	return a.stripe_sectors + a.dirty_sectors;
 }
 
-static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+	return a.data_type == BCH_DATA_cached
+		? a.cached_sectors
+		: bch2_bucket_sectors_dirty(a);
+}
+
+static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
 						 struct bch_alloc_v4 a)
 {
-	int d = bch2_bucket_sectors_dirty(a);
+	int d = bch2_bucket_sectors(a);
 
 	return d ? max(0, ca->mi.bucket_size - d) : 0;
 }
 
+static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
+{
+	int d = a.stripe_sectors + a.dirty_sectors;
+
+	return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
+static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
+{
+	return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
+}
+
 static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
 						 enum bch_data_type data_type)
 {
 	if (a.stripe)
 		return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
-	if (a.dirty_sectors)
+	if (bch2_bucket_sectors_dirty(a))
 		return data_type;
 	if (a.cached_sectors)
 		return BCH_DATA_cached;
@@ -264,6 +285,9 @@ static inline bool bkey_is_alloc(const struct bkey *k)
 
 int bch2_alloc_read(struct bch_fs *);
 
+int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
+				   const struct bch_alloc_v4 *,
+				   const struct bch_alloc_v4 *, unsigned);
 int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
 		       struct bkey_s_c, struct bkey_s,
 		       enum btree_iter_update_trigger_flags);
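To make the new helper semantics concrete: fragmentation is the unused tail of a non-empty bucket, and striped sectors now count as dirty. A worked example using the helpers from the hunk above (illustrative numbers, not from the tree):

	/*
	 * bucket_size = 1024, dirty_sectors = 100, stripe_sectors = 400,
	 * cached_sectors = 0, data_type = BCH_DATA_user:
	 *
	 *   bch2_bucket_sectors_dirty()      = 400 + 100  = 500
	 *   bch2_bucket_sectors()            = 500        (not cached)
	 *   bch2_bucket_sectors_fragmented() = 1024 - 500 = 524
	 *
	 * An empty bucket (d == 0) reports 0 fragmented sectors, not a whole
	 * bucket's worth - that is why the helpers test d before subtracting.
	 */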
@@ -70,6 +70,8 @@ struct bch_alloc_v4 {
 	__u32			stripe;
 	__u32			nr_external_backpointers;
 	__u64			fragmentation_lru;
+	__u32			stripe_sectors;
+	__u32			pad;
 } __packed __aligned(8);
 
 #define BCH_ALLOC_V4_U64s_V0	6
@@ -1705,15 +1705,13 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
 
 	printbuf_tabstop_push(out, 24);
 
-	percpu_down_read(&c->mark_lock);
-	prt_printf(out, "hidden\t%llu\n",		bch2_fs_usage_read_one(c, &c->usage_base->b.hidden));
-	prt_printf(out, "btree\t%llu\n",		bch2_fs_usage_read_one(c, &c->usage_base->b.btree));
-	prt_printf(out, "data\t%llu\n",			bch2_fs_usage_read_one(c, &c->usage_base->b.data));
-	prt_printf(out, "cached\t%llu\n",		bch2_fs_usage_read_one(c, &c->usage_base->b.cached));
-	prt_printf(out, "reserved\t%llu\n",		bch2_fs_usage_read_one(c, &c->usage_base->b.reserved));
-	prt_printf(out, "online_reserved\t%llu\n",	percpu_u64_get(c->online_reserved));
-	prt_printf(out, "nr_inodes\t%llu\n",		bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes));
-	percpu_up_read(&c->mark_lock);
+	prt_printf(out, "hidden\t%llu\n",		percpu_u64_get(&c->usage->hidden));
+	prt_printf(out, "btree\t%llu\n",		percpu_u64_get(&c->usage->btree));
+	prt_printf(out, "data\t%llu\n",			percpu_u64_get(&c->usage->data));
+	prt_printf(out, "cached\t%llu\n",		percpu_u64_get(&c->usage->cached));
+	prt_printf(out, "reserved\t%llu\n",		percpu_u64_get(&c->usage->reserved));
+	prt_printf(out, "online_reserved\t%llu\n",	percpu_u64_get(c->online_reserved));
+	prt_printf(out, "nr_inodes\t%llu\n",		percpu_u64_get(&c->usage->nr_inodes));
 
 	prt_newline(out);
 	prt_printf(out, "freelist_wait\t%s\n",		c->freelist_wait.list.first ? "waiting" : "empty");
@@ -395,7 +395,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
 
 	struct bpos bucket;
 	if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
-		if (fsck_err(c, backpointer_to_missing_device,
+		if (fsck_err(trans, backpointer_to_missing_device,
 			     "backpointer for missing device:\n%s",
 			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
 			ret = bch2_btree_delete_at(trans, bp_iter, 0);
@@ -407,8 +407,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
 	if (ret)
 		goto out;
 
-	if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
-			backpointer_to_missing_alloc,
+	if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4,
+			trans, backpointer_to_missing_alloc,
 			"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
 			alloc_iter.pos.inode, alloc_iter.pos.offset,
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -512,7 +512,7 @@ found:
 	struct nonce nonce = extent_nonce(extent.k->version, p.crc);
 	struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
 	if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
-			c, dup_backpointer_to_bad_csum_extent,
+			trans, dup_backpointer_to_bad_csum_extent,
 			"%s", buf.buf))
 		ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
 fsck_err:
@@ -671,7 +671,7 @@ missing:
 	prt_printf(&buf, "\n  want: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
 
-	if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+	if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
 		ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
 
 	goto out;
@@ -932,8 +932,8 @@ static int check_one_backpointer(struct btree_trans *trans,
 		goto out;
 	}
 
-	if (fsck_err_on(!k.k, c,
-			backpointer_to_missing_ptr,
+	if (fsck_err_on(!k.k,
+			trans, backpointer_to_missing_ptr,
 			"backpointer for missing %s\n  %s",
 			bp.v->level ? "btree node" : "extent",
 			(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
@@ -205,6 +205,7 @@
 #include <linux/zstd.h>
 
 #include "bcachefs_format.h"
+#include "disk_accounting_types.h"
 #include "errcode.h"
 #include "fifo.h"
 #include "nocow_locking_types.h"
@@ -266,6 +267,8 @@ do {									\
 
 #define bch2_fmt(_c, fmt)		bch2_log_msg(_c, fmt "\n")
 
+void bch2_print_str(struct bch_fs *, const char *);
+
 __printf(2, 3)
 void bch2_print_opts(struct bch_opts *, const char *, ...);
 
@@ -530,8 +533,8 @@ struct bch_dev {
 	/*
 	 * Buckets:
 	 * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
-	 * gc_lock, for device resize - holding any is sufficient for access:
-	 * Or rcu_read_lock(), but only for dev_ptr_stale():
+	 * gc_gens_lock, for device resize - holding any is sufficient for
+	 * access: Or rcu_read_lock(), but only for dev_ptr_stale():
	 */
 	struct bucket_array __rcu	*buckets_gc;
 	struct bucket_gens __rcu	*bucket_gens;
@@ -539,9 +542,7 @@ struct bch_dev {
 	unsigned long		*buckets_nouse;
 	struct rw_semaphore	bucket_lock;
 
-	struct bch_dev_usage		*usage_base;
-	struct bch_dev_usage __percpu	*usage[JOURNAL_BUF_NR];
-	struct bch_dev_usage __percpu	*usage_gc;
+	struct bch_dev_usage __percpu	*usage;
 
 	/* Allocator: */
 	u64			new_fs_bucket_idx;
@@ -581,6 +582,8 @@ struct bch_dev {
 #define BCH_FS_FLAGS()			\
 	x(new_fs)			\
 	x(started)			\
+	x(btree_running)		\
+	x(accounting_replay_done)	\
 	x(may_go_rw)			\
 	x(rw)				\
 	x(was_rw)			\
@@ -659,8 +662,6 @@ struct btree_trans_buf {
 	struct btree_trans	*trans;
 };
 
-#define REPLICAS_DELTA_LIST_MAX	(1U << 16)
-
 #define BCACHEFS_ROOT_SUBVOL_INUM					\
 	((subvol_inum) { BCACHEFS_ROOT_SUBVOL,	BCACHEFS_ROOT_INO })
 
@@ -730,15 +731,14 @@ struct bch_fs {
 
 	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];
 
+	struct bch_accounting_mem accounting[2];
+
 	struct bch_replicas_cpu replicas;
 	struct bch_replicas_cpu replicas_gc;
 	struct mutex		replicas_gc_lock;
-	mempool_t		replicas_delta_pool;
 
 	struct journal_entry_res btree_root_journal_res;
-	struct journal_entry_res replicas_journal_res;
 	struct journal_entry_res clock_journal_res;
-	struct journal_entry_res dev_usage_journal_res;
 
 	struct bch_disk_groups_cpu __rcu *disk_groups;
 
@@ -877,15 +877,9 @@ struct bch_fs {
 	struct percpu_rw_semaphore	mark_lock;
 
 	seqcount_t			usage_lock;
-	struct bch_fs_usage		*usage_base;
-	struct bch_fs_usage __percpu	*usage[JOURNAL_BUF_NR];
-	struct bch_fs_usage __percpu	*usage_gc;
+	struct bch_fs_usage_base __percpu *usage;
 	u64 __percpu		*online_reserved;
 
-	/* single element mempool: */
-	struct mutex		usage_scratch_lock;
-	struct bch_fs_usage_online *usage_scratch;
-
 	struct io_clock		io_clock[2];
 
 	/* JOURNAL SEQ BLACKLIST */
@@ -943,7 +937,6 @@ struct bch_fs {
 	 * The allocation code needs gc_mark in struct bucket to be correct, but
 	 * it's not while a gc is in progress.
	 */
-	struct rw_semaphore	gc_lock;
 	struct mutex		gc_gens_lock;
 
 	/* IO PATH */
@@ -417,7 +417,8 @@ static inline void bkey_init(struct bkey *k)
 	x(bucket_gens,		30)			\
 	x(snapshot_tree,	31)			\
 	x(logged_op_truncate,	32)			\
-	x(logged_op_finsert,	33)
+	x(logged_op_finsert,	33)			\
+	x(accounting,		34)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name	= nr,
@@ -502,6 +503,9 @@ struct bch_sb_field {
 	x(downgrade,			14)
 
 #include "alloc_background_format.h"
+#include "dirent_format.h"
+#include "disk_accounting_format.h"
+#include "disk_groups_format.h"
 #include "extents_format.h"
 #include "ec_format.h"
 #include "dirent_format.h"
@@ -599,48 +603,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
 LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
 LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);
 
-#define BCH_DATA_TYPES()		\
-	x(free,		0)		\
-	x(sb,		1)		\
-	x(journal,	2)		\
-	x(btree,	3)		\
-	x(user,		4)		\
-	x(cached,	5)		\
-	x(parity,	6)		\
-	x(stripe,	7)		\
-	x(need_gc_gens,	8)		\
-	x(need_discard,	9)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
-	BCH_DATA_TYPES()
-#undef x
-	BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
-	switch (type) {
-	case BCH_DATA_free:
-	case BCH_DATA_need_gc_gens:
-	case BCH_DATA_need_discard:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
-	switch (type) {
-	case BCH_DATA_sb:
-	case BCH_DATA_journal:
-		return true;
-	default:
-		return false;
-	}
-}
-
 /*
  * On clean shutdown, store btree roots and current journal sequence number in
  * the superblock:
@@ -719,7 +681,9 @@ struct bch_sb_field_ext {
 	x(member_seq,			BCH_VERSION(1,  4))		\
 	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
 	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
-	x(mi_btree_bitmap,		BCH_VERSION(1,  7))
+	x(mi_btree_bitmap,		BCH_VERSION(1,  7))		\
+	x(bucket_stripe_sectors,	BCH_VERSION(1,  8))		\
+	x(disk_accounting_v2,		BCH_VERSION(1,  9))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -1170,7 +1134,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e)
 	switch (e->type) {
 	case BCH_JSET_ENTRY_btree_keys:
 	case BCH_JSET_ENTRY_btree_root:
-	case BCH_JSET_ENTRY_overwrite:
 	case BCH_JSET_ENTRY_write_buffer_keys:
 		return true;
 	}
@@ -1371,7 +1334,9 @@ enum btree_id_flags {
 	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,	\
 	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))		\
 	x(subvolume_children,	19,	0,				\
-	  BIT_ULL(KEY_TYPE_set))
+	  BIT_ULL(KEY_TYPE_set))					\
+	x(accounting,		20,	BTREE_ID_SNAPSHOT_FIELD,	\
+	  BIT_ULL(KEY_TYPE_accounting))					\
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -5,6 +5,7 @@
 #include <linux/uuid.h>
 #include <asm/ioctl.h>
 #include "bcachefs_format.h"
+#include "bkey_types.h"
 
 /*
  * Flags common to multiple ioctls:
@@ -85,6 +86,7 @@ struct bch_ioctl_incremental {
 
 #define BCH_IOCTL_FSCK_OFFLINE	_IOW(0xbc, 19,  struct bch_ioctl_fsck_offline)
 #define BCH_IOCTL_FSCK_ONLINE	_IOW(0xbc, 20,  struct bch_ioctl_fsck_online)
+#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21,  struct bch_ioctl_query_accounting)
 
 /* ioctl below act on a particular file, not the filesystem as a whole: */
 
@@ -251,12 +253,18 @@ struct bch_replicas_usage {
 	struct bch_replicas_entry_v1 r;
 } __packed;
 
+static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
+{
+	return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
+}
+
 static inline struct bch_replicas_usage *
 replicas_usage_next(struct bch_replicas_usage *u)
 {
-	return (void *) u + replicas_entry_bytes(&u->r) + 8;
+	return (void *) u + replicas_usage_bytes(u);
 }
 
+/* Obsolete */
 /*
  * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
  *
@@ -282,6 +290,7 @@ struct bch_ioctl_fs_usage {
 	struct bch_replicas_usage replicas[];
 };
 
+/* Obsolete */
 /*
  * BCH_IOCTL_DEV_USAGE: query device disk space usage
 *
@@ -306,6 +315,7 @@ struct bch_ioctl_dev_usage {
 	} d[10];
 };
 
+/* Obsolete */
 struct bch_ioctl_dev_usage_v2 {
	__u64			dev;
	__u32			flags;
@@ -409,4 +419,28 @@ struct bch_ioctl_fsck_online {
 	__u64			opts;		/* string */
 };
 
+/*
+ * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_query_accounting {
+	__u64			capacity;
+	__u64			used;
+	__u64			online_reserved;
+
+	__u32			accounting_u64s; /* input parameter */
+	__u32			accounting_types_mask; /* input parameter */
+
+	struct bkey_i_accounting accounting[];
+};
+
 #endif /* _BCACHEFS_IOCTL_H */
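A hedged sketch of how userspace might drive the new BCH_IOCTL_QUERY_ACCOUNTING: size the trailing key buffer, then retry on overflow. The -ERANGE convention comes from the header comment above; the grow-and-retry loop, the starting size, and reading accounting_u64s as the buffer size in u64s are illustrative assumptions, not code from this tree:

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>

	static struct bch_ioctl_query_accounting *
	query_accounting(int fs_fd, unsigned types_mask)
	{
		unsigned u64s = 4096;	/* assumed starting size */
		struct bch_ioctl_query_accounting *a = NULL;

		do {
			a = realloc(a, sizeof(*a) + u64s * sizeof(__u64));
			if (!a)
				return NULL;
			memset(a, 0, sizeof(*a));
			a->accounting_u64s	 = u64s;
			a->accounting_types_mask = types_mask;
			u64s *= 2;	/* grow for the next attempt */
		} while (ioctl(fs_fd, BCH_IOCTL_QUERY_ACCOUNTING, a) < 0 &&
			 errno == ERANGE);

		return a;
	}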
@@ -7,6 +7,7 @@
 #include "btree_types.h"
 #include "alloc_background.h"
 #include "dirent.h"
+#include "disk_accounting.h"
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
@@ -601,8 +601,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
 	struct btree_cache *bc = &c->btree_cache;
 	struct task_struct *old;
 
-	old = cmpxchg(&bc->alloc_lock, NULL, current);
-	if (old == NULL || old == current)
+	old = NULL;
+	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
 		goto success;
 
 	if (!cl) {
@@ -613,8 +613,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
 	closure_wait(&bc->alloc_wait, cl);
 
 	/* Try again, after adding ourselves to waitlist */
-	old = cmpxchg(&bc->alloc_lock, NULL, current);
-	if (old == NULL || old == current) {
+	old = NULL;
+	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
 		/* We raced */
 		closure_wake_up(&bc->alloc_wait);
 		goto success;
@@ -20,6 +20,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "disk_accounting.h"
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
@@ -174,10 +175,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
 	return 0;
 }
 
-static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
 				       struct btree *prev, struct btree *cur,
 				       struct bpos *pulled_from_scan)
 {
+	struct bch_fs *c = trans->c;
 	struct bpos expected_start = !prev
 		? b->data->min_key
 		: bpos_successor(prev->key.k.p);
@@ -215,29 +217,29 @@ static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
 				*pulled_from_scan = cur->data->min_key;
 				ret = DID_FILL_FROM_SCAN;
 			} else {
-				if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+				if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
 						     "btree node with incorrect min_key%s", buf.buf))
 					ret = set_node_min(c, cur, expected_start);
 			}
 	} else {						/* overlap */
 		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
 			if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
-				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
 						     "btree node overwritten by next node%s", buf.buf))
 					ret = DROP_PREV_NODE;
 			} else {
-				if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+				if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
 						     "btree node with incorrect max_key%s", buf.buf))
 					ret = set_node_max(c, prev,
 							   bpos_predecessor(cur->data->min_key));
 			}
 		} else {
 			if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
-				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
 						     "btree node overwritten by prev node%s", buf.buf))
 					ret = DROP_THIS_NODE;
 			} else {
-				if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+				if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
 						     "btree node with incorrect min_key%s", buf.buf))
 					ret = set_node_min(c, cur, expected_start);
 			}
@@ -249,9 +251,10 @@ fsck_err:
 	return ret;
 }
 
-static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
 				 struct btree *child, struct bpos *pulled_from_scan)
 {
+	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
@@ -265,7 +268,7 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
 	prt_str(&buf, "\n  child: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
 
-	if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+	if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
 			     "btree node with incorrect max_key%s", buf.buf)) {
 		if (b->c.level == 1 &&
 		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
@@ -324,8 +327,8 @@ again:
 			printbuf_reset(&buf);
 			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
 
-			if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
-					btree_node_unreadable,
+			if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+					trans, btree_node_unreadable,
 					"Topology repair: unreadable btree node at btree %s level %u:\n"
 					"  %s",
 					bch2_btree_id_str(b->c.btree_id),
@@ -362,7 +365,7 @@ again:
 			continue;
 		}
 
-		ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+		ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
 		if (ret == DID_FILL_FROM_SCAN) {
 			new_pass = true;
 			ret = 0;
@@ -403,7 +406,7 @@ again:
 
 	if (!ret && !IS_ERR_OR_NULL(prev)) {
 		BUG_ON(cur);
-		ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+		ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
 		if (ret == DID_FILL_FROM_SCAN) {
 			new_pass = true;
 			ret = 0;
@@ -461,8 +464,8 @@ again:
 	printbuf_reset(&buf);
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 
-	if (mustfix_fsck_err_on(!have_child, c,
-			btree_node_topology_interior_node_empty,
+	if (mustfix_fsck_err_on(!have_child,
+			trans, btree_node_topology_interior_node_empty,
 			"empty interior btree node at btree %s level %u\n"
 			"  %s",
 			bch2_btree_id_str(b->c.btree_id),
@@ -509,7 +512,7 @@ reconstruct_root:
 			r->error = 0;
 
 			if (!bch2_btree_has_scanned_nodes(c, i)) {
-				mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+				mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
 						 "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
 				bch2_btree_root_alloc_fake_trans(trans, i, 0);
 			} else {
@@ -583,8 +586,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 		BUG_ON(bch2_journal_seq_verify &&
 		       k.k->version.lo > atomic64_read(&c->journal.seq));
 
-		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
-				bkey_version_in_future,
+		if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+				k.k->version.lo > atomic64_read(&c->key_version),
+				trans, bkey_version_in_future,
 				"key version number higher than recorded %llu\n  %s",
 				atomic64_read(&c->key_version),
 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@@ -592,7 +596,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 	}
 
 	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
-				c, btree_bitmap_not_marked,
+				trans, btree_bitmap_not_marked,
 				"btree ptr not marked in member info btree allocated bitmap\n  %s",
 				(printbuf_reset(&buf),
 				 bch2_bkey_val_to_text(&buf, c, k),
@@ -622,7 +626,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 	}
 
 	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
-			       BTREE_TRIGGER_gc|flags);
+			       BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
 out:
 fsck_err:
 	printbuf_exit(&buf);
|
||||
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
|
||||
unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
|
||||
int ret = 0;
|
||||
|
||||
/* We need to make sure every leaf node is readable before going RW */
|
||||
if (initial)
|
||||
target_depth = 0;
|
||||
|
||||
/* root */
|
||||
mutex_lock(&c->btree_root_lock);
|
||||
struct btree *b = bch2_btree_id_root(c, btree)->b;
|
||||
if (!btree_node_fake(b)) {
|
||||
gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
|
||||
ret = lockrestart_do(trans,
|
||||
bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
|
||||
NULL, NULL, bkey_i_to_s_c(&b->key), initial));
|
||||
level = b->c.level;
|
||||
}
|
||||
mutex_unlock(&c->btree_root_lock);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
for (; level >= target_depth; --level) {
|
||||
for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
|
||||
struct btree *prev = NULL;
|
||||
struct btree_iter iter;
|
||||
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
|
||||
@ -666,9 +655,21 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
|
||||
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
|
||||
}));
|
||||
if (ret)
|
||||
break;
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* root */
|
||||
mutex_lock(&c->btree_root_lock);
|
||||
struct btree *b = bch2_btree_id_root(c, btree)->b;
|
||||
if (!btree_node_fake(b)) {
|
||||
gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
|
||||
ret = lockrestart_do(trans,
|
||||
bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
|
||||
NULL, NULL, bkey_i_to_s_c(&b->key), initial));
|
||||
}
|
||||
mutex_unlock(&c->btree_root_lock);
|
||||
err:
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -697,7 +698,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
 		ret = bch2_gc_btree(trans, btree, true);
 
 		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
-					c, btree_node_read_error,
+					trans, btree_node_read_error,
 					"btree node read error for %s",
 					bch2_btree_id_str(btree)))
 			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
@@ -720,131 +721,25 @@ static int bch2_mark_superblocks(struct bch_fs *c)
 
 static void bch2_gc_free(struct bch_fs *c)
 {
+	bch2_accounting_free(&c->accounting[1]);
+
 	genradix_free(&c->reflink_gc_table);
 	genradix_free(&c->gc_stripes);
 
 	for_each_member_device(c, ca) {
 		kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
 		ca->buckets_gc = NULL;
-
-		free_percpu(ca->usage_gc);
-		ca->usage_gc = NULL;
 	}
-
-	free_percpu(c->usage_gc);
-	c->usage_gc = NULL;
 }
 
-static int bch2_gc_done(struct bch_fs *c)
-{
-	struct bch_dev *ca = NULL;
-	struct printbuf buf = PRINTBUF;
-	unsigned i;
-	int ret = 0;
-
-	percpu_down_write(&c->mark_lock);
-
-#define copy_field(_err, _f, _msg, ...)					\
-	if (fsck_err_on(dst->_f != src->_f, c, _err,			\
-			_msg ": got %llu, should be %llu" , ##__VA_ARGS__, \
-			dst->_f, src->_f))				\
-		dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...)				\
-	copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...)				\
-	copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
-
-	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-		bch2_fs_usage_acc_to_base(c, i);
-
-	__for_each_member_device(c, ca) {
-		struct bch_dev_usage *dst = ca->usage_base;
-		struct bch_dev_usage *src = (void *)
-			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
-					     dev_usage_u64s());
-
-		for (i = 0; i < BCH_DATA_NR; i++) {
-			copy_dev_field(dev_usage_buckets_wrong,
-				       d[i].buckets,	"%s buckets", bch2_data_type_str(i));
-			copy_dev_field(dev_usage_sectors_wrong,
-				       d[i].sectors,	"%s sectors", bch2_data_type_str(i));
-			copy_dev_field(dev_usage_fragmented_wrong,
-				       d[i].fragmented,	"%s fragmented", bch2_data_type_str(i));
-		}
-	}
-
-	{
-		unsigned nr = fs_usage_u64s(c);
-		struct bch_fs_usage *dst = c->usage_base;
-		struct bch_fs_usage *src = (void *)
-			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
-
-		copy_fs_field(fs_usage_hidden_wrong,
-			      b.hidden,		"hidden");
-		copy_fs_field(fs_usage_btree_wrong,
-			      b.btree,		"btree");
-
-		copy_fs_field(fs_usage_data_wrong,
-			      b.data,		"data");
-		copy_fs_field(fs_usage_cached_wrong,
-			      b.cached,		"cached");
-		copy_fs_field(fs_usage_reserved_wrong,
-			      b.reserved,	"reserved");
-		copy_fs_field(fs_usage_nr_inodes_wrong,
-			      b.nr_inodes,	"nr_inodes");
-
-		for (i = 0; i < BCH_REPLICAS_MAX; i++)
-			copy_fs_field(fs_usage_persistent_reserved_wrong,
-				      persistent_reserved[i],
-				      "persistent_reserved[%i]", i);
-
-		for (i = 0; i < c->replicas.nr; i++) {
-			struct bch_replicas_entry_v1 *e =
-				cpu_replicas_entry(&c->replicas, i);
-
-			printbuf_reset(&buf);
-			bch2_replicas_entry_to_text(&buf, e);
-
-			copy_fs_field(fs_usage_replicas_wrong,
-				      replicas[i], "%s", buf.buf);
-		}
-	}
-
-#undef copy_fs_field
-#undef copy_dev_field
-#undef copy_stripe_field
-#undef copy_field
-fsck_err:
-	bch2_dev_put(ca);
-	bch_err_fn(c, ret);
-	percpu_up_write(&c->mark_lock);
-	printbuf_exit(&buf);
-	return ret;
-}
-
 static int bch2_gc_start(struct bch_fs *c)
 {
-	BUG_ON(c->usage_gc);
-
-	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
-					 sizeof(u64), GFP_KERNEL);
-	if (!c->usage_gc) {
-		bch_err(c, "error allocating c->usage_gc");
-		return -BCH_ERR_ENOMEM_gc_start;
-	}
-
 	for_each_member_device(c, ca) {
-		BUG_ON(ca->usage_gc);
-
-		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
-		if (!ca->usage_gc) {
-			bch_err(c, "error allocating ca->usage_gc");
+		int ret = bch2_dev_usage_init(ca, true);
+		if (ret) {
 			bch2_dev_put(ca);
-			return -BCH_ERR_ENOMEM_gc_start;
+			return ret;
 		}
-
-		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
			       ca->mi.nbuckets - ca->mi.first_bucket);
 	}
 
 	return 0;
@@ -858,6 +753,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
 		l.oldest_gen != r.oldest_gen ||
 		l.data_type != r.data_type ||
 		l.dirty_sectors != r.dirty_sectors ||
+		l.stripe_sectors != r.stripe_sectors ||
 		l.cached_sectors != r.cached_sectors ||
 		l.stripe_redundancy != r.stripe_redundancy ||
 		l.stripe != r.stripe;
@@ -888,6 +784,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 		gc.data_type = old->data_type;
 		gc.dirty_sectors = old->dirty_sectors;
 	}
+	percpu_up_read(&c->mark_lock);
 
 	/*
 	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
@@ -896,12 +793,14 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	alloc_data_type_set(&gc, gc.data_type);
 
 	if (gc.data_type != old_gc.data_type ||
-	    gc.dirty_sectors != old_gc.dirty_sectors)
-		bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
-	percpu_up_read(&c->mark_lock);
+	    gc.dirty_sectors != old_gc.dirty_sectors) {
+		ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
+		if (ret)
+			return ret;
+	}
 
-	if (fsck_err_on(new.data_type != gc.data_type, c,
-			alloc_key_data_type_wrong,
+	if (fsck_err_on(new.data_type != gc.data_type,
+			trans, alloc_key_data_type_wrong,
 			"bucket %llu:%llu gen %u has wrong data_type"
 			": got %s, should be %s",
 			iter->pos.inode, iter->pos.offset,
@@ -911,7 +810,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 		new.data_type = gc.data_type;
 
 #define copy_bucket_field(_errtype, _f)					\
-	if (fsck_err_on(new._f != gc._f, c, _errtype,			\
+	if (fsck_err_on(new._f != gc._f,				\
+			trans, _errtype,				\
 			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
 			": got %u, should be %u",			\
 			iter->pos.inode, iter->pos.offset,		\
@@ -924,6 +824,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 			  gen);
 	copy_bucket_field(alloc_key_dirty_sectors_wrong,
 			  dirty_sectors);
+	copy_bucket_field(alloc_key_stripe_sectors_wrong,
+			  stripe_sectors);
 	copy_bucket_field(alloc_key_cached_sectors_wrong,
 			  cached_sectors);
 	copy_bucket_field(alloc_key_stripe_wrong,
@@ -978,14 +880,16 @@ static int bch2_gc_alloc_done(struct bch_fs *c)
 
 static int bch2_gc_alloc_start(struct bch_fs *c)
 {
+	int ret = 0;
+
 	for_each_member_device(c, ca) {
 		struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
 					ca->mi.nbuckets * sizeof(struct bucket),
 					GFP_KERNEL|__GFP_ZERO);
 		if (!buckets) {
 			bch2_dev_put(ca);
 			bch_err(c, "error allocating ca->buckets[gc]");
-			return -BCH_ERR_ENOMEM_gc_alloc_start;
+			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+			break;
 		}
 
 		buckets->first_bucket	= ca->mi.first_bucket;
@@ -993,25 +897,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
 		rcu_assign_pointer(ca->buckets_gc, buckets);
 	}
 
-	struct bch_dev *ca = NULL;
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-			if (!ca) {
-				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
-				continue;
-			}
-
-			struct bch_alloc_v4 a_convert;
-			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
-			struct bucket *g = gc_bucket(ca, k.k->p.offset);
-			g->gen_valid	= 1;
-			g->gen		= a->gen;
-			0;
-		})));
-	bch2_dev_put(ca);
 	bch_err_fn(c, ret);
 	return ret;
 }
@ -1041,8 +926,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
|
||||
reflink_v_refcount_wrong,
|
||||
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
|
||||
trans, reflink_v_refcount_wrong,
|
||||
"reflink key has wrong refcount:\n"
|
||||
" %s\n"
|
||||
" should be %u",
|
||||
@ -1140,7 +1025,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
|
||||
if (bad)
|
||||
bch2_bkey_val_to_text(&buf, c, k);
|
||||
|
||||
if (fsck_err_on(bad, c, stripe_sector_count_wrong,
|
||||
if (fsck_err_on(bad,
|
||||
trans, stripe_sector_count_wrong,
|
||||
"%s", buf.buf)) {
|
||||
struct bkey_i_stripe *new;
|
||||
|
||||
@ -1199,8 +1085,6 @@ int bch2_check_allocations(struct bch_fs *c)
|
||||
|
||||
lockdep_assert_held(&c->state_lock);
|
||||
|
||||
down_write(&c->gc_lock);
|
||||
|
||||
bch2_btree_interior_updates_flush(c);
|
||||
|
||||
ret = bch2_gc_start(c) ?:
|
||||
@ -1212,7 +1096,9 @@ int bch2_check_allocations(struct bch_fs *c)
|
||||
gc_pos_set(c, gc_phase(GC_PHASE_start));
|
||||
|
||||
ret = bch2_mark_superblocks(c);
|
||||
BUG_ON(ret);
|
||||
bch_err_msg(c, ret, "marking superblocks");
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = bch2_gc_btrees(c);
|
||||
if (ret)
|
||||
@ -1220,15 +1106,11 @@ int bch2_check_allocations(struct bch_fs *c)
|
||||
|
||||
c->gc_count++;
|
||||
|
||||
bch2_journal_block(&c->journal);
|
||||
out:
|
||||
ret = bch2_gc_alloc_done(c) ?:
|
||||
bch2_gc_done(c) ?:
|
||||
bch2_accounting_gc_done(c) ?:
|
||||
bch2_gc_stripes_done(c) ?:
|
||||
bch2_gc_reflink_done(c);
|
||||
|
||||
bch2_journal_unblock(&c->journal);
|
||||
|
||||
out:
|
||||
percpu_down_write(&c->mark_lock);
|
||||
/* Indicates that gc is no longer in progress: */
|
||||
__gc_pos_set(c, gc_phase(GC_PHASE_not_running));
|
||||
@ -1236,13 +1118,6 @@ out:
|
||||
bch2_gc_free(c);
|
||||
percpu_up_write(&c->mark_lock);
|
||||
|
||||
up_write(&c->gc_lock);
|
||||
|
||||
/*
|
||||
* At startup, allocations can happen directly instead of via the
|
||||
* allocator thread - issue wakeup in case they blocked on gc_lock:
|
||||
*/
|
||||
closure_wake_up(&c->freelist_wait);
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
}
|
||||
@ -1323,7 +1198,7 @@ int bch2_gc_gens(struct bch_fs *c)
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Ideally we would be using state_lock and not gc_lock here, but that
|
||||
* Ideally we would be using state_lock and not gc_gens_lock here, but that
|
||||
* introduces a deadlock in the RO path - we currently take the state
|
||||
* lock at the start of going RO, thus the gc thread may get stuck:
|
||||
*/
|
||||
@ -1331,7 +1206,8 @@ int bch2_gc_gens(struct bch_fs *c)
|
||||
return 0;
|
||||
|
||||
trace_and_count(c, gc_gens_start, c);
|
||||
down_read(&c->gc_lock);
|
||||
|
||||
down_read(&c->state_lock);
|
||||
|
||||
for_each_member_device(c, ca) {
|
||||
struct bucket_gens *gens = bucket_gens(ca);
|
||||
@ -1400,7 +1276,7 @@ err:
|
||||
ca->oldest_gen = NULL;
|
||||
}
|
||||
|
||||
up_read(&c->gc_lock);
|
||||
up_read(&c->state_lock);
|
||||
mutex_unlock(&c->gc_gens_lock);
|
||||
if (!bch2_err_matches(ret, EROFS))
|
||||
bch_err_fn(c, ret);
|
||||

@@ -58,6 +58,8 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)

static inline int gc_btree_order(enum btree_id btree)
{
if (btree == BTREE_ID_alloc)
return -2;
if (btree == BTREE_ID_stripes)
return -1;
return btree;
@@ -65,11 +67,11 @@ static inline int gc_btree_order(enum btree_id btree)

static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
return cmp_int(l.phase, r.phase) ?:
cmp_int(gc_btree_order(l.btree),
gc_btree_order(r.btree)) ?:
-cmp_int(l.level, r.level) ?:
bpos_cmp(l.pos, r.pos);
return cmp_int(l.phase, r.phase) ?:
cmp_int(gc_btree_order(l.btree),
gc_btree_order(r.btree)) ?:
cmp_int(l.level, r.level) ?:
bpos_cmp(l.pos, r.pos);
}

static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
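Note: the gc_pos_cmp() hunk above drops the negation on the level comparison, so GC positions at the same phase and btree now sort in ascending level order instead of descending. A minimal standalone sketch of how the ?: chain composes (cmp_int stubbed with the usual (a > b) - (a < b) idiom; the ?: with the middle operand omitted is the GNU extension the kernel uses):

#include <stdio.h>

#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

struct pos { int phase, btree, level; };

static int pos_cmp(struct pos l, struct pos r)
{
	/* first nonzero comparison decides, like gc_pos_cmp() above */
	return cmp_int(l.phase, r.phase) ?:
	       cmp_int(l.btree, r.btree) ?:
	       cmp_int(l.level, r.level);
}

int main(void)
{
	struct pos a = { 1, 0, 0 }, b = { 1, 0, 2 };

	printf("%d\n", pos_cmp(a, b)); /* -1: lower level sorts first now */
	return 0;
}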
@@ -534,7 +534,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
printbuf_indent_add(out, 2);

prt_printf(out, "\nnode offset %u/%u",
b->written, btree_ptr_sectors_written(&b->key));
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
if (k)
@@ -585,7 +585,7 @@ static int __btree_err(int ret,
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret = !silent
? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf)
? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
: -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
@@ -689,6 +689,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
int write, bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
@@ -732,11 +733,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");

if (btree_err_on(offset + sectors > btree_sectors(c),
if (!write &&
btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node")) {
"bset past end of btree node (offset %u len %u but written %zu)",
offset, sectors, ptr_written ?: btree_sectors(c))) {
i->u64s = 0;
ret = 0;
goto out;
@@ -1002,7 +1005,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
u64 start_time = local_clock();
@@ -1796,15 +1799,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
unsigned long old, new;

old = READ_ONCE(b->will_make_reachable);
do {
old = new = v;
new = old;
if (!(old & 1))
break;

new &= ~1UL;
} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
} while (!try_cmpxchg(&b->will_make_reachable, &old, new));

if (old & 1)
closure_put(&((struct btree_update *) new)->cl);
@@ -1815,14 +1819,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new, v;
unsigned long old, new;
unsigned type = 0;

bch2_btree_complete_write(c, b, w);

v = READ_ONCE(b->flags);
old = READ_ONCE(b->flags);
do {
old = new = v;
new = old;

if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
@@ -1842,7 +1846,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
new &= ~(1U << BTREE_NODE_write_in_flight);
new &= ~(1U << BTREE_NODE_write_in_flight_inner);
}
} while ((v = cmpxchg(&b->flags, old, new)) != old);
} while (!try_cmpxchg(&b->flags, &old, new));

if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
@@ -2014,8 +2018,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
* dirty bit requires a write lock, we can't race with other threads
* redirtying it:
*/
old = READ_ONCE(b->flags);
do {
old = new = READ_ONCE(b->flags);
new = old;

if (!(old & (1 << BTREE_NODE_dirty)))
return;
@@ -2046,7 +2051,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
} while (!try_cmpxchg_acquire(&b->flags, &old, new));

if (new & (1U << BTREE_NODE_need_write))
return;
@@ -2133,7 +2138,7 @@ do_write:

if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);

memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
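Note: the loops in this file are converted from the cmpxchg()-and-reload pattern to try_cmpxchg(), which writes the observed value back into old on failure, so the loop body no longer juggles a third variable v. A hedged userspace approximation with C11 atomics (the kernel primitives are spelled differently but have the same shape):

#include <stdatomic.h>

static void set_flag(_Atomic unsigned long *flags, unsigned long bit)
{
	unsigned long old = atomic_load_explicit(flags, memory_order_relaxed);
	unsigned long new;

	do {
		new = old | bit;
		/* on failure, 'old' is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}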
@@ -27,10 +27,10 @@ static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b
atomic_dec(&c->btree_cache.dirty);
}

static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
{
return k->k.type == KEY_TYPE_btree_ptr_v2
? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
return k.k->type == KEY_TYPE_btree_ptr_v2
? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
: 0;
}
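Note: with btree_ptr_sectors_written() now taking a const key view, call sites holding a bkey_i convert explicitly, as the hunks above show; the pattern is simply:

	struct bkey_s_c k = bkey_i_to_s_c(&b->key); /* mutable -> const view */
	unsigned ptr_written = btree_ptr_sectors_written(k);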
@@ -1002,6 +1002,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
trans->locked = true;
trans->last_unlock_ip = 0;

if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1470,7 +1471,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
struct printbuf buf = PRINTBUF;

bch2_trans_updates_to_text(&buf, trans);
bch2_print_string_as_lines(KERN_ERR, buf.buf);
bch2_print_str(trans->c, buf.buf);
printbuf_exit(&buf);
}

@@ -1562,7 +1563,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);

bch2_print_string_as_lines(KERN_ERR, buf.buf);
bch2_print_str(trans->c, buf.buf);
printbuf_exit(&buf);
}

@@ -3095,6 +3096,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)

trans->last_begin_ip = _RET_IP_;
trans->locked = true;
trans->last_unlock_ip = 0;

if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
@@ -3249,15 +3251,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}

if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
mempool_free(trans->fs_usage_deltas,
&c->replicas_delta_pool);
else
kfree(trans->fs_usage_deltas);
}

if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);

@@ -3288,6 +3281,20 @@ void bch2_trans_put(struct btree_trans *trans)
}
}

bool bch2_current_has_btree_trans(struct bch_fs *c)
{
seqmutex_lock(&c->btree_trans_lock);
struct btree_trans *trans;
bool ret = false;
list_for_each_entry(trans, &c->btree_trans_list, list)
if (trans->locking_wait.task == current) {
ret = true;
break;
}
seqmutex_unlock(&c->btree_trans_lock);
return ret;
}

static void __maybe_unused
bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
@@ -866,6 +866,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_p; \
})

#define bch2_trans_run(_c, _do) \
({ \
struct btree_trans *trans = bch2_trans_get(_c); \
int _ret = (_do); \
bch2_trans_put(trans); \
_ret; \
})

void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
@@ -875,6 +883,8 @@ void bch2_dump_trans_paths_updates(struct btree_trans *);
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);

bool bch2_current_has_btree_trans(struct bch_fs *);

extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
unsigned bch2_trans_get_fn_idx(const char *);
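Note: bch2_trans_run() moves into this header from btree_update.h; it allocates a transaction, evaluates _do with trans in scope, and releases the transaction. A hedged usage sketch (the wrapped call is illustrative, using the bch2_btree_delete_range_trans() declaration from this series):

static int example_drop_xattrs(struct bch_fs *c, struct bpos start, struct bpos end)
{
	/* 'trans' is declared by the macro and visible to the expression */
	return bch2_trans_run(c,
		bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
					      start, end, 0, NULL));
}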
@@ -16,21 +16,6 @@
* operations for the regular btree iter code to use:
*/

static int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
const struct journal_key *r)
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
size_t gap_size = keys->size - keys->nr;
@@ -548,7 +533,13 @@ static void __journal_keys_sort(struct journal_keys *keys)
struct journal_key *dst = keys->data;

darray_for_each(*keys, src) {
if (src + 1 < &darray_top(*keys) &&
/*
* We don't accumulate accounting keys here because we have to
* compare each individual accounting key against the version in
* the btree during replay:
*/
if (src->k->k.type != KEY_TYPE_accounting &&
src + 1 < &darray_top(*keys) &&
!journal_key_cmp(src, src + 1))
continue;
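Note: the sort hunk above exempts accounting keys from last-wins deduplication, since they are deltas that replay must compare against the btree one by one. The keep/drop decision in isolation (journal_key_cmp() as declared in the header below):

static bool keep_journal_key(struct journal_key *src, struct journal_key *top)
{
	if (src->k->k.type == KEY_TYPE_accounting)
		return true; /* deltas: every version is kept */

	/* otherwise last-wins: keep only the final key at this position */
	return src + 1 >= top || journal_key_cmp(src, src + 1);
}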
@@ -26,6 +26,21 @@ struct btree_and_journal_iter {
bool prefetch;
};

static inline int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
const struct journal_key *r)
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p));
}

static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
@@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)

if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
buf.atomic++;

prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));

@@ -792,6 +793,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
}

trans->locked = true;
trans->last_unlock_ip = 0;
out:
bch2_trans_verify_locks(trans);
return 0;
@@ -10,6 +10,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
@@ -228,14 +229,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
struct btree_trans *trans = bch2_trans_get(c);
unsigned long old, new, v;
unsigned long old, new;
unsigned idx = w - b->writes;

btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
v = READ_ONCE(b->flags);

old = READ_ONCE(b->flags);
do {
old = new = v;
new = old;

if (!(old & (1 << BTREE_NODE_dirty)) ||
!!(old & (1 << BTREE_NODE_write_idx)) != idx ||
@@ -245,7 +246,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new &= ~BTREE_WRITE_TYPE_MASK;
new |= BTREE_WRITE_journal_reclaim;
new |= 1 << BTREE_NODE_need_write;
} while ((v = cmpxchg(&b->flags, old, new)) != old);
} while (!try_cmpxchg(&b->flags, &old, new));

btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
@@ -456,34 +457,36 @@ static int run_one_mem_trigger(struct btree_trans *trans,
struct btree_insert_entry *i,
unsigned flags)
{
struct bkey_s_c old = { &i->old_k, i->old_v };
struct bkey_i *new = i->k;
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
int ret;

verify_update_old_key(trans, i);

if (unlikely(flags & BTREE_TRIGGER_norun))
return 0;

if (old_ops->trigger == new_ops->trigger) {
ret = bch2_key_trigger(trans, i->btree_id, i->level,
struct bkey_s_c old = { &i->old_k, i->old_v };
struct bkey_i *new = i->k;
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);

if (old_ops->trigger == new_ops->trigger)
return bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
} else {
ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
else
return bch2_key_trigger_new(trans, i->btree_id, i->level,
bkey_i_to_s(new), flags) ?:
bch2_key_trigger_old(trans, i->btree_id, i->level,
bch2_key_trigger_old(trans, i->btree_id, i->level,
old, flags);
}

return ret;
}

static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
bool overwrite)
{
verify_update_old_key(trans, i);

if ((i->flags & BTREE_TRIGGER_norun) ||
!btree_node_type_has_trans_triggers(i->bkey_type))
return 0;

/*
* Transactional triggers create new btree_insert_entries, so we can't
* pass them a pointer to a btree_insert_entry, that memory is going to
@@ -495,12 +498,6 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
unsigned flags = i->flags|BTREE_TRIGGER_transactional;

verify_update_old_key(trans, i);

if ((i->flags & BTREE_TRIGGER_norun) ||
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
return 0;

if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
old_ops->trigger == new_ops->trigger) {
@@ -523,10 +520,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
unsigned btree_id_start)
{
bool trans_trigger_run;
int ret, overwrite;

for (overwrite = 1; overwrite >= 0; --overwrite) {
for (int overwrite = 1; overwrite >= 0; --overwrite) {
bool trans_trigger_run;

/*
* Running triggers will append more updates to the list of updates as
@@ -541,7 +536,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
if (trans->updates[i].btree_id != btree_id)
continue;

ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
if (ret < 0)
return ret;
if (ret)
@@ -594,7 +589,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
btree_node_type_has_trans_triggers(i->bkey_type) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
return 0;
@@ -602,24 +597,25 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)

static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
* XXX: synchronization of interior node updates with gc
*/
BUG_ON(i->cached || i->level);

if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
trans_for_each_update(trans, i)
if (btree_node_type_has_triggers(i->bkey_type) &&
gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
if (ret)
return ret;
}
}

return 0;
}

static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
{
return (struct bversion) {
.hi = res->seq >> 32,
.lo = (res->seq << 32) | (res->offset + offset),
};
}
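Note: journal_pos_to_bversion() above derives a unique, monotonically increasing key version from the journal reservation: the upper 32 bits of the sequence number become hi, and the low 32 bits of seq concatenated with the entry's u64 offset become lo. A quick standalone check of the packing (local struct; assumes offsets stay below 2^32):

#include <stdint.h>
#include <assert.h>

struct version { uint32_t hi; uint64_t lo; };

static struct version pack(uint64_t seq, uint64_t offset)
{
	return (struct version) {
		.hi = seq >> 32,
		.lo = (seq << 32) | offset,
	};
}

int main(void)
{
	struct version a = pack(7, 10), b = pack(7, 11);

	assert(a.hi == b.hi && a.lo < b.lo); /* ordering follows the offset */
	return 0;
}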

static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct btree_insert_entry **stopped_at,
@@ -628,7 +624,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct bch_fs *c = trans->c;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
int ret;
int ret = 0;

bch2_trans_verify_not_unlocked(trans);
bch2_trans_verify_not_in_restart(trans);
@@ -693,23 +689,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
i->k->k.version = MAX_VERSION;
}

if (trans->fs_usage_deltas &&
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;

/* XXX: we only want to run this if deltas are nonzero */
bch2_trans_account_disk_usage_change(trans);

h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
if (ret)
goto revert_fs_usage;
return ret;
h = h->next;
}

struct jset_entry *entry = trans->journal_entries;

if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
percpu_down_read(&c->mark_lock);

for (entry = trans->journal_entries;
entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
entry = vstruct_next(entry))
if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) {
struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);

a->k.version = journal_pos_to_bversion(&trans->journal_res,
(u64 *) entry - (u64 *) trans->journal_entries);
BUG_ON(bversion_zero(a->k.version));
ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false);
if (ret)
goto revert_fs_usage;
}
percpu_up_read(&c->mark_lock);

/* XXX: we only want to run this if deltas are nonzero */
bch2_trans_account_disk_usage_change(trans);
}

trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
if (ret)
goto fatal_err;
@@ -776,17 +789,34 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,

return 0;
fatal_err:
bch2_fatal_error(c);
bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
percpu_down_read(&c->mark_lock);
revert_fs_usage:
if (trans->fs_usage_deltas)
bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
for (struct jset_entry *entry2 = trans->journal_entries;
entry2 != entry;
entry2 = vstruct_next(entry2))
if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) {
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);

bch2_accounting_neg(a);
bch2_accounting_mem_mod_locked(trans, a.c, false);
bch2_accounting_neg(a);
}
percpu_up_read(&c->mark_lock);
return ret;
}

static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
/*
* Accounting keys aren't deduped in the journal: we have to compare
* each individual update against what's in the btree to see if it has
* been applied yet, and accounting updates also don't overwrite,
* they're deltas that accumulate.
*/
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
if (i->k->k.type != KEY_TYPE_accounting)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}

static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
@@ -922,7 +952,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
break;
case -BCH_ERR_btree_insert_need_mark_replicas:
ret = drop_locks_do(trans,
bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
bch2_accounting_update_sb(trans));
break;
case -BCH_ERR_journal_res_get_blocked:
/*
@@ -993,15 +1023,24 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
int ret = 0;

trans_for_each_update(trans, i) {
ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
break;
return ret;
}

return ret;
for (struct jset_entry *i = trans->journal_entries;
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
i = vstruct_next(i))
if (i->type == BCH_JSET_ENTRY_btree_keys ||
i->type == BCH_JSET_ENTRY_write_buffer_keys) {
int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
if (ret)
return ret;
}

return 0;
}

int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
@@ -1017,8 +1056,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->journal_entries_u64s)
goto out_reset;

memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));

ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
@@ -1115,6 +1152,7 @@ retry:
bch2_trans_verify_not_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));

ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
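Note: the journal-replay commit path above walks trans->journal_entries as a variable-length sequence: each jset_entry encodes its own size in u64s and vstruct_next() advances past it. A hedged sketch of the traversal idiom with simplified types (the real helpers live in vstructs.h and the header layout differs in detail):

#include <stdint.h>

/* stand-in: one u64 of header followed by u64s payload words */
struct entry { uint16_t u64s; uint8_t type; uint8_t pad[5]; uint64_t data[]; };

static struct entry *entry_next(struct entry *e)
{
	return (struct entry *) ((uint64_t *) e + 1 + e->u64s);
}

static unsigned count_type(struct entry *start, unsigned total_u64s, uint8_t type)
{
	struct entry *end = (struct entry *) ((uint64_t *) start + total_u64s);
	unsigned nr = 0;

	for (struct entry *e = start; e < end; e = entry_next(e))
		nr += e->type == type; /* filter, as the replay path does */
	return nr;
}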
@@ -522,7 +522,6 @@ struct btree_trans {

unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;

/* Entries before this are zeroed out on every bch2_trans_get() call */

@@ -754,20 +753,30 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)

static inline bool btree_node_type_needs_gc(enum btree_node_type type)
static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
{
return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
}

static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
{
return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
}

static inline bool btree_node_type_has_triggers(enum btree_node_type type)
{
return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
}

static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
const unsigned mask = 0
const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS()
#undef x
;

return (1U << type) & mask;
return BIT_ULL(type) & mask;
}

static inline bool btree_id_is_extents(enum btree_id btree)
@@ -777,35 +786,35 @@ static inline bool btree_id_is_extents(enum btree_id btree)

static inline bool btree_type_has_snapshots(enum btree_id id)
{
const unsigned mask = 0
const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
BCH_BTREE_IDS()
#undef x
;

return (1U << id) & mask;
return BIT_ULL(id) & mask;
}

static inline bool btree_type_has_snapshot_field(enum btree_id id)
{
const unsigned mask = 0
const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
BCH_BTREE_IDS()
#undef x
;

return (1U << id) & mask;
return BIT_ULL(id) & mask;
}

static inline bool btree_type_has_ptrs(enum btree_id id)
{
const unsigned mask = 0
const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
BCH_BTREE_IDS()
#undef x
;

return (1U << id) & mask;
return BIT_ULL(id) & mask;
}

struct btree_root {
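Note: the mask widenings above are needed because a 32-bit mask built with 1U << nr becomes undefined once an ID reaches bit 31 - and the extents mask even shifts by nr + 1. With the accounting btree pushing the ID count upward, BIT_ULL keeps the idiom safe. A minimal illustration:

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(nr) (1ULL << (nr))

int main(void)
{
	unsigned id = 32;

	/* 1U << 32 would be undefined behaviour; a 64-bit shift is fine */
	uint64_t bit = BIT_ULL(id);

	printf("%llx\n", (unsigned long long) bit); /* 100000000 */
	return 0;
}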
@@ -656,14 +656,16 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
* @disk_res: must be non-NULL whenever inserting or potentially
* splitting data extents
* @flags: transaction commit flags
* @iter_flags: btree iter update trigger flags
*
* Returns: 0 on success, error code on failure
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
struct disk_reservation *disk_res, int flags)
struct disk_reservation *disk_res, int flags,
enum btree_iter_update_trigger_flags iter_flags)
{
return bch2_trans_do(c, disk_res, NULL, flags,
bch2_btree_insert_trans(trans, id, k, 0));
bch2_btree_insert_trans(trans, id, k, iter_flags));
}

int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
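Note: per the commit subject, bch2_btree_insert() now takes btree iter flags. A hedged call-site sketch (key preparation elided; call sites that need no trigger flags pass 0):

	ret = bch2_btree_insert(c, BTREE_ID_inodes, &new_inode->k_i,
				NULL, /* no disk reservation */
				0,    /* commit flags */
				0);   /* iter/trigger flags */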
@@ -29,6 +29,7 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
"pin journal entry referred to by trans->journal_res.seq") \
x(journal_reclaim, "operation required for journal reclaim; may return error" \
"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")

enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
@@ -56,8 +57,9 @@ int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,

int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
enum btree_iter_update_trigger_flags);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, int flags);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
disk_reservation *, int flags, enum
btree_iter_update_trigger_flags iter_flags);

int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
@@ -130,7 +132,19 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
enum btree_id btree,
struct bkey_i *k)
{
if (unlikely(trans->journal_replay_not_finished))
/*
* Most updates skip the btree write buffer until journal replay is
* finished because synchronization with journal replay relies on having
* a btree node locked - if we're overwriting a key in the journal that
* journal replay hasn't yet replayed, we have to mark it as
* overwritten.
*
* But accounting updates don't overwrite, they're deltas, and they have
* to be flushed to the btree strictly in order for journal replay to be
* able to tell which updates need to be applied:
*/
if (k->k.type != KEY_TYPE_accounting &&
unlikely(trans->journal_replay_not_finished))
return bch2_btree_insert_clone_trans(trans, btree, k);

struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
@@ -178,14 +192,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))

#define bch2_trans_run(_c, _do) \
({ \
struct btree_trans *trans = bch2_trans_get(_c); \
int _ret = (_do); \
bch2_trans_put(trans); \
_ret; \
})

#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))

@@ -203,14 +209,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
trans->journal_entries_u64s = 0;
trans->hooks = NULL;
trans->extra_disk_res = 0;

if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
memset((void *) trans->fs_usage_deltas +
offsetof(struct replicas_delta_list, memset_start), 0,
(void *) &trans->fs_usage_deltas->memset_end -
(void *) &trans->fs_usage_deltas->memset_start);
}
}

static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
@@ -61,7 +61,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!bpos_eq(b->data->min_key, POS_MIN)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
need_fsck_err(c, btree_root_bad_min_key,
need_fsck_err(trans, btree_root_bad_min_key,
"btree root with incorrect min_key: %s", buf.buf);
goto topology_repair;
}
@@ -69,7 +69,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->max_key);
need_fsck_err(c, btree_root_bad_max_key,
need_fsck_err(trans, btree_root_bad_max_key,
"btree root with incorrect max_key: %s", buf.buf);
goto topology_repair;
}
@@ -105,7 +105,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
prt_str(&buf, "\n next ");
bch2_bkey_val_to_text(&buf, c, k);

need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
goto topology_repair;
}

@@ -122,7 +122,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
goto topology_repair;
} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
bch2_topology_error(c);
@@ -135,7 +135,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
prt_str(&buf, "\n last key ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));

need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
goto topology_repair;
}
out:
@@ -565,10 +565,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
{
struct bch_fs *c = as->c;

if (as->took_gc_lock)
up_read(&c->gc_lock);
as->took_gc_lock = false;

bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
@@ -1117,10 +1113,6 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *

BUG_ON(as->mode == BTREE_UPDATE_none);

if (as->took_gc_lock)
up_read(&as->c->gc_lock);
as->took_gc_lock = false;

bch2_btree_reserve_put(as, trans);

continue_at(&as->cl, btree_update_set_nodes_written,
@@ -1192,14 +1184,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}

if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
return ERR_PTR(ret);
}
}

as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
@@ -1208,7 +1192,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->ip_started = _RET_IP_;
as->mode = BTREE_UPDATE_none;
as->flags = flags;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level_start = level_start;
as->update_level_end = level_end;
@@ -1356,10 +1339,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
struct bch_fs *c = as->c;
struct bkey_packed *k;
struct printbuf buf = PRINTBUF;
unsigned long old, new, v;
unsigned long old, new;

BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
!btree_ptr_sectors_written(bkey_i_to_s_c(insert)));

if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
@@ -1395,14 +1378,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty_acct(c, b);

v = READ_ONCE(b->flags);
old = READ_ONCE(b->flags);
do {
old = new = v;
new = old;

new &= ~BTREE_WRITE_TYPE_MASK;
new |= BTREE_WRITE_interior;
new |= 1 << BTREE_NODE_need_write;
} while ((v = cmpxchg(&b->flags, old, new)) != old);
} while (!try_cmpxchg(&b->flags, &old, new));

printbuf_exit(&buf);
}
@@ -1777,7 +1760,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
int live_u64s_added, u64s_added;
int ret;

lockdep_assert_held(&c->gc_lock);
BUG_ON(!btree_node_intent_locked(path, b->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
@@ -54,7 +54,6 @@ struct btree_update {
enum btree_update_mode mode;
enum bch_trans_commit_flags flags;
unsigned nodes_written:1;
unsigned took_gc_lock:1;

enum btree_id btree_id;
unsigned update_level_start;
@@ -5,6 +5,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_accounting.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
@@ -132,7 +133,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,

static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
struct btree_write_buffered_key *wb,
bool *write_locked, size_t *fast)
bool *write_locked,
bool *accounting_accumulated,
size_t *fast)
{
struct btree_path *path;
int ret;
@@ -145,6 +148,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
if (ret)
return ret;

if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
struct bkey u;
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);

if (k.k->type == KEY_TYPE_accounting)
bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
bkey_s_c_to_accounting(k));
}
*accounting_accumulated = true;

/*
* We can't clone a path that has write locks: unshare it now, before
* set_pos and traverse():
@@ -257,8 +270,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_iter iter = { NULL };
size_t skipped = 0, fast = 0, slowpath = 0;
size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
int ret = 0;

bch2_trans_unlock(trans);
@@ -299,11 +313,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)

BUG_ON(!k->journal_seq);

if (!accounting_replay_done &&
k->k.k.type == KEY_TYPE_accounting) {
slowpath++;
continue;
}

if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];

skipped++;
if (k->k.k.type == KEY_TYPE_accounting &&
n->k.k.type == KEY_TYPE_accounting)
bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
bkey_i_to_s_c_accounting(&k->k));

overwritten++;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0;
continue;
@@ -338,13 +363,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bch2_btree_iter_set_pos(&iter, k->k.k.p);
btree_iter_path(trans, &iter)->preserve = false;

bool accounting_accumulated = false;
do {
if (race_fault()) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}

ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
ret = wb_flush_one(trans, &iter, k, &write_locked,
&accounting_accumulated, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -385,8 +412,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
if (!i->journal_seq)
continue;

bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
if (!accounting_replay_done &&
i->k.k.type == KEY_TYPE_accounting) {
could_not_insert++;
continue;
}

if (!could_not_insert)
bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);

bch2_trans_begin(trans);

@@ -399,13 +433,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
btree_write_buffered_insert(trans, i));
if (ret)
goto err;

i->journal_seq = 0;
}

/*
* If journal replay hasn't finished with accounting keys we
* can't flush accounting keys at all - condense them and leave
* them for next time.
*
* Q: Can the write buffer overflow?
* A: Shouldn't be any actual risk. It's just new accounting
* updates that the write buffer can't flush, and those are only
* going to be generated by interior btree node updates as
* journal replay has to split/rewrite nodes to make room for
* its updates.
*
* And for those new accounting updates, updates to the same
* counters get accumulated as they're flushed from the journal
* to the write buffer - see the patch for eytzinger tree
* accumulated. So we could only overflow if the number of
* distinct counters touched somehow was very large.
*/
if (could_not_insert) {
struct btree_write_buffered_key *dst = wb->flushing.keys.data;

darray_for_each(wb->flushing.keys, i)
if (i->journal_seq)
*dst++ = *i;
wb->flushing.keys.nr = dst - wb->flushing.keys.data;
}
}
err:
if (ret || !could_not_insert) {
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
}

bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
return ret;
}
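Note: the could_not_insert path above condenses wb->flushing.keys in place, keeping only entries whose journal_seq is still set. The same filter-in-place idiom, standalone:

#include <stddef.h>
#include <stdint.h>

struct wb_key { uint64_t journal_seq; /* ... */ };

static size_t compact(struct wb_key *keys, size_t nr)
{
	struct wb_key *dst = keys;

	for (struct wb_key *i = keys; i < keys + nr; i++)
		if (i->journal_seq) /* keep: not yet flushed */
			*dst++ = *i;

	return dst - keys; /* new element count */
}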
@@ -507,6 +573,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
}

static void wb_accounting_sort(struct btree_write_buffer *wb)
{
eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
sizeof(wb->accounting.data[0]),
wb_key_cmp, NULL);
}

int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
struct bkey_i_accounting *k)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_write_buffered_key new = { .btree = btree };

bkey_copy(&new.k, &k->k_i);

int ret = darray_push(&wb->accounting, new);
if (ret)
return ret;

wb_accounting_sort(wb);
return 0;
}

int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
@@ -576,11 +665,35 @@ void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke

bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);

darray_for_each(wb->accounting, i)
memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
}

void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
unsigned live_accounting_keys = 0;
int ret = 0;

darray_for_each(wb->accounting, i)
if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
i->journal_seq = dst->seq;
live_accounting_keys++;
ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
if (ret)
break;
}

if (live_accounting_keys * 2 < wb->accounting.nr) {
struct btree_write_buffered_key *dst = wb->accounting.data;

darray_for_each(wb->accounting, src)
if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
*dst++ = *src;
wb->accounting.nr = dst - wb->accounting.data;
wb_accounting_sort(wb);
}

if (!dst->wb->keys.nr)
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
@@ -593,6 +706,8 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
mutex_unlock(&wb->inc.lock);

return ret;
}

static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
@@ -616,7 +731,7 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
buf->need_flush_to_write_buffer = false;
spin_unlock(&c->journal.lock);
out:
bch2_journal_keys_to_write_buffer_end(c, &dst);
ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
return ret;
}

@@ -648,6 +763,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
!bch2_journal_error(&c->journal));

darray_exit(&wb->accounting);
darray_exit(&wb->sorted);
darray_exit(&wb->flushing.keys);
darray_exit(&wb->inc.keys);
@@ -3,6 +3,7 @@
#define _BCACHEFS_BTREE_WRITE_BUFFER_H

#include "bkey.h"
#include "disk_accounting.h"

static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
{
@@ -29,16 +30,45 @@ struct journal_keys_to_wb {
u64 seq;
};

static inline int wb_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;

return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
}

int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
enum btree_id, struct bkey_i_accounting *);

static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
enum btree_id btree, struct bkey_i_accounting *k)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_write_buffered_key search;
search.btree = btree;
search.k.k.p = k->k.p;

unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
sizeof(wb->accounting.data[0]),
wb_key_cmp, &search);

if (idx >= wb->accounting.nr)
return bch2_accounting_key_to_wb_slowpath(c, btree, k);

struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
return 0;
}

int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
struct journal_keys_to_wb *,
enum btree_id, struct bkey_i *);

static inline int bch2_journal_key_to_wb(struct bch_fs *c,
static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
EBUG_ON(!dst->seq);

if (unlikely(!dst->room))
return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);

@@ -51,8 +81,19 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c,
return 0;
}

static inline int bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
EBUG_ON(!dst->seq);

return k->k.type == KEY_TYPE_accounting
? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
: __bch2_journal_key_to_wb(c, dst, btree, k);
}

void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);

int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
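Note: the fast path in bch2_accounting_key_to_wb() above relies on eytzinger0_find(), which searches an array kept in eytzinger (BFS) order and, by convention here, returns an index >= nr when nothing matches - hence wb_accounting_sort() after every slow-path insert. Functionally it answers the same question as a bsearch over a sorted array; a hedged equivalent:

#include <stdlib.h>

/* plain-array equivalent: the eytzinger layout only changes element order */
static struct btree_write_buffered_key *
accounting_lookup(struct btree_write_buffered_key *data, size_t nr,
		  const struct btree_write_buffered_key *search)
{
	return bsearch(search, data, nr, sizeof(*data), wb_key_cmp);
}

Note the search key only needs the fields wb_key_cmp() reads (btree and k.k.p) initialized, which is exactly what the inline helper sets up.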
@@ -52,6 +52,8 @@ struct btree_write_buffer {
struct btree_write_buffer_keys inc;
struct btree_write_buffer_keys flushing;
struct work_struct flush_work;

DARRAY(struct btree_write_buffered_key) accounting;
};

#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
File diff suppressed because it is too large
@ -85,7 +85,7 @@ static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
	return rcu_dereference_check(ca->buckets_gc,
				     !ca->fs ||
				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
				     lockdep_is_held(&ca->fs->gc_lock) ||
				     lockdep_is_held(&ca->fs->state_lock) ||
				     lockdep_is_held(&ca->bucket_lock));
}

@ -102,7 +102,7 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
	return rcu_dereference_check(ca->bucket_gens,
				     !ca->fs ||
				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
				     lockdep_is_held(&ca->fs->gc_lock) ||
				     lockdep_is_held(&ca->fs->state_lock) ||
				     lockdep_is_held(&ca->bucket_lock));
}

@ -199,7 +199,6 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
	return ret;
}

void bch2_dev_usage_init(struct bch_dev *);
void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);

static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
@ -261,73 +260,14 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,

/* Filesystem usage: */

static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
{
	return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
}

static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
	return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
}

static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
{
	return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
}

static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
{
	return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
}

static inline unsigned dev_usage_u64s(void)
{
	return sizeof(struct bch_dev_usage) / sizeof(u64);
}

u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);

struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);

void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);

void bch2_fs_usage_to_text(struct printbuf *,
			   struct bch_fs *, struct bch_fs_usage_online *);

u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);

struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);

void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
			   const struct bch_alloc_v4 *,
			   const struct bch_alloc_v4 *, u64, bool);

/* key/bucket marking: */

static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
						unsigned journal_seq,
						bool gc)
{
	percpu_rwsem_assert_held(&c->mark_lock);
	BUG_ON(!gc && !journal_seq);

	return this_cpu_ptr(gc
			    ? c->usage_gc
			    : c->usage[journal_seq & JOURNAL_BUF_MASK]);
}

int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
			 struct bch_replicas_entry_v1 *, s64,
			 unsigned, bool);
int bch2_update_replicas_list(struct btree_trans *,
			      struct bch_replicas_entry_v1 *, s64);
int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);

void bch2_fs_usage_initialize(struct bch_fs *);

int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
			   struct bkey_s_c, const struct bch_extent_ptr *,
			   s64, enum bch_data_type, u8, u8, u32 *);
@ -356,9 +296,6 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,

void bch2_trans_account_disk_usage_change(struct btree_trans *);

void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);

int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
				    enum bch_data_type, unsigned,
				    enum btree_iter_update_trigger_flags);
@ -419,13 +356,13 @@ static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reserv
#ifdef __KERNEL__
	u64 old, new;

	old = this_cpu_read(c->pcpu->sectors_available);
	do {
		old = this_cpu_read(c->pcpu->sectors_available);
		if (sectors > old)
			return __bch2_disk_reservation_add(c, res, sectors, flags);

		new = old - sectors;
	} while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
	} while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));

	this_cpu_add(*c->online_reserved, sectors);
	res->sectors += sectors;
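The hunk above hoists the initial read out of the loop and switches from an open-coded cmpxchg loop to this_cpu_try_cmpxchg(), which refreshes `old` in place when the exchange fails instead of requiring a re-read each iteration. A minimal user-space analogue of the same pattern, using GCC/Clang's __atomic builtins - the `try_reserve` helper and plain uint64_t are illustrative, not bcachefs API:

```c
#include <stdbool.h>
#include <stdint.h>

/* Reserve @sectors from @available, failing if not enough remain. */
static bool try_reserve(uint64_t *available, uint64_t sectors)
{
	/* One load up front; on CAS failure, __atomic_compare_exchange_n
	 * rewrites 'old' with the current value - no explicit re-read. */
	uint64_t old = __atomic_load_n(available, __ATOMIC_RELAXED);
	uint64_t new;

	do {
		if (sectors > old)
			return false;	/* the real code falls back to a slowpath */
		new = old - sectors;
	} while (!__atomic_compare_exchange_n(available, &old, new, true,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));
	return true;
}
```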
@ -16,7 +16,8 @@ struct bucket {
	u32			stripe;
	u32			dirty_sectors;
	u32			cached_sectors;
};
	u32			stripe_sectors;
} __aligned(sizeof(long));

struct bucket_array {
	struct rcu_head		rcu;
@ -33,7 +34,7 @@ struct bucket_gens {
};

struct bch_dev_usage {
	struct {
	struct bch_dev_usage_type {
		u64		buckets;
		u64		sectors; /* _compressed_ sectors: */
		/*
@ -54,18 +55,6 @@ struct bch_fs_usage_base {
	u64			nr_inodes;
};

struct bch_fs_usage {
	/* all fields are in units of 512 byte sectors: */
	struct bch_fs_usage_base b;
	u64			persistent_reserved[BCH_REPLICAS_MAX];
	u64			replicas[];
};

struct bch_fs_usage_online {
	u64			online_reserved;
	struct bch_fs_usage	u;
};

struct bch_fs_usage_short {
	u64			capacity;
	u64			used;
@ -5,6 +5,7 @@
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
#include "disk_accounting.h"
#include "journal.h"
#include "move.h"
#include "recovery_passes.h"
@ -213,9 +214,21 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		char *ro, *rest;

		/*
		 * If passed a "read_only" mount option, remove it because it is
		 * no longer a valid mount option, and the filesystem will be
		 * set "read_only" regardless.
		 */
		ro = strstr(optstr, "read_only");
		if (ro) {
			rest = ro + strlen("read_only");
			memmove(ro, rest, strlen(rest) + 1);
		}

		ret = PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, optstr);
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
		kfree(optstr);

		if (ret)
@ -223,6 +236,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);
@ -501,11 +515,9 @@ static long bch2_ioctl_data(struct bch_fs *c,
static long bch2_ioctl_fs_usage(struct bch_fs *c,
				struct bch_ioctl_fs_usage __user *user_arg)
{
	struct bch_ioctl_fs_usage *arg = NULL;
	struct bch_replicas_usage *dst_e, *dst_end;
	struct bch_fs_usage_online *src;
	struct bch_ioctl_fs_usage arg;
	darray_char replicas = {};
	u32 replica_entries_bytes;
	unsigned i;
	int ret = 0;

	if (!test_bit(BCH_FS_started, &c->flags))
@ -514,62 +526,61 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
	if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
		return -EFAULT;

	arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
	if (!arg)
		return -ENOMEM;

	src = bch2_fs_usage_read(c);
	if (!src) {
		ret = -ENOMEM;
		goto err;
	}

	arg->capacity		= c->capacity;
	arg->used		= bch2_fs_sectors_used(c, src);
	arg->online_reserved	= src->online_reserved;

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		arg->persistent_reserved[i] = src->u.persistent_reserved[i];

	dst_e	= arg->replicas;
	dst_end	= (void *) arg->replicas + replica_entries_bytes;

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *src_e =
			cpu_replicas_entry(&c->replicas, i);

		/* check that we have enough space for one replicas entry */
		if (dst_e + 1 > dst_end) {
			ret = -ERANGE;
			break;
		}

		dst_e->sectors	= src->u.replicas[i];
		dst_e->r	= *src_e;

		/* recheck after setting nr_devs: */
		if (replicas_usage_next(dst_e) > dst_end) {
			ret = -ERANGE;
			break;
		}

		memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);

		dst_e = replicas_usage_next(dst_e);
	}

	arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;

	percpu_up_read(&c->mark_lock);
	kfree(src);

	ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
		(replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
		copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
	if (ret)
		goto err;

	ret = copy_to_user_errcode(user_arg, arg,
			sizeof(*arg) + arg->replica_entries_bytes);
	struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
	arg.capacity		= c->capacity;
	arg.used		= u.used;
	arg.online_reserved	= percpu_u64_get(c->online_reserved);
	arg.replica_entries_bytes = replicas.nr;

	for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
		struct disk_accounting_pos k = {
			.type = BCH_DISK_ACCOUNTING_persistent_reserved,
			.persistent_reserved.nr_replicas = i,
		};

		bch2_accounting_mem_read(c,
					 disk_accounting_pos_to_bpos(&k),
					 &arg.persistent_reserved[i], 1);
	}

	ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
err:
	kfree(arg);
	darray_exit(&replicas);
	return ret;
}

static long bch2_ioctl_query_accounting(struct bch_fs *c,
			struct bch_ioctl_query_accounting __user *user_arg)
{
	struct bch_ioctl_query_accounting arg;
	darray_char accounting = {};
	int ret = 0;

	if (!test_bit(BCH_FS_started, &c->flags))
		return -EINVAL;

	ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
		bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
		(arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
		copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
	if (ret)
		goto err;

	arg.capacity		= c->capacity;
	arg.used		= bch2_fs_usage_read_short(c).used;
	arg.online_reserved	= percpu_u64_get(c->online_reserved);
	arg.accounting_u64s	= accounting.nr / sizeof(u64);

	ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
err:
	bch_err_fn(c, ret);
	darray_exit(&accounting);
	return ret;
}
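bch2_ioctl_query_accounting() above is the userspace-facing half of the new accounting export. A hedged sketch of a caller - the bch_ioctl_query_accounting layout and the BCH_IOCTL_QUERY_ACCOUNTING number come from bcachefs_ioctl.h (also touched by this commit), while `fs_fd` and the 4096-byte result buffer are illustrative choices:

```c
#include <sys/ioctl.h>
#include <stdlib.h>
#include <err.h>
#include "libbcachefs/bcachefs_ioctl.h"

/* Illustrative only: query replicas accounting from an open fs fd. */
static void query_replicas_accounting(int fs_fd)
{
	struct bch_ioctl_query_accounting *arg =
		calloc(1, sizeof(*arg) + 4096);

	arg->accounting_u64s	   = 4096 / sizeof(__u64);
	arg->accounting_types_mask = 1U << BCH_DISK_ACCOUNTING_replicas;

	if (ioctl(fs_fd, BCH_IOCTL_QUERY_ACCOUNTING, arg) < 0)
		err(1, "BCH_IOCTL_QUERY_ACCOUNTING");

	/* On success, arg->accounting holds KEY_TYPE_accounting keys packed
	 * back to back, and arg->accounting_u64s was rewritten to the number
	 * of u64s returned; capacity/used/online_reserved are filled in too. */
	free(arg);
}
```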
@ -604,7 +615,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
	arg.bucket_size		= ca->mi.bucket_size;
	arg.nr_buckets		= ca->mi.nbuckets - ca->mi.first_bucket;

	for (i = 0; i < BCH_DATA_NR; i++) {
	for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
		arg.d[i].buckets	= src.d[i].buckets;
		arg.d[i].sectors	= src.d[i].sectors;
		arg.d[i].fragmented	= src.d[i].fragmented;
@ -849,7 +860,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
	char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

	ret = PTR_ERR_OR_ZERO(optstr) ?:
		bch2_parse_mount_opts(c, &thr->opts, optstr);
		bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
	kfree(optstr);

	if (ret)
@ -925,6 +936,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
		BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
	case BCH_IOCTL_FSCK_ONLINE:
		BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
	case BCH_IOCTL_QUERY_ACCOUNTING:
		return bch2_ioctl_query_accounting(c, arg);
	default:
		return -ENOTTY;
	}
@ -534,6 +534,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
{
	struct qstr name = bch2_dirent_get_name(d);
	/*
	 * Although not required by the kernel code, updating ctx->pos is needed
	 * for the bcachefs FUSE driver. Without this update, the FUSE
	 * implementation will be stuck in an infinite loop when reading
	 * directories (via the bcachefs_fuse_readdir callback).
	 * In kernel space, ctx->pos is updated by the VFS code.
	 */
	ctx->pos = d.k->p.offset;
	bool ret = dir_emit(ctx, name.name,
		      name.len,
		      target.inum,
742
libbcachefs/disk_accounting.c
Normal file
@ -0,0 +1,742 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "compress.h"
#include "disk_accounting.h"
#include "error.h"
#include "journal_io.h"
#include "replicas.h"

/*
 * Notes on disk accounting:
 *
 * We have two parallel sets of counters to be concerned with, and both must be
 * kept in sync.
 *
 *  - Persistent/on disk accounting, stored in the accounting btree and updated
 *    via btree write buffer updates that treat new accounting keys as deltas to
 *    apply to existing values. But reading from a write buffer btree is
 *    expensive, so we also have
 *
 *  - In memory accounting, where accounting is stored as an array of percpu
 *    counters, indexed by an eytzinger array of disk accounting keys/bpos (which
 *    are the same thing, excepting byte swabbing on big endian).
 *
 *    Cheap to read, but non persistent.
 *
 * Disk accounting updates are generated by transactional triggers; these run as
 * keys enter and leave the btree, and can compare old and new versions of keys;
 * the output of these triggers are deltas to the various counters.
 *
 * Disk accounting updates are done as btree write buffer updates, where the
 * counters in the disk accounting key are deltas that will be applied to the
 * counter in the btree when the key is flushed by the write buffer (or journal
 * replay).
 *
 * To do a disk accounting update:
 * - initialize a disk_accounting_pos, to specify which counter is being updated
 * - initialize counter deltas, as an array of 1-3 s64s
 * - call bch2_disk_accounting_mod()
 *
 * This queues up the accounting update to be done at transaction commit time.
 * Underneath, it's a normal btree write buffer update.
 *
 * The transaction commit path is responsible for propagating updates to the in
 * memory counters, with bch2_accounting_mem_mod().
 *
 * The commit path also assigns every disk accounting update a unique version
 * number, based on the journal sequence number and offset within that journal
 * buffer; this is used by journal replay to determine which updates have been
 * done.
 *
 * The transaction commit path also ensures that replicas entry accounting
 * updates are properly marked in the superblock (so that we know whether we can
 * mount without data being unavailable); it will update the superblock if
 * bch2_accounting_mem_mod() tells it to.
 */
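The three-step recipe in the comment above is the entire client-facing API. As a minimal sketch under stated assumptions - the helper name and the nr_inodes delta are illustrative, real callers are the transactional triggers - an update from inside a btree transaction looks like:

```c
/* Sketch: a hypothetical helper bumping the fs-wide inode count by one. */
static int acct_add_inode(struct btree_trans *trans)
{
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_nr_inodes,
	};
	s64 d[1] = { 1 };	/* deltas, not absolute values */

	/* gc=false: queue a write buffer update, applied at commit time */
	return bch2_disk_accounting_mod(trans, &acc, d, ARRAY_SIZE(d), false);
}
```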
static const char * const disk_accounting_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	NULL
};

static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos pos,
				       s64 *d, unsigned nr)
{
	struct bkey_i_accounting *acc = bkey_accounting_init(k);

	acc->k.p = disk_accounting_pos_to_bpos(&pos);
	set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);

	memcpy_u64s_small(acc->v.d, d, nr);
}

int bch2_disk_accounting_mod(struct btree_trans *trans,
			     struct disk_accounting_pos *k,
			     s64 *d, unsigned nr, bool gc)
{
	/* Normalize: */
	switch (k->type) {
	case BCH_DISK_ACCOUNTING_replicas:
		bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
		break;
	}

	BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);

	struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;

	accounting_key_init(&k_i.k, *k, d, nr);

	return likely(!gc)
		? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k)
		: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
}

int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
				unsigned dev, s64 sectors,
				bool gc)
{
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_replicas,
	};

	bch2_replicas_entry_cached(&acc.replicas, dev);

	return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
}

int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k,
			    enum bch_validate_flags flags,
			    struct printbuf *err)
{
	return 0;
}

void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
{
	if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
		prt_printf(out, "unknown type %u", k->type);
		return;
	}

	prt_str(out, disk_accounting_type_strs[k->type]);
	prt_str(out, " ");

	switch (k->type) {
	case BCH_DISK_ACCOUNTING_nr_inodes:
		break;
	case BCH_DISK_ACCOUNTING_persistent_reserved:
		prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
		break;
	case BCH_DISK_ACCOUNTING_replicas:
		bch2_replicas_entry_to_text(out, &k->replicas);
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
		bch2_prt_data_type(out, k->dev_data_type.data_type);
		break;
	case BCH_DISK_ACCOUNTING_compression:
		bch2_prt_compression_type(out, k->compression.type);
		break;
	case BCH_DISK_ACCOUNTING_snapshot:
		prt_printf(out, "id=%u", k->snapshot.id);
		break;
	case BCH_DISK_ACCOUNTING_btree:
		prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id));
		break;
	}
}

void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, k.k->p);

	bch2_accounting_key_to_text(out, &acc_k);

	for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
		prt_printf(out, " %lli", acc.v->d[i]);
}

void bch2_accounting_swab(struct bkey_s k)
{
	for (u64 *p = (u64 *) k.v;
	     p < (u64 *) bkey_val_end(k);
	     p++)
		*p = swab64(*p);
}

static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
{
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, p);

	switch (acc_k.type) {
	case BCH_DISK_ACCOUNTING_replicas:
		memcpy(r, &acc_k.replicas, replicas_entry_bytes(&acc_k.replicas));
		return true;
	default:
		return false;
	}
}

static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
	struct bch_replicas_padded r;
	return accounting_to_replicas(&r.e, p)
		? bch2_mark_replicas(c, &r.e)
		: 0;
}

/*
 * Ensure accounting keys being updated are present in the superblock, when
 * applicable (i.e. replicas updates)
 */
int bch2_accounting_update_sb(struct btree_trans *trans)
{
	for (struct jset_entry *i = trans->journal_entries;
	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     i = vstruct_next(i))
		if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
			int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
			if (ret)
				return ret;
		}

	return 0;
}

static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
{
	struct bch_replicas_padded r;

	if (accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return -BCH_ERR_btree_insert_need_mark_replicas;

	struct bch_accounting_mem *acc = &c->accounting[gc];
	unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k);

	u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64),
							sizeof(u64), GFP_KERNEL);
	if (!new_counters)
		return -BCH_ERR_ENOMEM_disk_accounting;

	preempt_disable();
	memcpy(this_cpu_ptr(new_counters),
	       bch2_acc_percpu_u64s(acc->v, acc->nr_counters),
	       acc->nr_counters * sizeof(u64));
	preempt_enable();

	struct accounting_pos_offset n = {
		.pos		= a.k->p,
		.version	= a.k->version,
		.offset		= acc->nr_counters,
		.nr_counters	= bch2_accounting_counters(a.k),
	};
	if (darray_push(&acc->k, n)) {
		free_percpu(new_counters);
		return -BCH_ERR_ENOMEM_disk_accounting;
	}

	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL);

	free_percpu(acc->v);
	acc->v = new_counters;
	acc->nr_counters = new_nr_counters;

	for (unsigned i = 0; i < n.nr_counters; i++)
		this_cpu_add(acc->v[n.offset + i], a.v->d[i]);
	return 0;
}

int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
{
	percpu_up_read(&c->mark_lock);
	percpu_down_write(&c->mark_lock);
	int ret = __bch2_accounting_mem_mod_slowpath(c, a, gc);
	percpu_up_write(&c->mark_lock);
	percpu_down_read(&c->mark_lock);
	return ret;
}

/*
 * Read out accounting keys for replicas entries, as an array of
 * bch_replicas_usage entries.
 *
 * Note: this may be deprecated/removed at some point in the future and replaced
 * with something more general, it exists to support the ioctl used by the
 * 'bcachefs fs usage' command.
 */
int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
{
	struct bch_accounting_mem *acc = &c->accounting[0];
	int ret = 0;

	darray_init(usage);

	percpu_down_read(&c->mark_lock);
	darray_for_each(acc->k, i) {
		struct {
			struct bch_replicas_usage r;
			u8 pad[BCH_BKEY_PTRS_MAX];
		} u;

		if (!accounting_to_replicas(&u.r.r, i->pos))
			continue;

		bch2_accounting_mem_read(c, i->pos, &u.r.sectors, 1);

		ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
		if (ret)
			break;

		memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
		usage->nr += replicas_usage_bytes(&u.r);
	}
	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(usage);
	return ret;
}

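Because bch_replicas_usage entries are variable length, the darray filled in above is a byte buffer rather than a typed array; walking it means stepping with replicas_usage_next(). A hedged consumer sketch - the helper name and the printbuf output are illustrative:

```c
/* Sketch: walk the variable-length bch_replicas_usage entries that
 * bch2_fs_replicas_usage_read() packed into @usage (a darray_char). */
static void print_replicas_usage(struct printbuf *out, darray_char *usage)
{
	struct bch_replicas_usage *u   = (void *) usage->data;
	struct bch_replicas_usage *end = (void *) (usage->data + usage->nr);

	while (u < end) {
		/* u->r is the bch_replicas_entry_v1, u->sectors the counter */
		prt_printf(out, "%llu sectors\n", u->sectors);
		u = replicas_usage_next(u);	/* entries are variable length */
	}
}
```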
int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
{
	struct bch_accounting_mem *acc = &c->accounting[0];
	int ret = 0;

	darray_init(out_buf);

	percpu_down_read(&c->mark_lock);
	darray_for_each(acc->k, i) {
		struct disk_accounting_pos a_p;
		bpos_to_disk_accounting_pos(&a_p, i->pos);

		if (!(accounting_types_mask & BIT(a_p.type)))
			continue;

		ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
				       sizeof(u64) * i->nr_counters);
		if (ret)
			break;

		struct bkey_i_accounting *a_out =
			bkey_accounting_init((void *) &darray_top(*out_buf));
		set_bkey_val_u64s(&a_out->k, i->nr_counters);
		a_out->k.p = i->pos;
		bch2_accounting_mem_read(c, i->pos, a_out->v.d, i->nr_counters);

		if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
			out_buf->nr += bkey_bytes(&a_out->k);
	}

	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(out_buf);
	return ret;
}

void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting[0];

	percpu_down_read(&c->mark_lock);
	out->atomic++;

	eytzinger0_for_each(i, acc->k.nr) {
		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos);

		bch2_accounting_key_to_text(out, &acc_k);

		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
		bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false);

		prt_str(out, ":");
		for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
			prt_printf(out, " %llu", v[j]);
		prt_newline(out);
	}

	--out->atomic;
	percpu_up_read(&c->mark_lock);
}

/* Ensures all counters in @src exist in @dst: */
static int copy_counters(struct bch_accounting_mem *dst,
			 struct bch_accounting_mem *src)
{
	unsigned orig_dst_k_nr	= dst->k.nr;
	unsigned dst_counters	= dst->nr_counters;

	darray_for_each(src->k, i)
		if (eytzinger0_find(dst->k.data, orig_dst_k_nr, sizeof(dst->k.data[0]),
				    accounting_pos_cmp, &i->pos) >= orig_dst_k_nr) {
			if (darray_push(&dst->k, ((struct accounting_pos_offset) {
						  .pos		= i->pos,
						  .offset	= dst_counters,
						  .nr_counters	= i->nr_counters })))
				goto err;

			dst_counters += i->nr_counters;
		}

	if (dst->k.nr == orig_dst_k_nr)
		return 0;

	u64 __percpu *new_counters = __alloc_percpu_gfp(dst_counters * sizeof(u64),
							sizeof(u64), GFP_KERNEL);
	if (!new_counters)
		goto err;

	preempt_disable();
	memcpy(this_cpu_ptr(new_counters),
	       bch2_acc_percpu_u64s(dst->v, dst->nr_counters),
	       dst->nr_counters * sizeof(u64));
	preempt_enable();

	free_percpu(dst->v);
	dst->v = new_counters;
	dst->nr_counters = dst_counters;

	eytzinger0_sort(dst->k.data, dst->k.nr, sizeof(dst->k.data[0]), accounting_pos_cmp, NULL);

	return 0;
err:
	dst->k.nr = orig_dst_k_nr;
	return -BCH_ERR_ENOMEM_disk_accounting;
}

int bch2_accounting_gc_done(struct bch_fs *c)
{
	struct bch_accounting_mem *dst = &c->accounting[0];
	struct bch_accounting_mem *src = &c->accounting[1];
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	percpu_down_write(&c->mark_lock);

	ret = copy_counters(dst, src) ?:
		copy_counters(src, dst);
	if (ret)
		goto err;

	BUG_ON(dst->k.nr != src->k.nr);

	for (unsigned i = 0; i < src->k.nr; i++) {
		BUG_ON(src->k.data[i].nr_counters != dst->k.data[i].nr_counters);
		BUG_ON(!bpos_eq(dst->k.data[i].pos, src->k.data[i].pos));

		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, src->k.data[i].pos);

		unsigned nr = src->k.data[i].nr_counters;
		u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
		u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];

		bch2_accounting_mem_read_counters(c, i, dst_v, nr, false);
		bch2_accounting_mem_read_counters(c, i, src_v, nr, true);

		if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
			printbuf_reset(&buf);
			prt_str(&buf, "accounting mismatch for ");
			bch2_accounting_key_to_text(&buf, &acc_k);

			prt_str(&buf, ": got");
			for (unsigned j = 0; j < nr; j++)
				prt_printf(&buf, " %llu", dst_v[j]);

			prt_str(&buf, " should be");
			for (unsigned j = 0; j < nr; j++)
				prt_printf(&buf, " %llu", src_v[j]);

			for (unsigned j = 0; j < nr; j++)
				src_v[j] -= dst_v[j];

			if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
				ret = commit_do(trans, NULL, NULL, 0,
						bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
				if (ret)
					goto err;

				if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
					memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
					struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;

					accounting_key_init(&k_i.k, acc_k, src_v, nr);
					bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false);

					preempt_disable();
					struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
					struct bch_fs_usage_base *src = &trans->fs_usage_delta;
					acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
					preempt_enable();
				}
			}
		}
	}
err:
fsck_err:
	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int accounting_read_key(struct bch_fs *c, struct btree_trans *trans, struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;

	if (k.k->type != KEY_TYPE_accounting)
		return 0;

	percpu_down_read(&c->mark_lock);
	int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false);
	percpu_up_read(&c->mark_lock);

	if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
	    ret == -BCH_ERR_btree_insert_need_mark_replicas)
		ret = 0;

	struct disk_accounting_pos acc;
	bpos_to_disk_accounting_pos(&acc, k.k->p);

	if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas,
			trans, accounting_replicas_not_marked,
			"accounting not marked in superblock replicas\n %s",
			(bch2_accounting_key_to_text(&buf, &acc),
			 buf.buf)))
		ret = bch2_accounting_update_sb_one(c, k.k->p);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * At startup time, initialize the in memory accounting from the btree (and
 * journal)
 */
int bch2_accounting_read(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting[0];

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				BTREE_ID_accounting, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			struct bkey u;
			struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
			accounting_read_key(c, trans, k);
		})));
	if (ret)
		goto err;

	struct journal_keys *keys = &c->journal_keys;
	move_gap(keys, keys->nr);
	darray_for_each(*keys, i) {
		if (i->k->k.type == KEY_TYPE_accounting) {
			struct bkey_s_c k = bkey_i_to_s_c(i->k);
			unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
						       sizeof(acc->k.data[0]),
						       accounting_pos_cmp, &k.k->p);

			bool applied = idx < acc->k.nr &&
				bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0;

			if (applied)
				continue;

			ret = accounting_read_key(c, NULL, k);
			if (ret)
				goto err;
		}
	}

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);

	for (unsigned i = 0; i < acc->k.nr; i++) {
		struct disk_accounting_pos k;
		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);

		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
		bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false);

		switch (k.type) {
		case BCH_DISK_ACCOUNTING_persistent_reserved:
			usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
			break;
		case BCH_DISK_ACCOUNTING_replicas:
			fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
			break;
		case BCH_DISK_ACCOUNTING_dev_data_type:
			rcu_read_lock();
			struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev);
			if (ca) {
				struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
				percpu_u64_set(&d->buckets,	v[0]);
				percpu_u64_set(&d->sectors,	v[1]);
				percpu_u64_set(&d->fragmented,	v[2]);

				if (k.dev_data_type.data_type == BCH_DATA_sb ||
				    k.dev_data_type.data_type == BCH_DATA_journal)
					usage->hidden += v[0] * ca->mi.bucket_size;
			}
			rcu_read_unlock();
			break;
		}
	}
	preempt_enable();
	percpu_up_read(&c->mark_lock);
err:
	bch_err_fn(c, ret);
	return ret;
}

int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
{
	return bch2_trans_run(c,
		bch2_btree_write_buffer_flush_sync(trans) ?:
		for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
				BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
			struct disk_accounting_pos acc;
			bpos_to_disk_accounting_pos(&acc, k.k->p);

			acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
			acc.dev_data_type.dev == dev
				? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
				: 0;
		})) ?:
		bch2_btree_write_buffer_flush_sync(trans));
}

int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
{
	struct bch_fs *c = ca->fs;
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_dev_data_type,
		.dev_data_type.dev = ca->dev_idx,
		.dev_data_type.data_type = BCH_DATA_free,
	};
	u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };

	int ret = bch2_trans_do(c, NULL, NULL, 0,
			bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc));
	bch_err_fn(c, ret);
	return ret;
}

void bch2_verify_accounting_clean(struct bch_fs *c)
{
	bool mismatch = false;
	struct bch_fs_usage_base base = {}, base_inmem = {};

	bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				BTREE_ID_accounting, POS_MIN,
				BTREE_ITER_all_snapshots, k, ({
			u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
			struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
			unsigned nr = bch2_accounting_counters(k.k);

			bch2_accounting_mem_read(c, k.k->p, v, nr);

			if (memcmp(a.v->d, v, nr * sizeof(u64))) {
				struct printbuf buf = PRINTBUF;

				bch2_bkey_val_to_text(&buf, c, k);
				prt_str(&buf, " !=");
				for (unsigned j = 0; j < nr; j++)
					prt_printf(&buf, " %llu", v[j]);

				pr_err("%s", buf.buf);
				printbuf_exit(&buf);
				mismatch = true;
			}

			struct disk_accounting_pos acc_k;
			bpos_to_disk_accounting_pos(&acc_k, a.k->p);

			switch (acc_k.type) {
			case BCH_DISK_ACCOUNTING_persistent_reserved:
				base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
				break;
			case BCH_DISK_ACCOUNTING_replicas:
				fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
				break;
			case BCH_DISK_ACCOUNTING_dev_data_type: {
				rcu_read_lock();
				struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
				if (!ca) {
					rcu_read_unlock();
					continue;
				}

				v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
				v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
				v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
				rcu_read_unlock();

				if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
					struct printbuf buf = PRINTBUF;

					bch2_bkey_val_to_text(&buf, c, k);
					prt_str(&buf, " in mem");
					for (unsigned j = 0; j < nr; j++)
						prt_printf(&buf, " %llu", v[j]);

					pr_err("dev accounting mismatch: %s", buf.buf);
					printbuf_exit(&buf);
					mismatch = true;
				}
			}
			}

			0;
		})));

	acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));

#define check(x)									\
	if (base.x != base_inmem.x) {							\
		pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
		mismatch = true;							\
	}

	//check(hidden);
	check(btree);
	check(data);
	check(cached);
	check(reserved);
	check(nr_inodes);

	WARN_ON(mismatch);
}

void bch2_accounting_free(struct bch_accounting_mem *acc)
{
	darray_exit(&acc->k);
	free_percpu(acc->v);
	acc->v = NULL;
	acc->nr_counters = 0;
}

void bch2_fs_accounting_exit(struct bch_fs *c)
{
	bch2_accounting_free(&c->accounting[0]);
}
211
libbcachefs/disk_accounting.h
Normal file
@ -0,0 +1,211 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_ACCOUNTING_H
#define _BCACHEFS_DISK_ACCOUNTING_H

#include "eytzinger.h"
#include "sb-members.h"

static inline void bch2_u64s_neg(u64 *v, unsigned nr)
{
	for (unsigned i = 0; i < nr; i++)
		v[i] = -v[i];
}

static inline unsigned bch2_accounting_counters(const struct bkey *k)
{
	return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
}

static inline void bch2_accounting_neg(struct bkey_s_accounting a)
{
	bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
}

static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
{
	for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
		if (a.v->d[i])
			return false;
	return true;
}

static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
					      struct bkey_s_c_accounting src)
{
	EBUG_ON(dst->k.u64s != src.k->u64s);

	for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
		dst->v.d[i] += src.v->d[i];
	if (bversion_cmp(dst->k.version, src.k->version) < 0)
		dst->k.version = src.k->version;
}

static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
					      enum bch_data_type data_type,
					      s64 sectors)
{
	switch (data_type) {
	case BCH_DATA_btree:
		fs_usage->btree += sectors;
		break;
	case BCH_DATA_user:
	case BCH_DATA_parity:
		fs_usage->data += sectors;
		break;
	case BCH_DATA_cached:
		fs_usage->cached += sectors;
		break;
	default:
		break;
	}
}

static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
{
	acc->_pad = p;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	bch2_bpos_swab(&acc->_pad);
#endif
}

static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k)
{
	struct bpos ret = k->_pad;

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	bch2_bpos_swab(&ret);
#endif
	return ret;
}

int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
			     s64 *, unsigned, bool);
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);

int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c,
			    enum bch_validate_flags, struct printbuf *);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(struct bkey_s);

#define bch2_bkey_ops_accounting ((struct bkey_ops) {	\
	.key_invalid	= bch2_accounting_invalid,	\
	.val_to_text	= bch2_accounting_to_text,	\
	.swab		= bch2_accounting_swab,		\
	.min_val_size	= 8,				\
})

int bch2_accounting_update_sb(struct btree_trans *);

static inline int accounting_pos_cmp(const void *_l, const void *_r)
{
	const struct bpos *l = _l, *r = _r;

	return bpos_cmp(*l, *r);
}

int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting, bool);

static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
{
	struct bch_accounting_mem *acc = &c->accounting[gc];
	unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
				       accounting_pos_cmp, &a.k->p);
	if (unlikely(idx >= acc->k.nr))
		return bch2_accounting_mem_mod_slowpath(c, a, gc);

	unsigned offset = acc->k.data[idx].offset;

	EBUG_ON(bch2_accounting_counters(a.k) != acc->k.data[idx].nr_counters);

	for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
		this_cpu_add(acc->v[offset + i], a.v->d[i]);
	return 0;
}

/*
 * Update in memory counters so they match the btree update we're doing; called
 * from transaction commit path
 */
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
	struct bch_fs *c = trans->c;

	if (!gc) {
		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, a.k->p);

		switch (acc_k.type) {
		case BCH_DISK_ACCOUNTING_persistent_reserved:
			trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
			break;
		case BCH_DISK_ACCOUNTING_replicas:
			fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
			break;
		case BCH_DISK_ACCOUNTING_dev_data_type:
			rcu_read_lock();
			struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
			if (ca) {
				this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets,	a.v->d[0]);
				this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors,	a.v->d[1]);
				this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented,	a.v->d[2]);
			}
			rcu_read_unlock();
			break;
		}
	}

	return __bch2_accounting_mem_mod(c, a, gc);
}

static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
	percpu_down_read(&trans->c->mark_lock);
	int ret = bch2_accounting_mem_mod_locked(trans, a, gc);
	percpu_up_read(&trans->c->mark_lock);
	return ret;
}

static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, unsigned idx,
						     u64 *v, unsigned nr, bool gc)
{
	memset(v, 0, sizeof(*v) * nr);

	struct bch_accounting_mem *acc = &c->accounting[gc];
	if (unlikely(idx >= acc->k.nr))
		return;

	unsigned offset = acc->k.data[idx].offset;
	nr = min_t(unsigned, nr, acc->k.data[idx].nr_counters);

	for (unsigned i = 0; i < nr; i++)
		v[i] = percpu_u64_get(acc->v + offset + i);
}

static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
					    u64 *v, unsigned nr)
{
	struct bch_accounting_mem *acc = &c->accounting[0];
	unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
				       accounting_pos_cmp, &p);

	bch2_accounting_mem_read_counters(c, idx, v, nr, false);
}

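bch2_accounting_mem_read() above is the cheap read side: an eytzinger lookup that resolves a bpos to an offset into the percpu counter array. A hedged sketch of reading one counter set - the helper is hypothetical; nr_inodes is convenient here because its pos needs no fields beyond the type tag:

```c
/* Sketch: snapshot the fs-wide inode count from the in-memory counters. */
static u64 acct_read_nr_inodes(struct bch_fs *c)
{
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_nr_inodes,
	};
	u64 v = 0;

	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
	return v;	/* summed over all CPUs at the time of the read */
}
```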
int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *);

int bch2_accounting_gc_done(struct bch_fs *);

int bch2_accounting_read(struct bch_fs *);

int bch2_dev_usage_remove(struct bch_fs *, unsigned);
int bch2_dev_usage_init(struct bch_dev *, bool);

void bch2_verify_accounting_clean(struct bch_fs *c);

void bch2_accounting_free(struct bch_accounting_mem *);
void bch2_fs_accounting_exit(struct bch_fs *);

#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
162
libbcachefs/disk_accounting_format.h
Normal file
@ -0,0 +1,162 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H

#include "replicas_format.h"

/*
 * Disk accounting - KEY_TYPE_accounting - on disk format:
 *
 * Here, the key has considerably more structure than a typical key (bpos); an
 * accounting key is 'struct disk_accounting_pos', which is a union of bpos.
 *
 * More specifically: a key is just a multiword integer (where word endianness
 * matches native byte order), so we're treating bpos as an opaque 20 byte
 * integer and mapping bch_accounting_key to that.
 *
 * This is a type-tagged union of all our various subtypes; a disk accounting
 * key can be device counters, replicas counters, et cetera - it's extensible.
 *
 * The value is a list of u64s or s64s; the number of counters is specific to a
 * given accounting type.
 *
 * Unlike with other key types, updates are _deltas_, and the deltas are not
 * resolved until the update to the underlying btree, done by btree write buffer
 * flush or journal replay.
 *
 * Journal replay in particular requires special handling. The journal tracks a
 * range of entries which may not yet have been applied to the btree - it does
 * not know definitively whether individual entries are dirty and still need to
 * be applied.
 *
 * To handle this, we use the version field of struct bkey, and give every
 * accounting update a unique version number - a total ordering in time; the
 * version number is derived from the key's position in the journal. Then
 * journal replay can compare the version number of the key from the journal
 * with the version number of the key in the btree to determine if a key needs
 * to be replayed.
 *
 * For this to work, we must maintain this strict time ordering of updates as
 * they are flushed to the btree, both via write buffer flush and via journal
 * replay. This has complications for the write buffer code while journal replay
 * is still in progress; the write buffer cannot flush any accounting keys to
 * the btree until journal replay has finished replaying its accounting keys, or
 * the (newer) version number of the keys from the write buffer will cause
 * updates from journal replay to be lost.
 */

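The version-ordering rule described above is exactly what bch2_accounting_read() (added by this commit in disk_accounting.c) applies when merging journal keys into the in-memory counters. A condensed sketch of that decision, with the surrounding iteration elided:

```c
/* Sketch: has this journal accounting key already been applied?
 * 'acc' is the in-memory bch_accounting_mem; 'k' a key from the journal. */
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
			       sizeof(acc->k.data[0]),
			       accounting_pos_cmp, &k.k->p);

bool applied = idx < acc->k.nr &&
	bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0;

if (!applied)
	accounting_read_key(c, NULL, k);	/* replay the delta */
```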
struct bch_accounting {
	struct bch_val		v;
	__u64			d[];
};

#define BCH_ACCOUNTING_MAX_COUNTERS		3

#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)		\
	x(unstriped,	10)

enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};

static inline bool data_type_is_empty(enum bch_data_type type)
{
	switch (type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		return true;
	default:
		return false;
	}
}

static inline bool data_type_is_hidden(enum bch_data_type type)
{
	switch (type) {
	case BCH_DATA_sb:
	case BCH_DATA_journal:
		return true;
	default:
		return false;
	}
}

#define BCH_DISK_ACCOUNTING_TYPES()		\
	x(nr_inodes,		0)		\
	x(persistent_reserved,	1)		\
	x(replicas,		2)		\
	x(dev_data_type,	3)		\
	x(compression,		4)		\
	x(snapshot,		5)		\
	x(btree,		6)		\
	x(rebalance_work,	7)

enum disk_accounting_type {
#define x(f, nr)	BCH_DISK_ACCOUNTING_##f = nr,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	BCH_DISK_ACCOUNTING_TYPE_NR,
};

struct bch_nr_inodes {
};

struct bch_persistent_reserved {
	__u8			nr_replicas;
};

struct bch_dev_data_type {
	__u8			dev;
	__u8			data_type;
};

struct bch_dev_stripe_buckets {
	__u8			dev;
};

struct bch_acct_compression {
	__u8			type;
};

struct bch_acct_snapshot {
	__u32			id;
};

struct bch_acct_btree {
	__u32			id;
};

struct disk_accounting_pos {
	union {
	struct {
		__u8				type;
		union {
		struct bch_nr_inodes		nr_inodes;
		struct bch_persistent_reserved	persistent_reserved;
		struct bch_replicas_entry_v1	replicas;
		struct bch_dev_data_type	dev_data_type;
		struct bch_dev_stripe_buckets	dev_stripe_buckets;
		struct bch_acct_compression	compression;
		struct bch_acct_snapshot	snapshot;
		struct bch_acct_btree		btree;
		};
	};
	struct bpos			_pad;
	};
};

#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
20
libbcachefs/disk_accounting_types.h
Normal file
@ -0,0 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H

#include "darray.h"

struct accounting_pos_offset {
	struct bpos		pos;
	struct bversion		version;
	u32			offset:24,
				nr_counters:8;
};

struct bch_accounting_mem {
	DARRAY(struct accounting_pos_offset)	k;
	u64 __percpu		*v;
	unsigned		nr_counters;
};

#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
@ -511,7 +511,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
		return -EINVAL;

	if (!c)
		return 0;
		return -BCH_ERR_option_needs_open_fs;

	if (!strlen(val) || !strcmp(val, "none")) {
		*res = 0;
115
libbcachefs/ec.c
@ -13,6 +13,7 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@ -292,12 +293,11 @@ static int mark_stripe_bucket(struct btree_trans *trans,
		bucket_lock(g);
		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
		if (!ret) {
			alloc_to_bucket(g, new);
			bch2_dev_usage_update(c, ca, &old, &new, 0, true);
		}
		alloc_to_bucket(g, new);
		bucket_unlock(g);
		percpu_up_read(&c->mark_lock);
		if (!ret)
			ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
	}
err:
	bch2_dev_put(ca);
@ -358,7 +358,12 @@ int bch2_trigger_stripe(struct btree_trans *trans,
	if (unlikely(flags & BTREE_TRIGGER_check_repair))
		return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);

	if (flags & BTREE_TRIGGER_transactional) {
	BUG_ON(new_s && old_s &&
	       (new_s->nr_blocks	!= old_s->nr_blocks ||
		new_s->nr_redundant	!= old_s->nr_redundant));

	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
		/*
		 * If the pointers aren't changing, we don't need to do anything:
		 */
@ -369,26 +374,58 @@ int bch2_trigger_stripe(struct btree_trans *trans,
			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
			return 0;

		BUG_ON(new_s && old_s &&
		       (new_s->nr_blocks	!= old_s->nr_blocks ||
			new_s->nr_redundant	!= old_s->nr_redundant));
		struct gc_stripe *gc = NULL;
		if (flags & BTREE_TRIGGER_gc) {
			gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
			if (!gc) {
				bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
				return -BCH_ERR_ENOMEM_mark_stripe;
			}

			/*
			 * This will be wrong when we bring back runtime gc: we should
			 * be unmarking the old key and then marking the new key
			 *
			 * Also: when we bring back runtime gc, locking
			 */
			gc->alive	= true;
			gc->sectors	= le16_to_cpu(new_s->sectors);
			gc->nr_blocks	= new_s->nr_blocks;
			gc->nr_redundant = new_s->nr_redundant;

			for (unsigned i = 0; i < new_s->nr_blocks; i++)
				gc->ptrs[i] = new_s->ptrs[i];

			/*
			 * gc recalculates this field from stripe ptr
			 * references:
			 */
			memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
		}

		if (new_s) {
			s64 sectors = le16_to_cpu(new_s->sectors);
			s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;

			struct bch_replicas_padded r;
			bch2_bkey_to_replicas(&r.e, new);
			int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
			struct disk_accounting_pos acc = {
				.type = BCH_DISK_ACCOUNTING_replicas,
			};
			bch2_bkey_to_replicas(&acc.replicas, new);
			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
			if (ret)
				return ret;

			if (gc)
				memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
		}

		if (old_s) {
			s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
			s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;

			struct bch_replicas_padded r;
			bch2_bkey_to_replicas(&r.e, old);
			int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
			struct disk_accounting_pos acc = {
				.type = BCH_DISK_ACCOUNTING_replicas,
			};
			bch2_bkey_to_replicas(&acc.replicas, old);
			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
			if (ret)
				return ret;
		}
@ -437,52 +474,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
		}
	}

	if (flags & BTREE_TRIGGER_gc) {
		struct gc_stripe *m =
			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);

		if (!m) {
			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
				idx);
			return -BCH_ERR_ENOMEM_mark_stripe;
		}
		/*
		 * This will be wrong when we bring back runtime gc: we should
		 * be unmarking the old key and then marking the new key
		 */
		m->alive	= true;
		m->sectors	= le16_to_cpu(new_s->sectors);
		m->nr_blocks	= new_s->nr_blocks;
		m->nr_redundant	= new_s->nr_redundant;

		for (unsigned i = 0; i < new_s->nr_blocks; i++)
			m->ptrs[i] = new_s->ptrs[i];

		bch2_bkey_to_replicas(&m->r.e, new);

		/*
		 * gc recalculates this field from stripe ptr
		 * references:
		 */
		memset(m->block_sectors, 0, sizeof(m->block_sectors));

		int ret = mark_stripe_buckets(trans, old, new, flags);
		if (ret)
			return ret;

		ret = bch2_update_replicas(c, new, &m->r.e,
					   ((s64) m->sectors * m->nr_redundant),
					   0, true);
		if (ret) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, new);
			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
			printbuf_exit(&buf);
			return ret;
		}
	}

	return 0;
}

@ -116,6 +116,9 @@
	x(ENOENT,		ENOENT_dev_idx_not_found)		\
	x(ENOTEMPTY,		ENOTEMPTY_dir_not_empty)		\
	x(ENOTEMPTY,		ENOTEMPTY_subvol_not_empty)		\
	x(EEXIST,		EEXIST_str_hash_set)			\
	x(EEXIST,		EEXIST_discard_in_flight_add)		\
	x(EEXIST,		EEXIST_subvolume_create)		\
	x(0,			open_buckets_empty)			\
	x(0,			freelist_empty)				\
	x(BCH_ERR_freelist_empty, no_buckets_found)			\
@ -254,7 +257,8 @@
	x(BCH_ERR_nopromote,	nopromote_no_writes)			\
	x(BCH_ERR_nopromote,	nopromote_enomem)			\
	x(0,			need_inode_lock)			\
	x(0,			invalid_snapshot_node)
	x(0,			invalid_snapshot_node)			\
	x(0,			option_needs_open_fs)

enum bch_errcode {
	BCH_ERR_START		= 2048,
@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_iter.h"
#include "error.h"
#include "journal.h"
#include "recovery_passes.h"
@ -97,7 +98,7 @@ static enum ask_yn parse_yn_response(char *buf)
}

#ifdef __KERNEL__
static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
{
struct stdio_redirect *stdio = c->stdio;

@ -107,25 +108,44 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
if (!stdio)
return YN_NO;

char buf[100];
if (trans)
bch2_trans_unlock(trans);

unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
darray_char line = {};
int ret;

do {
unsigned long t;
bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
rewait:
t = unlock_long_at
? max_t(long, unlock_long_at - jiffies, 0)
: MAX_SCHEDULE_TIMEOUT;

int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
if (r < 0)
return YN_NO;
buf[r] = '\0';
} while ((ret = parse_yn_response(buf)) < 0);
int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
if (r == -ETIME) {
bch2_trans_unlock_long(trans);
unlock_long_at = 0;
goto rewait;
}

if (r < 0) {
ret = YN_NO;
break;
}

darray_last(line) = '\0';
} while ((ret = parse_yn_response(line.data)) < 0);

darray_exit(&line);
return ret;
}
#else

#include "tools-util.h"

static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
{
char *buf = NULL;
size_t buflen = 0;
@ -191,7 +211,8 @@ static void prt_actioning(struct printbuf *out, const char *action)
prt_str(out, "ing");
}

int bch2_fsck_err(struct bch_fs *c,
int __bch2_fsck_err(struct bch_fs *c,
struct btree_trans *trans,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
const char *fmt, ...)
@ -203,6 +224,11 @@ int bch2_fsck_err(struct bch_fs *c,
int ret = -BCH_ERR_fsck_ignore;
const char *action_orig = "fix?", *action = action_orig;

if (!c)
c = trans->c;

WARN_ON(!trans && bch2_current_has_btree_trans(c));

if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
@ -297,7 +323,15 @@ int bch2_fsck_err(struct bch_fs *c,
bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;

int ask = bch2_fsck_ask_yn(c);
int ask = bch2_fsck_ask_yn(c, trans);

if (trans) {
ret = bch2_trans_relock(trans);
if (ret) {
mutex_unlock(&c->fsck_error_msgs_lock);
goto err;
}
}

if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO

@ -117,18 +117,21 @@ enum bch_fsck_flags {

#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)

__printf(4, 5) __cold
int bch2_fsck_err(struct bch_fs *,
__printf(5, 6) __cold
int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
enum bch_fsck_flags,
enum bch_sb_error_id,
const char *, ...);
#define bch2_fsck_err(c, _flags, _err_type, ...) \
__bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)

void bch2_flush_fsck_errs(struct bch_fs *);

#define __fsck_err(c, _flags, _err_type, ...) \
({ \
int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
__VA_ARGS__); \
\
int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
ret = _ret; \
@ -143,7 +146,12 @@ void bch2_flush_fsck_errs(struct bch_fs *);
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */

#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
(unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
({ \
if (type_is(c, struct bch_fs *)) \
WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
\
(unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
})

#define need_fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)

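The bch2_fsck_err() wrapper above dispatches on the static type of its first argument, passing it through as either the bch_fs or the btree_trans parameter of __bch2_fsck_err(). A sketch of how a type_is() helper can be built - this is an assumption about the implementation, shown only to make the macro readable:

/* Hypothetical stand-in for type_is(); the real helper lives elsewhere
 * in the tree. The builtin is evaluated at compile time, so the dead
 * branch of the ?: in bch2_fsck_err() costs nothing at runtime. */
#define type_is(ptr, type) \
	__builtin_types_compatible_p(typeof(ptr), type)

This is what lets the fsck.c hunks below mechanically swap the "c," argument for "trans," without introducing a second family of macros.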
@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
if (unlikely(ret))
goto err_put_write_ref;

if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
ret = -EINVAL;
goto err_put_write_ref;
}

inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);

@ -192,7 +192,9 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret;
int ret, err;

trace_bch2_fsync(file, datasync);

ret = file_write_and_wait_range(file, start, end);
if (ret)
@ -205,6 +207,11 @@ out:
ret = bch2_err_class(ret);
if (ret == -EROFS)
ret = -EIO;

err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;

return ret;
}

@ -860,9 +867,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
return -EINVAL;

if (remap_flags & REMAP_FILE_DEDUP)
return -EOPNOTSUPP;

if ((pos_src & (block_bytes(c) - 1)) ||
(pos_dst & (block_bytes(c) - 1)))
return -EINVAL;
@ -895,7 +899,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret)
goto err;

file_update_time(file_dst);
if (!(remap_flags & REMAP_FILE_DEDUP))
file_update_time(file_dst);

bch2_mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);

@ -272,6 +272,70 @@ err1:
return ret;
}

static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
{
return put_user(inode->v.i_generation, arg);
}

static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
{
int ret;
size_t len;
char label[BCH_SB_LABEL_SIZE];

BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);

mutex_lock(&c->sb_lock);
memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
mutex_unlock(&c->sb_lock);

len = strnlen(label, BCH_SB_LABEL_SIZE);
if (len == BCH_SB_LABEL_SIZE) {
bch_warn(c,
"label is too long, return the first %zu bytes",
--len);
}

ret = copy_to_user(user_label, label, len);

return ret ? -EFAULT : 0;
}

static int bch2_ioc_setlabel(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
const char __user *user_label)
{
int ret;
char label[BCH_SB_LABEL_SIZE];

if (!capable(CAP_SYS_ADMIN))
return -EPERM;

if (copy_from_user(label, user_label, sizeof(label)))
return -EFAULT;

if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
bch_err(c,
"unable to set label with more than %d bytes",
BCH_SB_LABEL_SIZE - 1);
return -EINVAL;
}

ret = mnt_want_write_file(file);
if (ret)
return ret;

mutex_lock(&c->sb_lock);
strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
mutex_unlock(&c->sb_lock);

ret = bch2_write_super(c);

mnt_drop_write_file(file);
return ret;
}

static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
{
u32 flags;
@ -373,7 +437,7 @@ retry:
}

if (dst_dentry->d_inode) {
error = -EEXIST;
error = -BCH_ERR_EEXIST_subvolume_create;
goto err3;
}

@ -506,13 +570,21 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;

case FS_IOC_GETVERSION:
ret = -ENOTTY;
ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
break;

case FS_IOC_SETVERSION:
ret = -ENOTTY;
break;

case FS_IOC_GETFSLABEL:
ret = bch2_ioc_getlabel(c, (void __user *) arg);
break;

case FS_IOC_SETFSLABEL:
ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
break;

case FS_IOC_GOINGDOWN:
ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
break;
@ -554,6 +626,12 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
case FS_IOC_GETFSLABEL:
case FS_IOC_SETFSLABEL:
break;
default:
return -ENOIOCTLCMD;
}

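Since FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL are the generic VFS label ioctls, the new handlers are reachable from any file descriptor on a mounted bcachefs. A minimal userspace sketch, assuming a filesystem mounted at /mnt (the mount point is an assumption; error handling trimmed):

#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFSLABEL, FSLABEL_MAX */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	char label[FSLABEL_MAX] = "";
	int fd = open("/mnt", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) < 0)
		return 1;
	printf("label: %s\n", label);
	close(fd);
	return 0;
}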
@ -26,11 +26,13 @@
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
@ -1680,6 +1682,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
struct bch_fs *c = sb->s_fs_info;
int ret;

trace_bch2_sync_fs(sb, wait);

if (c->opts.journal_flush_disabled)
return 0;

@ -1708,15 +1712,11 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}

static int bch2_remount(struct super_block *sb, int *flags, char *data)
static int bch2_remount(struct super_block *sb, int *flags,
struct bch_opts opts)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_opts opts = bch2_opts_empty();
int ret;

ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
goto err;
int ret = 0;

opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);

@ -1843,7 +1843,6 @@ static const struct super_operations bch_super_operations = {
.statfs = bch2_statfs,
.show_devname = bch2_show_devname,
.show_options = bch2_show_options,
.remount_fs = bch2_remount,
.put_super = bch2_put_super,
.freeze_fs = bch2_freeze,
.unfreeze_fs = bch2_unfreeze,
@ -1876,77 +1875,63 @@ static int bch2_test_super(struct super_block *s, void *data)
return true;
}

static struct dentry *bch2_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
static int bch2_fs_get_tree(struct fs_context *fc)
{
struct bch_fs *c;
struct super_block *sb;
struct inode *vinode;
struct bch_opts opts = bch2_opts_empty();
struct bch2_opts_parse *opts_parse = fc->fs_private;
struct bch_opts opts = opts_parse->opts;
darray_str devs;
darray_fs devs_to_fs = {};
int ret;

opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
opt_set(opts, nostart, true);

ret = bch2_parse_mount_opts(NULL, &opts, data);
if (ret) {
ret = bch2_err_class(ret);
return ERR_PTR(ret);
}
if (!fc->source || strlen(fc->source) == 0)
return -EINVAL;

if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);

darray_str devs;
ret = bch2_split_devs(dev_name, &devs);
ret = bch2_split_devs(fc->source, &devs);
if (ret)
return ERR_PTR(ret);
return ret;

darray_fs devs_to_fs = {};
darray_for_each(devs, i) {
ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
if (ret) {
sb = ERR_PTR(ret);
goto got_sb;
}
if (ret)
goto err;
}

sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
if (!IS_ERR(sb))
goto got_sb;

c = bch2_fs_open(devs.data, devs.nr, opts);
if (IS_ERR(c)) {
sb = ERR_CAST(c);
goto got_sb;
}
ret = PTR_ERR_OR_ZERO(c);
if (ret)
goto err;

/* Some options can't be parsed until after the fs is started: */
ret = bch2_parse_mount_opts(c, &opts, data);
if (ret) {
bch2_fs_stop(c);
sb = ERR_PTR(ret);
goto got_sb;
}
opts = bch2_opts_empty();
ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
if (ret)
goto err_stop_fs;

bch2_opts_apply(&c->opts, opts);

sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
if (IS_ERR(sb))
bch2_fs_stop(c);
ret = bch2_fs_start(c);
if (ret)
goto err_stop_fs;

sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
if (ret)
goto err_stop_fs;
got_sb:
darray_exit(&devs_to_fs);
bch2_darray_str_exit(&devs);

if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
ret = bch2_err_class(ret);
return ERR_PTR(ret);
}

c = sb->s_fs_info;

if (sb->s_root) {
if ((flags ^ sb->s_flags) & SB_RDONLY) {
if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
ret = -EBUSY;
goto err_put_super;
}
@ -2011,12 +1996,28 @@ got_sb:

sb->s_flags |= SB_ACTIVE;
out:
return dget(sb->s_root);
fc->root = dget(sb->s_root);
err:
darray_exit(&devs_to_fs);
bch2_darray_str_exit(&devs);
/*
* On an inconsistency error in recovery we might see an -EROFS derived
* errorcode (from the journal), but we don't want to return that to
* userspace as that causes util-linux to retry the mount RO - which is
* confusing:
*/
if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
ret = -EIO;
return bch2_err_class(ret);

err_stop_fs:
bch2_fs_stop(c);
goto err;

err_put_super:
__bch2_fs_stop(c);
deactivate_locked_super(sb);
return ERR_PTR(bch2_err_class(ret));
goto err;
}

static void bch2_kill_sb(struct super_block *sb)
@ -2027,12 +2028,76 @@ static void bch2_kill_sb(struct super_block *sb)
bch2_fs_free(c);
}

static void bch2_fs_context_free(struct fs_context *fc)
{
struct bch2_opts_parse *opts = fc->fs_private;

if (opts) {
printbuf_exit(&opts->parse_later);
kfree(opts);
}
}

static int bch2_fs_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
/*
* the "source" param, i.e., the name of the device(s) to mount,
* is handled by the VFS layer.
*/
if (!strcmp(param->key, "source"))
return -ENOPARAM;

struct bch2_opts_parse *opts = fc->fs_private;
struct bch_fs *c = NULL;

/* for reconfigure, we already have a struct bch_fs */
if (fc->root)
c = fc->root->d_sb->s_fs_info;

int ret = bch2_parse_one_mount_opt(c, &opts->opts,
&opts->parse_later, param->key,
param->string);

return bch2_err_class(ret);
}

static int bch2_fs_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct bch2_opts_parse *opts = fc->fs_private;

return bch2_remount(sb, &fc->sb_flags, opts->opts);
}

static const struct fs_context_operations bch2_context_ops = {
.free = bch2_fs_context_free,
.parse_param = bch2_fs_parse_param,
.get_tree = bch2_fs_get_tree,
.reconfigure = bch2_fs_reconfigure,
};

static int bch2_init_fs_context(struct fs_context *fc)
{
struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);

if (!opts)
return -ENOMEM;

opts->parse_later = PRINTBUF;

fc->ops = &bch2_context_ops;
fc->fs_private = opts;

return 0;
}

static struct file_system_type bcache_fs_type = {
.owner = THIS_MODULE,
.name = "bcachefs",
.mount = bch2_mount,
.kill_sb = bch2_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.owner = THIS_MODULE,
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};

MODULE_ALIAS_FS("bcachefs");

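To summarize the conversion above: bcachefs now implements the new mount API, so the VFS drives mounting through the fs_context_operations table instead of the old .mount/.remount_fs hooks. A rough sketch of the call order for a mount, using real fs_context entry points - orientation only, not code from this commit:

/* What the VFS does with the ops registered above, approximately: */
struct fs_context *fc = fs_context_for_mount(&bcache_fs_type, 0);
						/* -> bch2_init_fs_context() */
vfs_parse_fs_string(fc, "metadata_replicas", "2", 1);
						/* -> bch2_fs_parse_param() */
int ret = vfs_get_tree(fc);			/* -> bch2_fs_get_tree() */
put_fs_context(fc);				/* -> bch2_fs_context_free() */

Remount follows the same shape: mount(8) with MS_REMOUNT ends up in .reconfigure, which is why bch2_remount() now takes an already-parsed struct bch_opts rather than a raw option string.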
@ -455,33 +455,44 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
return 0;
}

static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked new_inode;
unsigned i_mode = S_IFREG;
u64 i_size = 0;

switch (btree) {
case BTREE_ID_extents: {
struct btree_iter iter = {};

bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
return ret;

i_size = k.k->p.offset << 9;
break;
}
case BTREE_ID_dirents:
i_mode = S_IFDIR;
break;
case BTREE_ID_xattrs:
break;
default:
BUG();
}

struct bch_inode_unpacked new_inode;
bch2_inode_init_early(c, &new_inode);
bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
new_inode.bi_size = size;
bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
new_inode.bi_size = i_size;
new_inode.bi_inum = inum;

return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
}

static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
{
struct btree_iter iter = {};

bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
return ret;

return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
}

struct snapshots_seen {
struct bpos pos;
snapshot_id_list ids;
@ -824,8 +835,8 @@ static int hash_check_key(struct btree_trans *trans,
break;

if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
hash_table_key_duplicate,
!desc.cmp_bkey(k, hash_k),
trans, hash_table_key_duplicate,
"duplicate hash table keys:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k),
@ -844,7 +855,7 @@ out:
printbuf_exit(&buf);
return ret;
bad_hash:
if (fsck_err(c, hash_table_key_wrong_offset,
if (fsck_err(trans, hash_table_key_wrong_offset,
"hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
@ -919,11 +930,11 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
return ret;

if (fsck_err_on(ret,
c, inode_points_to_missing_dirent,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
c, inode_points_to_wrong_dirent,
trans, inode_points_to_wrong_dirent,
"inode points to dirent that does not point back:\n%s",
(bch2_bkey_val_to_text(&buf, c, inode_k),
prt_newline(&buf),
@ -986,7 +997,7 @@ static int check_inode(struct btree_trans *trans,

if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
inode_d_type(prev) != inode_d_type(&u),
c, inode_snapshot_mismatch,
trans, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
return -BCH_ERR_fsck_repair_unimplemented;
@ -1018,7 +1029,8 @@ static int check_inode(struct btree_trans *trans,
if (ret < 0)
return ret;

fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
fsck_err_on(!ret,
trans, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
ret = 0;
@ -1026,7 +1038,7 @@ static int check_inode(struct btree_trans *trans,

if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,
fsck_err(trans, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
@ -1036,7 +1048,7 @@ static int check_inode(struct btree_trans *trans,

if (u.bi_flags & BCH_INODE_i_size_dirty &&
(!c->sb.clean ||
fsck_err(c, inode_i_size_dirty_but_clean,
fsck_err(trans, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
@ -1066,7 +1078,7 @@ static int check_inode(struct btree_trans *trans,

if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
(!c->sb.clean ||
fsck_err(c, inode_i_sectors_dirty_but_clean,
fsck_err(trans, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
s64 sectors;
@ -1101,7 +1113,7 @@ static int check_inode(struct btree_trans *trans,
if (fsck_err_on(u.bi_parent_subvol &&
(u.bi_subvol == 0 ||
u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
c, inode_bi_parent_nonzero,
trans, inode_bi_parent_nonzero,
"inode %llu:%u has subvol %u but nonzero parent subvol %u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
u.bi_parent_subvol = 0;
@ -1121,13 +1133,13 @@ static int check_inode(struct btree_trans *trans,
}

if (fsck_err_on(ret,
c, inode_bi_subvol_missing,
trans, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
k.k->p.snapshot),
c, inode_bi_subvol_wrong,
trans, inode_bi_subvol_wrong,
"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol,
le64_to_cpu(s.inode),

@ -1170,6 +1182,71 @@ int bch2_check_inodes(struct bch_fs *c)
return ret;
}

static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
switch (btree) {
case BTREE_ID_extents:
return S_ISREG(mode) || S_ISLNK(mode);
case BTREE_ID_dirents:
return S_ISDIR(mode);
case BTREE_ID_xattrs:
return true;
default:
BUG();
}
}

static int check_key_has_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct inode_walker *inode,
struct inode_walker_entry *i,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;

if (k.k->type == KEY_TYPE_whiteout)
goto out;

if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;

inode->last_pos.inode--;
ret = -BCH_ERR_transaction_restart_nested;
goto err;
}

if (fsck_err_on(!i,
trans, key_in_missing_inode,
"key in missing inode:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;

if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
trans, key_in_wrong_inode_type,
"key for wrong inode mode %o:\n %s",
i->inode.bi_mode,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;
out:
err:
fsck_err:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
delete:
ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
goto out;
}
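check_key_has_inode() consolidates three near-identical blocks that previously lived in check_extent(), check_dirent() and check_xattr(); each caller now reduces to the same shape, condensed here from the hunks that follow:

/* Condensed caller pattern; see the check_extent()/check_dirent()/
 * check_xattr() hunks below for the real call sites. */
i = walk_inode(trans, inode, k);
ret = check_key_has_inode(trans, iter, inode, i, k);
if (ret)
	goto err;
if (!i)
	goto out;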

static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
@ -1192,7 +1269,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
}

if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
c, inode_i_sectors_wrong,
trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
@ -1340,7 +1417,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
prt_printf(&buf, "\n overwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

if (fsck_err(c, extent_overlapping,
if (fsck_err(trans, extent_overlapping,
"overlapping extents%s", buf.buf)) {
struct btree_iter *old_iter = &iter1;
struct disk_reservation res = { 0 };
@ -1476,43 +1553,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}

ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
goto err;

i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
goto err;

ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
ret = check_key_has_inode(trans, iter, inode, i, k);
if (ret)
goto err;

if (k.k->type != KEY_TYPE_whiteout) {
if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;

inode->last_pos.inode--;
ret = -BCH_ERR_transaction_restart_nested;
goto err;
}

if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;

if (fsck_err_on(i &&
!S_ISREG(i->inode.bi_mode) &&
!S_ISLNK(i->inode.bi_mode),
c, extent_in_non_reg_inode,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;

ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
&inode->recalculate_sums);
if (ret)
@ -1525,7 +1579,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
* didn't have one, iterate over all inodes:
*/
if (!i)
i = inode->inodes.data + inode->inodes.nr - 1;
i = &darray_last(inode->inodes);

for (;
inode->inodes.data && i >= inode->inodes.data;
@ -1538,7 +1592,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
c, extent_past_end_of_inode,
trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@ -1574,9 +1628,6 @@ fsck_err:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
delete:
ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
goto out;
}

/*
@ -1656,7 +1707,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
}

if (fsck_err_on(i->inode.bi_nlink != i->count,
c, inode_dir_wrong_nlink,
trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
@ -1691,7 +1742,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return 0;

if (bch2_inode_should_have_bp(target) &&
!fsck_err(c, inode_wrong_backpointer,
!fsck_err(trans, inode_wrong_backpointer,
"dirent points to inode that does not point back:\n %s",
(bch2_bkey_val_to_text(&buf, c, d.s_c),
prt_printf(&buf, "\n "),
@ -1717,7 +1768,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
ret = 0;

if (fsck_err_on(!backpointer_exists,
c, inode_wrong_backpointer,
trans, inode_wrong_backpointer,
"inode %llu:%u has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
@ -1740,7 +1791,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
if (fsck_err_on(backpointer_exists &&
(S_ISDIR(target->bi_mode) ||
target->bi_subvol),
c, inode_dir_multiple_links,
trans, inode_dir_multiple_links,
"%s %llu:%u with multiple links\n%s",
S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
target->bi_inum, target_snapshot, buf.buf)) {
@ -1754,7 +1805,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
* it up, it ignores inodes with nlink 0
*/
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
c, inode_multiple_links_but_nlink_0,
trans, inode_multiple_links_but_nlink_0,
"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
@ -1789,7 +1840,7 @@ static int check_dirent_target(struct btree_trans *trans,
goto err;

if (fsck_err_on(d.v->d_type != inode_d_type(target),
c, dirent_d_type_wrong,
trans, dirent_d_type_wrong,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
@ -1886,11 +1937,12 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
parent_snapshot = d.k->p.snapshot;
}

if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
if (fsck_err_on(ret,
trans, dirent_to_missing_parent_subvol,
"dirent parent_subvol points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
c, dirent_not_visible_in_parent_subvol,
trans, dirent_not_visible_in_parent_subvol,
"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
parent_snapshot,
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
@ -1916,7 +1968,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
return ret;

if (ret) {
if (fsck_err(c, dirent_to_missing_subvol,
if (fsck_err(trans, dirent_to_missing_subvol,
"dirent points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
return __remove_dirent(trans, d.k->p);
@ -1925,7 +1977,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
}

if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
c, subvol_fs_path_parent_wrong,
trans, subvol_fs_path_parent_wrong,
"subvol with wrong fs_path_parent, should be be %u\n%s",
parent_subvol,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
@ -1953,7 +2005,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
}

if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
c, inode_bi_parent_wrong,
trans, inode_bi_parent_wrong,
"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
target_inum,
subvol_root.bi_parent_subvol, parent_subvol)) {

@ -2006,49 +2058,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}

BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);

i = walk_inode(trans, dir, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret < 0)
goto err;

if (dir->first_this_inode && dir->inodes.nr)
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;

if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;

dir->last_pos.inode--;
ret = -BCH_ERR_transaction_restart_nested;
ret = check_key_has_inode(trans, iter, dir, i, k);
if (ret)
goto err;
}

if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
goto out;
}

if (!i)
goto out;

if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
c, dirent_in_non_dir_inode,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
goto out;
}
if (dir->first_this_inode)
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;

ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
@ -2074,7 +2098,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;

if (fsck_err_on(!target->inodes.nr,
c, dirent_to_missing_inode,
trans, dirent_to_missing_inode,
"dirent points to missing inode:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
@ -2153,20 +2177,18 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;

if (inode->first_this_inode && inode->inodes.nr)
*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
inode->first_this_inode = false;

if (fsck_err_on(!i, c, xattr_in_missing_inode,
"xattr for missing inode %llu",
k.k->p.inode))
return bch2_btree_delete_at(trans, iter, 0);
ret = check_key_has_inode(trans, iter, inode, i, k);
if (ret)
return ret;

if (!i)
return 0;

if (inode->first_this_inode)
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;

ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
bch_err_fn(c, ret);
return ret;
}
@ -2204,7 +2226,7 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;

if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
"root subvol missing")) {
struct bkey_i_subvolume *root_subvol =
bch2_trans_kmalloc(trans, sizeof(*root_subvol));
@ -2230,10 +2252,11 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;

if (mustfix_fsck_err_on(ret, c, root_dir_missing,
if (mustfix_fsck_err_on(ret,
trans, root_dir_missing,
"root directory missing") ||
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
c, root_inode_not_dir,
trans, root_inode_not_dir,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
@ -2305,7 +2328,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
break;

if (fsck_err_on(!ret,
c, subvol_unreachable,
trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
buf.buf))) {
@ -2330,7 +2353,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
goto err;

if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
c, subvol_unreachable,
trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
buf.buf))) {
@ -2409,7 +2432,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino

if (bch2_err_matches(ret, ENOENT)) {
ret = 0;
if (fsck_err(c, inode_unreachable,
if (fsck_err(trans, inode_unreachable,
"unreachable inode\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, inode_k),
@ -2455,7 +2478,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode.bi_inum, snapshot);

if (fsck_err(c, dir_loop, "directory structure loop")) {
if (fsck_err(trans, dir_loop, "directory structure loop")) {
ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
if (ret)
@ -2661,7 +2684,6 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
struct nlink_table *links,
size_t *idx, u64 range_end)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct nlink *link = &links->d[*idx];
int ret = 0;
@ -2687,7 +2709,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
}

if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
c, inode_wrong_nlink,
trans, inode_wrong_nlink,
"inode %llu type %s has wrong i_nlink (%u, should be %u)",
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {

@ -8,6 +8,7 @@
#include "buckets.h"
#include "compress.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@ -550,6 +551,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x

bch2_printbuf_strip_trailing_newline(out);
printbuf_indent_sub(out, 2);
}

@ -596,39 +599,26 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);

if (flags & BTREE_TRIGGER_transactional) {
if (nr) {
int ret = bch2_replicas_deltas_realloc(trans, 0);
if (ret)
return ret;

trans->fs_usage_deltas->nr_inodes += nr;
}

bool old_deleted = bkey_is_deleted_inode(old);
bool new_deleted = bkey_is_deleted_inode(new.s_c);
if (old_deleted != new_deleted) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
new.k->p, new_deleted);
if (ret)
return ret;
}
}

if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
BUG_ON(!trans->journal_res.seq);

bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}

if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
if (ret)
return ret;
}

percpu_down_read(&c->mark_lock);
this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) -
(int) bkey_is_deleted_inode(old);
if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
new.k->p, deleted_delta > 0);
if (ret)
return ret;
}

return 0;
@ -1096,8 +1086,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
return ret;

ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
if (fsck_err_on(!bkey_is_inode(k.k), c,
deleted_inode_missing,
if (fsck_err_on(!bkey_is_inode(k.k),
trans, deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@ -1109,7 +1099,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (S_ISDIR(inode.bi_mode)) {
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
c, deleted_inode_is_dir,
trans, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@ -1117,15 +1107,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}

if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
deleted_inode_not_unlinked,
if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
trans, deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;

if (c->sb.clean &&
!fsck_err(c,
deleted_inode_but_clean,
!fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) {
ret = 0;

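The bch2_trigger_inode() rework above is representative of the new disk accounting layer on the trigger side: the separate transactional plumbing (bch2_replicas_deltas_realloc(), fs_usage_deltas->nr_inodes) and the percpu gc path (usage_gc->b.nr_inodes under mark_lock) both collapse into one bch2_disk_accounting_mod() call, with the gc flag merely selecting which copy of the counter receives the delta. Condensed from the hunk above:

/* Shared accounting path for transactional and gc triggers. */
s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);

if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_nr_inodes,
	};
	/* one u64 counter; the last argument routes gc deltas */
	int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1,
					   flags & BTREE_TRIGGER_gc);
	if (ret)
		return ret;
}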
@ -69,11 +69,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
u64 io_latency = time_after64(now, submit_time)
? now - submit_time
: 0;
u64 old, new, v = atomic64_read(latency);
u64 old, new;

old = atomic64_read(latency);
do {
old = v;

/*
* If the io latency was reasonably close to the current
* latency, skip doing the update and atomic operation - most of
@ -84,7 +83,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
break;

new = ewma_add(old, io_latency, 5);
} while ((v = atomic64_cmpxchg(latency, old, new)) != old);
} while (!atomic64_try_cmpxchg(latency, &old, new));

bch2_congested_acct(ca, io_latency, now, rw);

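This hunk and the journal hunks below are one mechanical conversion: atomic64_try_cmpxchg() returns a success boolean and, on failure, writes the value it actually observed back into 'old', so the shadow variable 'v' and the manual reload disappear. A side-by-side sketch of the two loop shapes, where counter is an atomic64_t * and compute_update() is a hypothetical placeholder for the body:

/* Before: the caller threads the observed value through 'v' by hand. */
u64 old, new, v = atomic64_read(counter);
do {
	old = v;
	new = compute_update(old);
} while ((v = atomic64_cmpxchg(counter, old, new)) != old);

/* After: failure reloads 'old' for us; one fewer variable, same semantics. */
u64 old = atomic64_read(counter), new;
do {
	new = compute_update(old);
} while (!atomic64_try_cmpxchg(counter, &old, new));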
|
@ -230,7 +230,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_buf *buf = journal_cur_buf(j);
|
||||
union journal_res_state old, new;
|
||||
u64 v = atomic64_read(&j->reservations.counter);
|
||||
unsigned sectors;
|
||||
|
||||
BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
|
||||
@ -238,15 +237,16 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
|
||||
|
||||
lockdep_assert_held(&j->lock);
|
||||
|
||||
old.v = atomic64_read(&j->reservations.counter);
|
||||
do {
|
||||
old.v = new.v = v;
|
||||
new.v = old.v;
|
||||
new.cur_entry_offset = closed_val;
|
||||
|
||||
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
|
||||
old.cur_entry_offset == new.cur_entry_offset)
|
||||
return;
|
||||
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
||||
old.v, new.v)) != old.v);
|
||||
} while (!atomic64_try_cmpxchg(&j->reservations.counter,
|
||||
&old.v, new.v));
|
||||
|
||||
if (!__journal_entry_is_open(old))
|
||||
return;
|
||||
@ -353,7 +353,6 @@ static int journal_entry_open(struct journal *j)
|
||||
((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
|
||||
union journal_res_state old, new;
|
||||
int u64s;
|
||||
u64 v;
|
||||
|
||||
lockdep_assert_held(&j->lock);
|
||||
BUG_ON(journal_entry_is_open(j));
|
||||
@ -432,9 +431,9 @@ static int journal_entry_open(struct journal *j)
|
||||
*/
|
||||
j->cur_entry_u64s = u64s;
|
||||
|
||||
v = atomic64_read(&j->reservations.counter);
|
||||
old.v = atomic64_read(&j->reservations.counter);
|
||||
do {
|
||||
old.v = new.v = v;
|
||||
new.v = old.v;
|
||||
|
||||
BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
|
||||
|
||||
@ -446,8 +445,8 @@ static int journal_entry_open(struct journal *j)
|
||||
|
||||
/* Handle any already added entries */
|
||||
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
|
||||
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
||||
old.v, new.v)) != old.v);
|
||||
} while (!atomic64_try_cmpxchg(&j->reservations.counter,
|
||||
&old.v, new.v));
|
||||
|
||||
if (nr_unwritten_journal_entries(j) == 1)
|
||||
mod_delayed_work(j->wq,
|
||||
|
@ -327,10 +327,10 @@ static inline int journal_res_get_fast(struct journal *j,
unsigned flags)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);

old.v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
new.v = old.v;

/*
* Check if there is still room in the current journal
@ -356,8 +356,8 @@ static inline int journal_res_get_fast(struct journal *j,

if (flags & JOURNAL_RES_GET_CHECK)
return 1;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
} while (!atomic64_try_cmpxchg(&j->reservations.counter,
&old.v, new.v));

res->ref = true;
res->idx = old.idx;

@ -1583,7 +1583,7 @@ static CLOSURE_CALLBACK(journal_write_done)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 v, seq = le64_to_cpu(w->data->seq);
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;

bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@ -1642,14 +1642,15 @@ static CLOSURE_CALLBACK(journal_write_done)
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);

v = atomic64_read(&j->reservations.counter);
old.v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
new.v = old.v;
BUG_ON(journal_state_count(new, new.unwritten_idx));
BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));

new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
} while (!atomic64_try_cmpxchg(&j->reservations.counter,
&old.v, new.v));

closure_wake_up(&w->wait);
completed = true;
@ -1847,8 +1848,14 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
}
}

if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
if (wb.wb) {
ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
if (ret) {
bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
bch2_err_str(ret));
return ret;
}
}

spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;

@ -94,8 +94,8 @@ static int bch2_check_lru_key(struct btree_trans *trans,
u64 idx;
int ret;

if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
lru_entry_to_invalid_bucket,
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos),
trans, lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
return bch2_btree_delete_at(trans, lru_iter, 0);
@ -125,7 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}

if (fsck_err(c, lru_entry_bad,
if (fsck_err(trans, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",

@ -547,6 +547,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}

bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots);
@ -804,7 +805,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;

unsigned sectors = btree_ptr_sectors_written(&b->key);
unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));

ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@ -920,7 +921,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;

if (!nr_good || nr_good >= replicas)
rcu_read_lock();
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ptr->cached &&
(!ca || !ca->mi.durability))
data_opts->kill_ptrs |= BIT(i);
i++;
}
rcu_read_unlock();

if (!data_opts->kill_ptrs &&
(!nr_good || nr_good >= replicas))
return false;

data_opts->target = 0;

@ -378,6 +378,10 @@ int bch2_opt_parse(struct bch_fs *c,
break;
case BCH_OPT_FN:
ret = opt->fn.parse(c, val, res, err);

if (ret == -BCH_ERR_option_needs_open_fs)
return ret;

if (ret < 0) {
if (err)
prt_printf(err, "%s: parse error",
@ -460,14 +464,81 @@ int bch2_opts_check_may_set(struct bch_fs *c)
return 0;
}

int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
struct printbuf *parse_later,
const char *name, const char *val)
{
struct printbuf err = PRINTBUF;
u64 v;
int ret, id;

id = bch2_mount_opt_lookup(name);

/* Check for the form "noopt", negation of a boolean opt: */
if (id < 0 &&
!val &&
!strncmp("no", name, 2)) {
id = bch2_mount_opt_lookup(name + 2);
val = "0";
}

/* Unknown options are ignored: */
if (id < 0)
return 0;

if (!(bch2_opt_table[id].flags & OPT_MOUNT))
goto bad_opt;

if (id == Opt_acl &&
!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
goto bad_opt;

if ((id == Opt_usrquota ||
id == Opt_grpquota) &&
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
goto bad_opt;

ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
prt_printf(parse_later, "%s=%s,", name, val);
if (parse_later->allocation_failure) {
ret = -ENOMEM;
goto out;
}

ret = 0;
goto out;
}

if (ret < 0)
goto bad_val;

if (opts)
bch2_opt_set_by_id(opts, id, v);

ret = 0;
goto out;

bad_opt:
pr_err("Bad mount option %s", name);
ret = -BCH_ERR_option_name;
goto out;

bad_val:
pr_err("Invalid mount option %s", err.buf);
ret = -BCH_ERR_option_value;

out:
printbuf_exit(&err);
return ret;
}

int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
char *options)
struct printbuf *parse_later, char *options)
{
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
int ret, id;
struct printbuf err = PRINTBUF;
u64 v;
int ret;

if (!options)
return 0;
@ -488,53 +559,16 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
name = strsep(&opt, "=");
val = opt;

id = bch2_mount_opt_lookup(name);

/* Check for the form "noopt", negation of a boolean opt: */
if (id < 0 &&
!val &&
!strncmp("no", name, 2)) {
id = bch2_mount_opt_lookup(name + 2);
val = "0";
}

/* Unknown options are ignored: */
if (id < 0)
continue;

if (!(bch2_opt_table[id].flags & OPT_MOUNT))
goto bad_opt;

if (id == Opt_acl &&
!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
goto bad_opt;

if ((id == Opt_usrquota ||
id == Opt_grpquota) &&
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
goto bad_opt;

ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
if (ret < 0)
goto bad_val;

bch2_opt_set_by_id(opts, id, v);
goto out;
}

ret = 0;
goto out;

bad_opt:
pr_err("Bad mount option %s", name);
ret = -BCH_ERR_option_name;
goto out;
bad_val:
pr_err("Invalid mount option %s", err.buf);
ret = -BCH_ERR_option_value;
goto out;
out:
kfree(copied_opts_start);
printbuf_exit(&err);
return ret;
}

@ -406,7 +406,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
OPT_FS|OPT_MOUNT, \
OPT_FS, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
@ -488,6 +488,13 @@ struct bch_opts {
#undef x
};

struct bch2_opts_parse {
struct bch_opts opts;

/* to save opts that can't be parsed before the FS is opened: */
struct printbuf parse_later;
};

static const __maybe_unused struct bch_opts bch2_opts_default = {
#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
._name##_defined = true, \
@ -566,7 +573,10 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,

int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
char *);

/* inode opts: */

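The intended flow for the new parse_later plumbing, as a minimal sketch (hypothetical caller; the real consumers are the mount path and cmd_fsck): parse once before the filesystem is open, stash anything that returned -BCH_ERR_option_needs_open_fs, then replay the stashed string against the open filesystem:

struct bch2_opts_parse p = { .opts = bch2_opts_empty(), .parse_later = PRINTBUF };

/* first pass: no struct bch_fs yet, FS-dependent options are deferred */
int ret = bch2_parse_mount_opts(NULL, &p.opts, &p.parse_later, options);
if (ret)
	return ret;

/* ...open the filesystem as c... */

/* second pass: replay the deferred "name=val," string against c */
ret = bch2_parse_mount_opts(c, &p.opts, NULL, p.parse_later.buf);
printbuf_exit(&p.parse_later);
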
@ -316,6 +316,20 @@ void bch2_prt_newline(struct printbuf *buf)
buf->cur_tabstop = 0;
}

void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
{
for (int p = out->pos - 1; p >= 0; --p) {
if (out->buf[p] == '\n') {
out->pos = p;
break;
}
if (out->buf[p] != ' ')
break;
}

printbuf_nul_terminate_reserved(out);
}

static void __prt_tab(struct printbuf *out)
{
int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));

@ -115,6 +115,7 @@ void bch2_printbuf_indent_add(struct printbuf *, unsigned);
void bch2_printbuf_indent_sub(struct printbuf *, unsigned);

void bch2_prt_newline(struct printbuf *);
void bch2_printbuf_strip_trailing_newline(struct printbuf *);
void bch2_prt_tab(struct printbuf *);
void bch2_prt_tab_rjust(struct printbuf *);

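A quick sketch of what the new helper does, assuming only the printbuf API shown above: it drops a trailing newline, plus any trailing spaces sitting after it, and re-terminates the buffer:

struct printbuf buf = PRINTBUF;

prt_printf(&buf, "one line\n");
bch2_printbuf_strip_trailing_newline(&buf);
/* buf.buf is now "one line" - the '\n' is gone, earlier text untouched */
printbuf_exit(&buf);
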
@ -10,6 +10,7 @@
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
@ -90,6 +91,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);

bch2_write_super(c);
@ -134,6 +136,45 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}

static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct journal_key *k)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
int ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto out;

struct bkey u;
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);

/* Has this delta already been applied to the btree? */
if (bversion_cmp(old.k->version, k->k->k.version) >= 0) {
ret = 0;
goto out;
}

struct bkey_i *new = k->k;
if (old.k->type == KEY_TYPE_accounting) {
new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto out;

bch2_accounting_accumulate(bkey_i_to_accounting(new),
bkey_s_c_to_accounting(old));
}

trans->journal_res.seq = k->journal_seq;

ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}

static int bch2_journal_replay_key(struct btree_trans *trans,
struct journal_key *k)
{
@ -184,6 +225,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
goto out;

if (k->k->k.type == KEY_TYPE_accounting) {
ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
goto out;
}

ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
bch2_trans_iter_exit(trans, &iter);
@ -221,6 +267,30 @@ int bch2_journal_replay(struct bch_fs *c)
move_gap(keys, keys->nr);
trans = bch2_trans_get(c);

/*
* Replay accounting keys first: we can't allow the write buffer to
* flush accounting keys until we're done
*/
darray_for_each(*keys, k) {
if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
continue;

cond_resched();

ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_skip_accounting_apply|
BCH_TRANS_COMMIT_no_journal_res,
bch2_journal_replay_accounting_key(trans, k));
if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
goto err;

k->overwritten = true;
}

set_bit(BCH_FS_accounting_replay_done, &c->flags);

/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
@ -241,9 +311,10 @@ int bch2_journal_replay(struct bch_fs *c)
commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
BUG_ON(!ret && !k->overwritten);
BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
@ -271,6 +342,7 @@ int bch2_journal_replay(struct bch_fs *c)

ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated
? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
: 0),
@ -280,7 +352,7 @@ int bch2_journal_replay(struct bch_fs *c)
if (ret)
goto err;

BUG_ON(!k->overwritten);
BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
}

/*
@ -349,45 +421,10 @@ static int journal_replay_entry_early(struct bch_fs *c,
container_of(entry, struct jset_entry_usage, entry);

switch (entry->btree_id) {
case BCH_FS_USAGE_reserved:
if (entry->level < BCH_REPLICAS_MAX)
c->usage_base->persistent_reserved[entry->level] =
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
le64_to_cpu(u->v));
atomic64_set(&c->key_version, le64_to_cpu(u->v));
break;
}

break;
}
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);

ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned nr_types = jset_entry_dev_usage_nr_types(u);

rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev));
if (ca)
for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
}
rcu_read_unlock();

break;
}
case BCH_JSET_ENTRY_blacklist: {
@ -448,8 +485,6 @@ static int journal_replay_early(struct bch_fs *c,
}
}

bch2_fs_usage_initialize(c);

return 0;
}

@ -804,6 +839,10 @@ use_clean:
if (ret)
goto err;

set_bit(BCH_FS_btree_running, &c->flags);

ret = bch2_sb_set_upgrade_extra(c);

ret = bch2_run_recovery_passes(c);
if (ret)
goto err;
@ -963,14 +1002,12 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);

c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_btree_running, &c->flags);
set_bit(BCH_FS_may_go_rw, &c->flags);

for (unsigned i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc_fake(c, i, 0);

for_each_member_device(c, ca)
bch2_dev_usage_init(ca);

ret = bch2_fs_journal_alloc(c);
if (ret)
goto err;
@ -980,12 +1017,21 @@ int bch2_fs_initialize(struct bch_fs *c)
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal, 1);
set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);

ret = bch2_fs_read_write_early(c);
if (ret)
goto err;

for_each_member_device(c, ca) {
ret = bch2_dev_usage_init(ca, false);
if (ret) {
bch2_dev_put(ca);
goto err;
}
}

/*
* Write out the superblock and journal buckets, now that we can do
* btree updates
@ -1019,7 +1065,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_pack(&packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;

ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "creating root directory");
if (ret)
goto err;

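The replay ordering above leans on accounting keys being pure deltas. Conceptually, bch2_accounting_accumulate() is element-wise addition of the counters plus a version bump - a sketch of the idea, not necessarily the exact helper in disk_accounting.h (nr_counters stands in for the real counter-count helper):

/* accounting values are arrays of u64 deltas; summing two deltas and
 * keeping the newer version makes replay idempotent: a journal entry
 * whose version is <= the btree key's version has already been applied */
for (unsigned i = 0; i < nr_counters; i++)
	dst->v.d[i] += src.v->d[i];
if (bversion_cmp(dst->k.version, src.k->version) < 0)
	dst->k.version = src.k->version;
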
@ -5,6 +5,7 @@
#include "backpointers.h"
#include "btree_gc.h"
#include "btree_node_scan.h"
#include "disk_accounting.h"
#include "ec.h"
#include "fsck.h"
#include "inode.h"
@ -192,6 +193,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c)
{
int ret = 0;

down_read(&c->state_lock);

for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
struct recovery_pass_fn *p = recovery_pass_fns + i;

@ -207,6 +210,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c)
break;
}

up_read(&c->state_lock);

return ret;
}

@ -15,6 +15,7 @@
#define BCH_RECOVERY_PASSES() \
x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(initialize_subvolumes, 2, 0) \

@ -171,7 +171,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
not_found:
BUG_ON(!(flags & BTREE_TRIGGER_check_repair));

if (fsck_err(c, reflink_p_to_missing_reflink_v,
if (fsck_err(trans, reflink_p_to_missing_reflink_v,
"pointer to missing indirect extent\n"
" %s\n"
" missing range %llu-%llu",

@ -2,6 +2,7 @@

#include "bcachefs.h"
#include "buckets.h"
#include "disk_accounting.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"
@ -243,145 +244,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
verify_replicas_entry(search);

return !search->nr_devs ||
(__replicas_has_entry(&c->replicas, search) &&
(likely((!c->replicas_gc.entries)) ||
__replicas_has_entry(&c->replicas_gc, search)));
}

bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
bool marked;

if (!search->nr_devs)
return true;

verify_replicas_entry(search);

percpu_down_read(&c->mark_lock);
marked = __replicas_has_entry(&c->replicas, search) &&
(likely((!c->replicas_gc.entries)) ||
__replicas_has_entry(&c->replicas_gc, search));
bool ret = bch2_replicas_marked_locked(c, search);
percpu_up_read(&c->mark_lock);

return marked;
}

static void __replicas_table_update(struct bch_fs_usage *dst,
struct bch_replicas_cpu *dst_r,
struct bch_fs_usage *src,
struct bch_replicas_cpu *src_r)
{
int src_idx, dst_idx;

*dst = *src;

for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
if (!src->replicas[src_idx])
continue;

dst_idx = __replicas_entry_idx(dst_r,
cpu_replicas_entry(src_r, src_idx));
BUG_ON(dst_idx < 0);

dst->replicas[dst_idx] = src->replicas[src_idx];
}
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
struct bch_replicas_cpu *dst_r,
struct bch_fs_usage __percpu *src_p,
struct bch_replicas_cpu *src_r)
{
unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
struct bch_fs_usage *dst, *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

preempt_disable();
dst = this_cpu_ptr(dst_p);
preempt_enable();

__replicas_table_update(dst, dst_r, src, src_r);
}

/*
* Resize filesystem accounting:
*/
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
struct bch_fs_usage_online *new_scratch = NULL;
struct bch_fs_usage __percpu *new_gc = NULL;
struct bch_fs_usage *new_base = NULL;
unsigned i, bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
sizeof(u64) * new_r->nr;
int ret = 0;

memset(new_usage, 0, sizeof(new_usage));

for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
sizeof(u64), GFP_KERNEL)))
goto err;

if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
!(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
(c->usage_gc &&
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
goto err;

for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (c->usage[i])
__replicas_table_update_pcpu(new_usage[i], new_r,
c->usage[i], &c->replicas);
if (c->usage_base)
__replicas_table_update(new_base, new_r,
c->usage_base, &c->replicas);
if (c->usage_gc)
__replicas_table_update_pcpu(new_gc, new_r,
c->usage_gc, &c->replicas);

for (i = 0; i < ARRAY_SIZE(new_usage); i++)
swap(c->usage[i], new_usage[i]);
swap(c->usage_base, new_base);
swap(c->usage_scratch, new_scratch);
swap(c->usage_gc, new_gc);
swap(c->replicas, *new_r);
out:
free_percpu(new_gc);
kfree(new_scratch);
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
free_percpu(new_usage[i]);
kfree(new_base);
return ret;
err:
bch_err(c, "error updating replicas table: memory allocation failure");
ret = -BCH_ERR_ENOMEM_replicas_table;
goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;

/* nr_inodes: */
journal_res_u64s +=
DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

/* key_version: */
journal_res_u64s +=
DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

/* persistent_reserved: */
journal_res_u64s +=
DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
BCH_REPLICAS_MAX;

for_each_cpu_replicas_entry(r, e)
journal_res_u64s +=
DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
e->nr_devs, sizeof(u64));
return journal_res_u64s;
}

noinline
@ -417,10 +298,6 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
if (ret)
goto err;

bch2_journal_entry_res_resize(&c->journal,
&c->replicas_journal_res,
reserve_journal_replicas(c, &new_r));
}

if (!new_r.entries &&
@ -435,7 +312,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
/* don't update in memory replicas until changes are persistent */
percpu_down_write(&c->mark_lock);
if (new_r.entries)
ret = replicas_table_update(c, &new_r);
swap(c->replicas, new_r);
if (new_gc.entries)
swap(new_gc, c->replicas_gc);
percpu_up_write(&c->mark_lock);
@ -457,20 +334,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
? 0 : bch2_mark_replicas_slowpath(c, r);
}

/* replicas delta list: */

int bch2_replicas_delta_list_mark(struct bch_fs *c,
struct replicas_delta_list *r)
{
struct replicas_delta *d = r->d;
struct replicas_delta *top = (void *) r->d + r->used;
int ret = 0;

for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
ret = bch2_mark_replicas(c, &d->r);
return ret;
}

/*
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
@ -484,8 +347,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
percpu_down_write(&c->mark_lock);

ret = ret ?:
bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
replicas_table_update(c, &c->replicas_gc);
bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
if (!ret)
swap(c->replicas, c->replicas_gc);

kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
@ -584,20 +448,26 @@ retry:
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);

if (e->data_type == BCH_DATA_journal ||
c->usage_base->replicas[i] ||
percpu_u64_get(&c->usage[0]->replicas[i]) ||
percpu_u64_get(&c->usage[1]->replicas[i]) ||
percpu_u64_get(&c->usage[2]->replicas[i]) ||
percpu_u64_get(&c->usage[3]->replicas[i]))
struct disk_accounting_pos k = {
.type = BCH_DISK_ACCOUNTING_replicas,
};

memcpy(&k.replicas, e, replicas_entry_bytes(e));

u64 v = 0;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), &v, 1);

if (e->data_type == BCH_DATA_journal || v)
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}

bch2_cpu_replicas_sort(&new);

ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
replicas_table_update(c, &new);
ret = bch2_cpu_replicas_to_sb_replicas(c, &new);

if (!ret)
swap(c->replicas, new);

kfree(new.entries);

@ -611,34 +481,6 @@ retry:
return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);

if (idx < 0) {
struct bch_replicas_cpu n;

n = cpu_replicas_add_entry(c, &c->replicas, r);
if (!n.entries)
return -BCH_ERR_ENOMEM_cpu_replicas;

ret = replicas_table_update(c, &n);
if (ret)
return ret;

kfree(n.entries);

idx = bch2_replicas_entry_idx(c, r);
BUG_ON(ret < 0);
}

c->usage_base->replicas[idx] = sectors;

return 0;
}

/* Replicas tracking - superblock: */

static int
@ -724,8 +566,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
bch2_cpu_replicas_sort(&new_r);

percpu_down_write(&c->mark_lock);

ret = replicas_table_update(c, &new_r);
swap(c->replicas, new_r);
percpu_up_write(&c->mark_lock);

kfree(new_r.entries);
@ -1027,10 +868,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
unsigned ret;

mutex_lock(&c->sb_lock);
ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
mutex_unlock(&c->sb_lock);

return ret;
@ -1038,25 +877,6 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)

void bch2_fs_replicas_exit(struct bch_fs *c)
{
unsigned i;

kfree(c->usage_scratch);
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
free_percpu(c->usage[i]);
kfree(c->usage_base);
kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);

mempool_exit(&c->replicas_delta_pool);
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
bch2_journal_entry_res_resize(&c->journal,
&c->replicas_journal_res,
reserve_journal_replicas(c, &c->replicas));

return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
REPLICAS_DELTA_LIST_MAX) ?:
replicas_table_update(c, &c->replicas);
}

@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs *,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);

bool bch2_replicas_marked_locked(struct bch_fs *,
struct bch_replicas_entry_v1 *);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry_v1 *);

static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
{
return (void *) d + replicas_entry_bytes(&d->r) + 8;
}

int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);

static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
@ -58,10 +53,6 @@ int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);

int bch2_replicas_set_usage(struct bch_fs *,
struct bch_replicas_entry_v1 *,
u64);

#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
@ -88,6 +79,5 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;

void bch2_fs_replicas_exit(struct bch_fs *);
int bch2_fs_replicas_init(struct bch_fs *);

#endif /* _BCACHEFS_REPLICAS_H */

@ -8,20 +8,4 @@ struct bch_replicas_cpu {
struct bch_replicas_entry_v1 *entries;
};

struct replicas_delta {
s64 delta;
struct bch_replicas_entry_v1 r;
} __packed;

struct replicas_delta_list {
unsigned size;
unsigned used;

struct {} memset_start;
u64 nr_inodes;
u64 persistent_reserved[BCH_REPLICAS_MAX];
struct {} memset_end;
struct replicas_delta d[];
};

#endif /* _BCACHEFS_REPLICAS_TYPES_H */

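The replicas-GC hunk above shows the read side of the new accounting scheme: address a counter with a disk_accounting_pos, translate it to a btree position, and read the in-memory value. The same helpers as in the hunk, pulled out as a sketch:

/* look up the live sectors counter for one replicas entry e */
struct disk_accounting_pos k = {
	.type = BCH_DISK_ACCOUNTING_replicas,
};
memcpy(&k.replicas, e, replicas_entry_bytes(e));

u64 v = 0;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), &v, 1);
/* v replaces the old c->usage_base->replicas[i] lookup */
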
@ -183,25 +183,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
percpu_down_read(&c->mark_lock);

if (!journal_seq) {
for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
} else {
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
}

{
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);

u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}

{
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
@ -212,49 +193,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}

for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);

u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_reserved;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
}

for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
struct jset_entry_data_usage, entry);

u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
"embedded variable length struct");
}

for_each_member_device(c, ca) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
container_of(jset_entry_init(end, b),
struct jset_entry_dev_usage, entry);

u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(ca->dev_idx);

for (unsigned i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
}
}

percpu_up_read(&c->mark_lock);

for (unsigned i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),

@ -54,9 +54,27 @@
BCH_FSCK_ERR_subvol_children_not_set) \
x(mi_btree_bitmap, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_btree_bitmap_not_marked)
BCH_FSCK_ERR_btree_bitmap_not_marked) \
x(disk_accounting_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)

#define DOWNGRADE_TABLE()
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
0) \
x(disk_accounting_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info), \
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
BCH_FSCK_ERR_fs_usage_hidden_wrong, \
BCH_FSCK_ERR_fs_usage_btree_wrong, \
BCH_FSCK_ERR_fs_usage_data_wrong, \
BCH_FSCK_ERR_fs_usage_cached_wrong, \
BCH_FSCK_ERR_fs_usage_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
BCH_FSCK_ERR_bkey_version_in_future)

struct upgrade_downgrade_entry {
u64 recovery_passes;
@ -80,6 +98,37 @@ UPGRADE_TABLE()
#undef x
};

static int have_stripes(struct bch_fs *c)
{
return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
}

int bch2_sb_set_upgrade_extra(struct bch_fs *c)
{
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = c->sb.version;
bool write_sb = false;
int ret = 0;

mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
(ret = have_stripes(c) > 0)) {
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
write_sb = true;
}

if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

return ret < 0 ? ret : 0;
}

void bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
@ -101,16 +150,12 @@ void bch2_sb_set_upgrade(struct bch_fs *c,
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(passes));

for (const u16 *e = i->errors;
e < i->errors + i->nr_errors;
e++) {
__set_bit(*e, c->sb.errors_silent);
ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64));
}
for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
__set_bit_le64(*e, ext->errors_silent);
}
}

#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x

@ -125,6 +170,33 @@ DOWNGRADE_TABLE()
#undef x
};

static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
{
struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
int ret = 0;

unsigned nr_errors = le16_to_cpu(dst->nr_errors);

switch (le16_to_cpu(dst->version)) {
case bcachefs_metadata_version_bucket_stripe_sectors:
if (have_stripes(c)) {
bytes += sizeof(dst->errors[0]) * 2;

ret = darray_make_room(table, bytes);
if (ret)
return ret;

__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, dst->recovery_passes);
dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
}
break;
}

dst->nr_errors = cpu_to_le16(nr_errors);
return ret;
}

static inline const struct bch_sb_field_downgrade_entry *
downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
{
@ -210,6 +282,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {

int bch2_sb_downgrade_update(struct bch_fs *c)
{
if (!test_bit(BCH_FS_btree_running, &c->flags))
return 0;

darray_char table = {};
int ret = 0;

@ -234,7 +309,14 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]);

table.nr += bytes;
downgrade_table_extra(c, &table);

if (!dst->recovery_passes[0] &&
!dst->recovery_passes[1] &&
!dst->nr_errors)
continue;

table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
}

struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);

@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;

int bch2_sb_downgrade_update(struct bch_fs *);
void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
int bch2_sb_set_upgrade_extra(struct bch_fs *);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);

#endif /* _BCACHEFS_SB_DOWNGRADE_H */

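__set_bit_le64(), which these hunks now use in place of the open-coded form removed from bch2_sb_set_upgrade(), amounts to the following (a sketch; the real helper lives with the superblock code):

static inline void __set_bit_le64_sketch(unsigned nr, __le64 *addr)
{
	/* set bit nr in an array of little-endian u64s */
	addr[nr / 64] |= cpu_to_le64(BIT_ULL(nr % 64));
}
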
@ -219,8 +219,8 @@
x(deleted_inode_is_dir, 213) \
x(deleted_inode_not_unlinked, 214) \
x(extent_overlapping, 215) \
x(extent_in_missing_inode, 216) \
x(extent_in_non_reg_inode, 217) \
x(key_in_missing_inode, 216) \
x(key_in_wrong_inode_type, 217) \
x(extent_past_end_of_inode, 218) \
x(dirent_empty_name, 219) \
x(dirent_val_too_big, 220) \
@ -273,7 +273,10 @@
x(sb_clean_entry_overrun, 267) \
x(btree_ptr_v2_written_0, 268) \
x(subvol_snapshot_bad, 269) \
x(subvol_inode_bad, 270)
x(subvol_inode_bad, 270) \
x(alloc_key_stripe_sectors_wrong, 271) \
x(accounting_mismatch, 272) \
x(accounting_replicas_not_marked, 273)

enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

@ -549,7 +549,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (fsck_err_on(ret ||
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
c, snapshot_tree_to_missing_snapshot,
trans, snapshot_tree_to_missing_snapshot,
"snapshot tree points to missing/incorrect snapshot:\n %s",
(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@ -562,19 +562,19 @@ static int check_snapshot_tree(struct btree_trans *trans,
goto err;

if (fsck_err_on(ret,
c, snapshot_tree_to_missing_subvol,
trans, snapshot_tree_to_missing_subvol,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
c, snapshot_tree_to_wrong_subvol,
trans, snapshot_tree_to_wrong_subvol,
"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
c, snapshot_tree_to_snapshot_subvol,
trans, snapshot_tree_to_snapshot_subvol,
"snapshot tree points to snapshot subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
@ -811,7 +811,7 @@ static int check_snapshot(struct btree_trans *trans,
}
} else {
if (fsck_err_on(s.subvol,
c, snapshot_should_not_have_subvol,
trans, snapshot_should_not_have_subvol,
"snapshot should not point to subvol:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@ -828,7 +828,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;

if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
if (fsck_err_on(!ret,
trans, snapshot_to_bad_snapshot_tree,
"snapshot points to missing/incorrect tree:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
@ -840,7 +841,7 @@ static int check_snapshot(struct btree_trans *trans,
real_depth = bch2_snapshot_depth(c, parent_id);

if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
c, snapshot_bad_depth,
trans, snapshot_bad_depth,
"snapshot with incorrect depth field, should be %u:\n %s",
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@ -856,7 +857,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;

if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
if (fsck_err_on(!ret,
trans, snapshot_bad_skiplist,
"snapshot with bad skiplist field:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@ -1018,7 +1020,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)

darray_for_each(*t, id) {
if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
c, snapshot_node_missing,
trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
@ -1050,8 +1052,8 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;

if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
bkey_in_missing_snapshot,
if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
trans, bkey_in_missing_snapshot,
"key in missing snapshot %s, delete?",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,

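The snapshot hunks above, and the subvolume hunks below, are one mechanical change: fsck_err_on() now takes the btree_trans rather than the bch_fs, i.e.

/* before */ fsck_err_on(cond, c, some_fsck_err, "msg", ...)
/* after  */ fsck_err_on(cond, trans, some_fsck_err, "msg", ...)

(some_fsck_err standing in for any error id from sb-errors_format.h).
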
@ -300,7 +300,7 @@ not_found:
if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
} else if (found && (flags & STR_HASH_must_create)) {
ret = -EEXIST;
ret = -BCH_ERR_EEXIST_str_hash_set;
} else {
if (!found && slot.path)
swap(iter, slot);

@ -57,7 +57,7 @@ static int check_subvol(struct btree_trans *trans,

if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
subvol.v->fs_path_parent,
c, subvol_root_fs_path_parent_nonzero,
trans, subvol_root_fs_path_parent_nonzero,
"root subvolume has nonzero fs_path_parent\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct bkey_i_subvolume *n =
@ -80,7 +80,7 @@ static int check_subvol(struct btree_trans *trans,
goto err;

if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
c, subvol_children_not_set,
trans, subvol_children_not_set,
"subvolume not set in subvolume_children btree at %llu:%llu\n%s",
pos.inode, pos.offset,
(printbuf_reset(&buf),
@ -101,7 +101,8 @@ static int check_subvol(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;

if (fsck_err_on(ret, c, subvol_to_missing_root,
if (fsck_err_on(ret,
trans, subvol_to_missing_root,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
@ -111,7 +112,7 @@ static int check_subvol(struct btree_trans *trans,
}

if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
c, subvol_root_wrong_bi_subvol,
trans, subvol_root_wrong_bi_subvol,
"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
inode.bi_inum, inode_iter.k.p.snapshot,
inode.bi_subvol, subvol.k->p.offset)) {
@ -139,7 +140,7 @@ static int check_subvol(struct btree_trans *trans,
return ret;

if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
c, subvol_not_master_and_not_snapshot,
trans, subvol_not_master_and_not_snapshot,
"subvolume %llu is not set as snapshot but is not master subvolume",
k.k->p.offset)) {
struct bkey_i_subvolume *s =
@ -173,7 +174,6 @@ static int check_subvol_child(struct btree_trans *trans,
struct btree_iter *child_iter,
struct bkey_s_c child_k)
{
struct bch_fs *c = trans->c;
struct bch_subvolume s;
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
0, subvolume, &s);
@ -182,7 +182,7 @@ static int check_subvol_child(struct btree_trans *trans,

if (fsck_err_on(ret ||
le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
c, subvol_children_bad,
trans, subvol_children_bad,
"incorrect entry in subvolume_children btree %llu:%llu",
child_k.k->p.inode, child_k.k->p.offset)) {
ret = bch2_btree_delete_at(trans, child_iter, 0);
@ -630,9 +630,9 @@ int bch2_initialize_subvolumes(struct bch_fs *c)
root_volume.v.snapshot = cpu_to_le32(U32_MAX);
root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);

ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
bch_err_fn(c, ret);
return ret;
}

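These call sites pick up the headline change of this update: bch2_btree_insert() now takes btree iter flags after the commit flags, and callers with no special iterator needs pass 0. A sketch of the new call shape (BTREE_ITER_cached as an assumed example flag):

/* commit one key; no commit flags, no iter flags */
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, 0, 0);

/* iter flags can be passed through when wanted, e.g. */
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, 0, BTREE_ITER_cached);
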
@ -1310,15 +1310,15 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,

prt_printf(out, "Device index:\t%u\n", sb->dev_idx);

prt_str(out, "Label:\t");
prt_printf(out, "Label:\t");
prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);

prt_str(out, "Version:\t");
prt_printf(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);

prt_str(out, "Version upgrade complete:\t");
prt_printf(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);

@ -25,6 +25,7 @@
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@ -88,6 +89,19 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};

void bch2_print_str(struct bch_fs *c, const char *str)
{
#ifdef __KERNEL__
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

if (unlikely(stdio)) {
bch2_stdio_redirect_printf(stdio, true, "%s", str);
return;
}
#endif
bch2_print_string_as_lines(KERN_ERR, str);
}

__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
@ -222,22 +236,6 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
return c;
}

static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
unsigned nr = 0, u64s =
((sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
sizeof(u64);

rcu_read_lock();
for_each_member_device_rcu(c, ca, NULL)
nr++;
rcu_read_unlock();

bch2_journal_entry_res_resize(&c->journal,
&c->dev_usage_journal_res, u64s * nr);
}

/* Filesystem RO/RW: */

/*
@ -376,6 +374,7 @@ void bch2_fs_read_only(struct bch_fs *c)
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch2_verify_accounting_clean(c);

bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
@ -537,6 +536,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_fs_allocator_background_exit(c);
bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@ -569,6 +569,7 @@ static void __bch2_fs_free(struct bch_fs *c)

darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
free_percpu(c->usage);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@ -762,7 +763,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_waitqueue_head(&c->ro_ref_wait);
sema_init(&c->online_fsck_mutex, 1);

init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
c->journal_keys.initial_ref_held = true;
@ -785,8 +785,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)

INIT_LIST_HEAD(&c->list);

mutex_init(&c->usage_scratch_lock);

mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->snapshot_table_lock);
init_rwsem(&c->snapshot_create_lock);
@ -892,6 +890,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
@ -907,7 +906,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
bch2_fs_replicas_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_iter_init(c) ?:
@ -927,17 +925,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;

for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_member_exists(c->disk_sb.sb, i) &&
bch2_dev_alloc(c, i)) {
ret = -EEXIST;
for (i = 0; i < c->sb.nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
ret = bch2_dev_alloc(c, i);
if (ret)
goto err;
}
}

bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
bch2_dev_usage_journal_reserve(c);
bch2_journal_entry_res_resize(&c->journal,
&c->clock_journal_res,
(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
@ -1603,7 +1601,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_norun, NULL);
BTREE_TRIGGER_norun, NULL) ?:
bch2_dev_usage_remove(c, ca->dev_idx);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@ -1640,6 +1639,16 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (ret)
goto err;

/*
* We need to flush the entire journal to get rid of keys that reference
* the device being removed before removing the superblock entry
*/
bch2_journal_flush_all_pins(&c->journal);

/*
* this is really just needed for the bch2_replicas_gc_(start|end)
* calls, and could be cleaned up:
*/
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
@ -1682,17 +1691,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)

bch2_dev_free(ca);

/*
* At this point the device object has been removed in-core, but the
* on-disk journal might still refer to the device index via sb device
* usage entries. Recovery fails if it sees usage information for an
* invalid device. Flush journal pins to push the back of the journal
* past now invalid device index references before we update the
* superblock, but after the device object has been removed so any
* further journal writes elide usage info for the device.
*/
bch2_journal_flush_all_pins(&c->journal);

/*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
@ -1705,8 +1703,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)

mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);

bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_rw &&
@ -1754,8 +1750,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
}

bch2_dev_usage_init(ca);

ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret)
goto err;
@ -1837,7 +1831,9 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

bch2_dev_usage_journal_reserve(c);
ret = bch2_dev_usage_init(ca, false);
if (ret)
goto err_late;

ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(ca, ret, "marking new superblock");
@ -2009,15 +2005,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);

if (ca->mi.freespace_initialized) {
ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_dev_data_type,
.dev_data_type.dev = ca->dev_idx,
.dev_data_type.data_type = BCH_DATA_free,
};
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };

ret = bch2_trans_do(ca->fs, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
goto err;

/*
* XXX: this is all wrong transactionally - we'll be able to do
* this correctly after the disk space accounting rewrite
*/
ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
}

bch2_recalc_capacity(c);
@ -2029,6 +2028,9 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
if (!strncmp(name, "/dev/", strlen("/dev/")))
name += strlen("/dev/");

for_each_member_device(c, ca)
if (!strcmp(name, ca->name))
return ca;

@ -21,6 +21,7 @@
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "compress.h"
|
||||
#include "disk_accounting.h"
|
||||
#include "disk_groups.h"
|
||||
#include "ec.h"
|
||||
#include "inode.h"
|
||||
@ -198,6 +199,8 @@ read_attribute(disk_groups);
|
||||
|
||||
read_attribute(has_data);
|
||||
read_attribute(alloc_debug);
|
||||
read_attribute(accounting);
|
||||
read_attribute(usage_base);
|
||||
|
||||
#define x(t, n, ...) read_attribute(t);
|
||||
BCH_PERSISTENT_COUNTERS()
|
||||
@ -251,91 +254,42 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
|
||||
|
||||
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
|
||||
{
|
||||
struct btree_trans *trans;
|
||||
enum btree_id id;
|
||||
struct compression_type_stats {
|
||||
u64 nr_extents;
|
||||
u64 sectors_compressed;
|
||||
u64 sectors_uncompressed;
|
||||
} s[BCH_COMPRESSION_TYPE_NR];
|
||||
u64 compressed_incompressible = 0;
|
||||
int ret = 0;
|
||||
|
||||
memset(s, 0, sizeof(s));
|
||||
|
||||
if (!test_bit(BCH_FS_started, &c->flags))
|
||||
return -EPERM;
|
||||
|
||||
trans = bch2_trans_get(c);
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
if (!btree_type_has_ptrs(id))
|
||||
continue;
|
||||
|
||||
ret = for_each_btree_key(trans, iter, id, POS_MIN,
|
||||
BTREE_ITER_all_snapshots, k, ({
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
const union bch_extent_entry *entry;
|
||||
bool compressed = false, incompressible = false;
|
||||
|
||||
bkey_for_each_crc(k.k, ptrs, crc, entry) {
|
||||
incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
|
||||
compressed |= crc_is_compressed(crc);
|
||||
|
||||
if (crc_is_compressed(crc)) {
|
||||
s[crc.compression_type].nr_extents++;
|
||||
s[crc.compression_type].sectors_compressed += crc.compressed_size;
|
||||
s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
|
||||
}
|
||||
}
|
||||
|
||||
compressed_incompressible += compressed && incompressible;
|
||||
|
||||
if (!compressed) {
|
||||
unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
|
||||
|
||||
s[t].nr_extents++;
|
||||
s[t].sectors_compressed += k.k->size;
|
||||
s[t].sectors_uncompressed += k.k->size;
|
||||
}
|
||||
0;
|
||||
}));
|
||||
}
|
||||
|
||||
bch2_trans_put(trans);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
prt_str(out, "type");
|
||||
printbuf_tabstop_push(out, 12);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 24);
|
||||
prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
|
||||
for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
|
||||
struct disk_accounting_pos a = {
|
||||
.type = BCH_DISK_ACCOUNTING_compression,
|
||||
.compression.type = i,
|
||||
};
|
||||
struct bpos p = disk_accounting_pos_to_bpos(&a);
|
||||
u64 v[3];
|
||||
bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
|
||||
|
||||
u64 nr_extents = v[0];
|
||||
u64 sectors_uncompressed = v[1];
|
||||
u64 sectors_compressed = v[2];
|
||||
|
||||
bch2_prt_compression_type(out, i);
|
||||
prt_tab(out);
|
||||
|
||||
prt_human_readable_u64(out, s[i].sectors_compressed << 9);
|
||||
prt_human_readable_u64(out, sectors_compressed << 9);
|
||||
prt_tab_rjust(out);
|
||||
|
||||
prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
|
||||
prt_human_readable_u64(out, sectors_uncompressed << 9);
|
||||
prt_tab_rjust(out);
|
||||
|
||||
prt_human_readable_u64(out, s[i].nr_extents
|
||||
? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
|
||||
prt_human_readable_u64(out, nr_extents
|
||||
? div_u64(sectors_uncompressed << 9, nr_extents)
|
||||
: 0);
|
||||
prt_tab_rjust(out);
|
||||
prt_newline(out);
|
||||
}
|
||||
|
||||
if (compressed_incompressible) {
|
||||
prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
|
||||
prt_newline(out);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -346,6 +300,20 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
	prt_printf(out, "\n");
}

static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_fs_usage_base b = {};

	acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));

	prt_printf(out, "hidden:\t\t%llu\n",	b.hidden);
	prt_printf(out, "btree:\t\t%llu\n",	b.btree);
	prt_printf(out, "data:\t\t%llu\n",	b.data);
	prt_printf(out, "cached:\t%llu\n",	b.cached);
	prt_printf(out, "reserved:\t\t%llu\n",	b.reserved);
	prt_printf(out, "nr_inodes:\t%llu\n",	b.nr_inodes);
}

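The acc_u64s_percpu() call above works because struct bch_fs_usage_base is nothing but u64 counters, so the whole struct can be summed across CPUs as one flat array starting from its first member. A rough userspace analogue of that trick (the struct and values here are illustrative; acc_u64s() itself appears at the end of this diff, in util.h):

	#include <stdio.h>
	#include <stdint.h>

	/* stands in for struct bch_fs_usage_base: u64 counters only, no padding */
	struct usage {
		uint64_t hidden, btree, data, cached, reserved, nr_inodes;
	};

	/* same shape as bcachefs's acc_u64s(): add nr counters from src into acc */
	static void acc_u64s(uint64_t *acc, const uint64_t *src, unsigned nr)
	{
		for (unsigned i = 0; i < nr; i++)
			acc[i] += src[i];
	}

	int main(void)
	{
		struct usage percpu[2] = {
			{ .btree = 1, .data = 10 },
			{ .btree = 2, .data = 20 },
		};
		struct usage total = {0};

		for (unsigned cpu = 0; cpu < 2; cpu++)
			acc_u64s(&total.hidden, &percpu[cpu].hidden,
				 sizeof(total) / sizeof(uint64_t));

		printf("btree=%llu data=%llu\n",
		       (unsigned long long) total.btree,
		       (unsigned long long) total.data);
		return 0;
	}
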
SHOW(bch2_fs)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -429,6 +397,12 @@ SHOW(bch2_fs)
	if (attr == &sysfs_alloc_debug)
		bch2_fs_alloc_debug_to_text(out, c);

	if (attr == &sysfs_accounting)
		bch2_fs_accounting_to_text(out, c);

	if (attr == &sysfs_usage_base)
		bch2_fs_usage_base_to_text(out, c);

	return 0;
}

@ -633,6 +607,8 @@ struct attribute *bch2_fs_internal_files[] = {

	&sysfs_disk_groups,
	&sysfs_alloc_debug,
	&sysfs_accounting,
	&sysfs_usage_base,
	NULL
};

@ -121,7 +121,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
		ck.k.p.offset = i;
		ck.k.p.snapshot = U32_MAX;

		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
		bch_err_msg(c, ret, "insert error");
		if (ret)
			return ret;
@ -176,7 +176,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
		ck.k.p.snapshot = U32_MAX;
		ck.k.size = 8;

		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
		bch_err_msg(c, ret, "insert error");
		if (ret)
			return ret;
@ -232,7 +232,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
		ck.k.p.offset = i * 2;
		ck.k.p.snapshot = U32_MAX;

		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
		bch_err_msg(c, ret, "insert error");
		if (ret)
			return ret;
@ -292,7 +292,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
		ck.k.p.snapshot = U32_MAX;
		ck.k.size = 8;

		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
		bch_err_msg(c, ret, "insert error");
		if (ret)
			return ret;
@ -396,7 +396,7 @@ static int insert_test_extent(struct bch_fs *c,
	k.k_i.k.size = end - start;
	k.k_i.k.version.lo = test_version++;

	ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
	ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
	bch_err_fn(c, ret);
	return ret;
}
@ -481,7 +481,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)

	bkey_cookie_init(&cookie.k_i);
	cookie.k.p.snapshot = snapid_hi;
	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
	if (ret)
		return ret;

@ -506,7 +506,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)

	bkey_cookie_init(&cookie.k_i);
	cookie.k.p.snapshot = U32_MAX;
	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
	if (ret)
		return ret;

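These hunks are the mechanical fallout of the change named in the commit subject: bch2_btree_insert() gained a trailing btree iter flags argument, and every test caller passes 0 for it. A hedged sketch of a caller that actually uses the new parameter (the flag choice is illustrative; BTREE_ITER_all_snapshots is a flag that appears earlier in this diff, and whether a given call site wants it depends on context):

	/* Sketch: assumes bcachefs kernel headers and an initialized cookie key. */
	static int insert_cookie(struct bch_fs *c, struct bkey_i_cookie *cookie)
	{
		return bch2_btree_insert(c, BTREE_ID_xattrs, &cookie->k_i,
					 NULL,				/* no disk reservation */
					 0,				/* transaction commit flags */
					 BTREE_ITER_all_snapshots);	/* new: btree iter flags */
	}
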
@ -67,9 +67,14 @@ err:

/* stdio_redirect */

static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
{
	return stdio->input.buf.nr > seen || stdio->done;
}

static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
	return stdio->input.buf.nr || stdio->done;
	return stdio_redirect_has_more_input(stdio, 0);
}

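The seen parameter exists so a reader that has already scanned the buffer without finding a newline can sleep until more bytes arrive, rather than waking repeatedly because the buffer is merely non-empty; stdio_redirect_has_input() is now just the seen == 0 special case of the same predicate.
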
static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
@ -181,9 +186,13 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
	}

	spin_lock(&buf->lock);
	if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
		darray_make_room_gfp(&buf->buf,
			min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
	size_t makeroom = b;
	if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
		makeroom = min_t(ssize_t, makeroom,
				 max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
				       0));
	darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);

	b = min(len, darray_room(buf->buf));

	if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {

@ -355,43 +364,67 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le
	return ret;
}

int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
					 darray_char *line,
					 unsigned long timeout)
{
	unsigned long until = jiffies + timeout, t;
	struct stdio_buf *buf = &stdio->input;
	size_t copied = 0;
	ssize_t ret = 0;
	size_t seen = 0;
again:
	do {
		wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
				   sysctl_hung_task_timeout_secs * HZ / 2);
	} while (!stdio_redirect_has_input(stdio));
	t = timeout != MAX_SCHEDULE_TIMEOUT
		? max_t(long, until - jiffies, 0)
		: timeout;

	if (stdio->done) {
		ret = -1;
		goto out;
	}
	t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);

	wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);

	if (stdio->done)
		return -1;

	spin_lock(&buf->lock);
	size_t b = min(len, buf->buf.nr);
	char *n = memchr(buf->buf.data, '\n', b);
	if (n)
		b = min_t(size_t, b, n + 1 - buf->buf.data);
	seen = buf->buf.nr;
	char *n = memchr(buf->buf.data, '\n', seen);

	if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
		spin_unlock(&buf->lock);
		return -ETIME;
	}

	if (!n) {
		buf->waiting_for_line = true;
		spin_unlock(&buf->lock);
		goto again;
	}

	size_t b = n + 1 - buf->buf.data;
	if (b > line->size) {
		spin_unlock(&buf->lock);
		int ret = darray_resize(line, b);
		if (ret)
			return ret;
		seen = 0;
		goto again;
	}

	buf->buf.nr -= b;
	memcpy(ubuf, buf->buf.data, b);
	memcpy(line->data, buf->buf.data, b);
	memmove(buf->buf.data,
		buf->buf.data + b,
		buf->buf.nr);
	ubuf += b;
	len -= b;
	copied += b;
	line->nr = b;

	buf->waiting_for_line = false;
	spin_unlock(&buf->lock);

	wake_up(&buf->wait);
	return 0;
}

	if (!n && len)
		goto again;
out:
	return copied ?: ret;
int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
{
	return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
}

__printf(3, 0)

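For callers, the new interface hands back a complete line in a darray_char and distinguishes timeout from EOF. A usage sketch (process_line() and the 10-second timeout are illustrative; the return conventions, -ETIME on timeout, -1 when the other end is done, 0 on success, come from the hunk above):

	/* Sketch: assumes bcachefs kernel headers. */
	static int handle_input(struct stdio_redirect *stdio)
	{
		darray_char line = {};
		int ret;

		while (!(ret = bch2_stdio_redirect_readline_timeout(stdio, &line, 10 * HZ))) {
			/* line.data holds line.nr bytes, ending with the '\n' */
			process_line(line.data, line.nr);	/* hypothetical consumer */
		}

		darray_exit(&line);
		return ret == -1 ? 0 : ret;	/* -1 signals clean shutdown */
	}
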
@ -71,7 +71,9 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *,
int bch2_run_thread_with_stdout(struct thread_with_stdio *,
				const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);

int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);

__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);

@ -8,15 +8,12 @@ struct stdio_buf {
	spinlock_t		lock;
	wait_queue_head_t	wait;
	darray_char		buf;
	bool			waiting_for_line;
};

struct stdio_redirect {
	struct stdio_buf	input;
	struct stdio_buf	output;

	spinlock_t		input_lock;
	wait_queue_head_t	input_wait;
	darray_char		input_buf;
	bool			done;
};

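Two things happen in this header: struct stdio_buf gains the waiting_for_line flag, and the leftover input_lock/input_wait/input_buf fields, superseded by the per-direction stdio_buf, are dropped from struct stdio_redirect. waiting_for_line is the handshake between the read and write paths above: the reader sets it under buf->lock when it blocks without a complete line buffered, the writer then lets the buffer grow past STDIO_REDIRECT_BUFSIZE so the line can finish, and the reader clears it once a line is consumed.
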
@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str,

	TP_fast_assign(
		__entry->dev		= c->dev;
		__assign_str(str, str);
		__assign_str(str);
	),

	TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str,
		__entry->dev		= trans->c->dev;
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip	= caller_ip;
		__assign_str(str, str);
		__assign_str(str);
	),

	TP_printk("%d,%d %s %pS %s",
@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller,
	TP_fast_assign(
		__entry->dev		= trans->c->dev;
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__assign_str(str, str);
		__assign_str(str);
	),

	TP_printk("%d,%d %s %s",
@ -200,6 +200,56 @@ DECLARE_EVENT_CLASS(bio,
		  (unsigned long long)__entry->sector, __entry->nr_sector)
);

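These three hunks track an upstream kernel tracing change: __assign_str() now derives the source string from the field name itself, so the second argument was removed and existing tracepoints shed the redundant ", str".
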
/* fs.c: */
TRACE_EVENT(bch2_sync_fs,
	TP_PROTO(struct super_block *sb, int wait),

	TP_ARGS(sb, wait),

	TP_STRUCT__entry(
		__field(	dev_t,	dev	)
		__field(	int,	wait	)

	),

	TP_fast_assign(
		__entry->dev	= sb->s_dev;
		__entry->wait	= wait;
	),

	TP_printk("dev %d,%d wait %d",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->wait)
);

/* fs-io.c: */
TRACE_EVENT(bch2_fsync,
	TP_PROTO(struct file *file, int datasync),

	TP_ARGS(file, datasync),

	TP_STRUCT__entry(
		__field(	dev_t,	dev		)
		__field(	ino_t,	ino		)
		__field(	ino_t,	parent		)
		__field(	int,	datasync	)
	),

	TP_fast_assign(
		struct dentry *dentry = file->f_path.dentry;

		__entry->dev		= dentry->d_sb->s_dev;
		__entry->ino		= d_inode(dentry)->i_ino;
		__entry->parent		= d_inode(dentry->d_parent)->i_ino;
		__entry->datasync	= datasync;
	),

	TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long) __entry->ino,
		  (unsigned long) __entry->parent, __entry->datasync)
);

/* super-io.c: */
TRACE_EVENT(write_super,
	TP_PROTO(struct bch_fs *c, unsigned long ip),

@ -36,15 +36,14 @@ static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
{
	long i = s ? 1 : -1;
	long v = atomic_long_read(&lock->v), old;
	long old;

	old = atomic_long_read(&lock->v);
	do {
		old = v;

		if (i > 0 ? v < 0 : v > 0)
		if (i > 0 ? old < 0 : old > 0)
			return false;
	} while ((v = atomic_long_cmpxchg_acquire(&lock->v,
					old, old + i)) != old);
	} while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));

	return true;
}

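The conversion above is the standard cmpxchg-to-try_cmpxchg cleanup: try_cmpxchg writes the observed value back into 'old' when it fails, so the loop loses the separate 'v' variable and the manual re-read. A minimal, runnable userspace sketch of the same pattern using C11 atomics, which share the try_cmpxchg semantics:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Userspace analogue of bch2_two_state_trylock(): v > 0 means one state holds
	 * the lock, v < 0 the other. */
	static bool two_state_trylock(atomic_long *v, int s)
	{
		long i = s ? 1 : -1;
		long old = atomic_load(v);

		do {
			if (i > 0 ? old < 0 : old > 0)
				return false;
			/* on failure, 'old' is refreshed with the current value */
		} while (!atomic_compare_exchange_weak_explicit(v, &old, old + i,
								memory_order_acquire,
								memory_order_relaxed));
		return true;
	}

	int main(void)
	{
		atomic_long lock = 0;

		printf("state 0: %d\n", two_state_trylock(&lock, 0));	/* 1: takes v to -1 */
		printf("state 1: %d\n", two_state_trylock(&lock, 1));	/* 0: other state held */
		return 0;
	}
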
@ -445,11 +445,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
void bch2_bio_map(struct bio *bio, void *base, size_t);
int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);

static inline sector_t bdev_sectors(struct block_device *bdev)
{
	return bdev->bd_inode->i_size >> 9;
}

#define closure_bio_submit(bio, cl)					\
do {									\
	closure_get(cl);						\
@ -723,9 +718,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src)

static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
{
	unsigned i;

	for (i = 0; i < nr; i++)
	for (unsigned i = 0; i < nr; i++)
		acc[i] += src[i];
}