Update bcachefs sources to 2cb70a82bc bcachefs: delete some debug code

Kent Overstreet 2018-06-27 14:41:51 -04:00
parent f2f3de4da4
commit 17e2f2775b
42 changed files with 2047 additions and 982 deletions

View File

@ -1 +1 @@
9abf628c701ad92670d697624f674cc01d42705e
2cb70a82bc0ca05d8c3cf666d221badd5724e339

View File

@ -112,4 +112,14 @@ static inline void *vmap(struct page **pages, unsigned int count,
#define vmalloc_to_page(addr) ((struct page *) (addr))
static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
void *p;
p = kmalloc(len, gfp);
if (p)
memcpy(p, src, len);
return p;
}
#endif /* __TOOLS_LINUX_SLAB_H */
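
The kmemdup() shim added to the userspace slab.h mirrors the kernel helper: allocate len bytes, then copy only if the allocation succeeded. A minimal usage sketch, not part of this commit (the struct and the GFP_KERNEL flag are purely illustrative):

	struct opts { char name[16]; unsigned flags; };

	static struct opts *opts_dup(const struct opts *src)
	{
		/* returns NULL if the underlying kmalloc() fails */
		return kmemdup(src, sizeof(*src), GFP_KERNEL);
	}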

View File

@ -519,6 +519,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
{
}
static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
enum units units)
{
}
typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
struct bch_sb_field_toolops {

View File

@ -259,6 +259,10 @@ do { \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
"done in memory") \
BCH_DEBUG_PARAM(journal_seq_verify, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
"update ordering is preserved during recovery")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@ -314,7 +318,13 @@ enum bch_time_stats {
struct btree;
enum gc_phase {
GC_PHASE_SB = BTREE_ID_NR + 1,
GC_PHASE_START,
GC_PHASE_SB,
#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
DEFINE_BCH_BTREE_IDS()
#undef DEF_BTREE_ID
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE

View File

@ -426,6 +426,16 @@ enum bch_csum_type {
BCH_CSUM_NR = 7,
};
static const unsigned bch_crc_bytes[] = {
[BCH_CSUM_NONE] = 0,
[BCH_CSUM_CRC32C_NONZERO] = 4,
[BCH_CSUM_CRC32C] = 4,
[BCH_CSUM_CRC64_NONZERO] = 8,
[BCH_CSUM_CRC64] = 8,
[BCH_CSUM_CHACHA20_POLY1305_80] = 10,
[BCH_CSUM_CHACHA20_POLY1305_128] = 16,
};
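
bch_crc_bytes[] maps each checksum type to the number of checksum/MAC bytes it stores: 4 for the crc32c variants, 8 for crc64, and 10 or 16 for the truncated and full ChaCha20/Poly1305 MACs. A hedged helper a caller might layer on top (illustrative, not part of this commit):

	static inline unsigned csum_bytes(enum bch_csum_type type)
	{
		/* bound the lookup; treat unknown types as storing no checksum */
		return type < BCH_CSUM_NR ? bch_crc_bytes[type] : 0;
	}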
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
switch (type) {
@ -783,6 +793,11 @@ struct bch_dirent {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(dirent, BCH_DIRENT);
#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
offsetof(struct bch_dirent, d_name))
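
BCH_NAME_MAX follows from the key size limit: a bkey's u64s field is a u8, so key plus value can span at most U8_MAX 64-bit words, and what remains after the bkey header and the fixed part of bch_dirent is the space available for d_name. An illustrative compile-time check of that bound (not part of this commit):

	_Static_assert(sizeof(struct bkey) +
		       offsetof(struct bch_dirent, d_name) +
		       BCH_NAME_MAX <= U8_MAX * sizeof(u64),
		       "dirent with maximal name must fit in U8_MAX u64s");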
/* Xattrs */
enum {
@ -868,7 +883,8 @@ struct bch_sb_field {
x(crypt, 2) \
x(replicas, 3) \
x(quota, 4) \
x(disk_groups, 5)
x(disk_groups, 5) \
x(clean, 6)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@ -1038,6 +1054,37 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[0];
};
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
*/
struct jset_entry {
__le16 u64s;
__u8 btree_id;
__u8 level;
__u8 type; /* designates what this jset holds */
__u8 pad[3];
union {
struct bkey_i start[0];
__u64 _data[0];
};
};
struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
__le16 read_clock;
__le16 write_clock;
__le64 journal_seq;
union {
struct jset_entry start[0];
__u64 _data[0];
};
};
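
The entries following a bch_sb_field_clean reuse the jset_entry layout, so recovery can replay the stored btree roots the same way it replays journal entries. A minimal sketch of walking them, assuming (as for journal entries) that u64s counts the 64-bit payload words after the fixed 8-byte header:

	static inline struct jset_entry *clean_entry_next(struct jset_entry *entry)
	{
		/* step over the header plus u64s words of payload */
		return (struct jset_entry *) &entry->_data[le16_to_cpu(entry->u64s)];
	}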
/* Superblock: */
/*
@ -1255,19 +1302,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
#define BCACHE_JSET_VERSION_JKEYS 2
#define BCACHE_JSET_VERSION 2
struct jset_entry {
__le16 u64s;
__u8 btree_id;
__u8 level;
__u8 type; /* designates what this jset holds */
__u8 pad[3];
union {
struct bkey_i start[0];
__u64 _data[0];
};
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
#define BCH_JSET_ENTRY_TYPES() \

View File

@ -649,7 +649,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
struct btree *b;
struct bset_tree *t;
/* btree_node_fill() requires parent to be locked: */
/*
* XXX: locking optimization
*
* we can make the locking looser here - caller can drop lock on parent
* node before locking child node (and potentially blocking): we just
* have to have bch2_btree_node_fill() call relock on the parent and
* return -EINTR if that fails
*/
EBUG_ON(!btree_node_locked(iter, level + 1));
EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
@ -749,23 +756,22 @@ retry:
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
bool may_drop_locks,
enum btree_node_sibling sib)
{
struct btree *parent;
struct btree_node_iter node_iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
struct btree *ret;
struct btree *ret = NULL;
unsigned level = b->level;
parent = btree_iter_node(iter, level + 1);
if (!parent)
return NULL;
if (!bch2_btree_node_relock(iter, level + 1)) {
bch2_btree_iter_set_locks_want(iter, level + 2);
return ERR_PTR(-EINTR);
}
if (!bch2_btree_node_relock(iter, level + 1))
goto out_upgrade;
node_iter = iter->l[parent->level].iter;
@ -778,34 +784,66 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
: (bch2_btree_node_iter_advance(&node_iter, parent),
bch2_btree_node_iter_peek_all(&node_iter, parent));
if (!k)
return NULL;
goto out;
} while (bkey_deleted(k));
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
btree_node_unlock(iter, level);
if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
struct btree_iter *linked;
if (!bch2_btree_node_relock(iter, level + 1)) {
bch2_btree_iter_set_locks_want(iter, level + 2);
return ERR_PTR(-EINTR);
if (!bch2_btree_node_relock(iter, level + 1))
goto out_upgrade;
/*
* We might have got -EINTR because trylock failed, and we're
* holding other locks that would cause us to deadlock:
*/
for_each_linked_btree_iter(iter, linked)
if (btree_iter_cmp(iter, linked) < 0)
__bch2_btree_iter_unlock(linked);
if (sib == btree_prev_sib)
btree_node_unlock(iter, level);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
SIX_LOCK_intent);
/*
* before btree_iter_relock() calls btree_iter_verify_locks():
*/
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
btree_node_unlock(iter, level + 1);
if (!bch2_btree_node_relock(iter, level)) {
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
if (!IS_ERR(ret)) {
six_unlock_intent(&ret->lock);
ret = ERR_PTR(-EINTR);
}
}
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
bch2_btree_iter_relock(iter);
}
out:
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
btree_node_unlock(iter, level + 1);
if (!bch2_btree_node_relock(iter, level)) {
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
bch2_btree_iter_verify_locks(iter);
if (!IS_ERR(ret)) {
six_unlock_intent(&ret->lock);
ret = ERR_PTR(-EINTR);
}
}
BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
(iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
!btree_node_locked(iter, level)));
return ret;
out_upgrade:
if (may_drop_locks)
bch2_btree_iter_upgrade(iter, level + 2);
ret = ERR_PTR(-EINTR);
goto out;
}
void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,

View File

@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
enum six_lock_type);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *,
struct btree *, bool,
enum btree_node_sibling);
void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,

View File

@ -148,6 +148,9 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
? BCH_DATA_BTREE : BCH_DATA_USER;
int ret = 0;
BUG_ON(journal_seq_verify(c) &&
k.k->version.lo > journal_cur_seq(&c->journal));
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
"superblock not marked as containing replicas (type %u)",
@ -243,6 +246,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
unsigned max_stale;
int ret = 0;
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
if (!c->btree_roots[btree_id].b)
return 0;
/*
* if expensive_debug_checks is on, run range_checks on all leaf nodes:
*/
@ -454,7 +462,7 @@ static void bch2_gc_start(struct bch_fs *c)
* Indicates to buckets code that gc is now in progress - done under
* usage_lock to avoid racing with bch2_mark_key():
*/
__gc_pos_set(c, GC_POS_MIN);
__gc_pos_set(c, gc_phase(GC_PHASE_START));
/* Save a copy of the existing bucket stats while we recompute them: */
for_each_member_device(ca, c, i) {
@ -535,22 +543,18 @@ void bch2_gc(struct bch_fs *c)
bch2_gc_start(c);
/* Walk btree: */
while (c->gc_pos.phase < (int) BTREE_ID_NR) {
int ret = c->btree_roots[c->gc_pos.phase].b
? bch2_gc_btree(c, (int) c->gc_pos.phase)
: 0;
bch2_mark_superblocks(c);
/* Walk btree: */
for (i = 0; i < BTREE_ID_NR; i++) {
int ret = bch2_gc_btree(c, i);
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
goto out;
}
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
}
bch2_mark_superblocks(c);
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
@ -780,13 +784,13 @@ next:
bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
/* Insert the newly coalesced nodes */
bch2_btree_insert_node(as, parent, iter, &keylist);
bch2_btree_insert_node(as, parent, iter, &keylist, 0);
BUG_ON(!bch2_keylist_empty(&keylist));
BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
bch2_btree_iter_node_replace(iter, new_nodes[0]);
for (i = 0; i < nr_new_nodes; i++)
bch2_btree_open_bucket_put(c, new_nodes[i]);
@ -1003,6 +1007,8 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
btree_node_range_checks_init(&r, 0);
gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0));
if (!c->btree_roots[id].b)
return 0;
@ -1041,36 +1047,33 @@ err:
return bch2_btree_iter_unlock(&iter) ?: ret;
}
static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
unsigned iter = 0;
enum btree_id id;
int ret;
int ret = 0;
mutex_lock(&c->sb_lock);
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb.sb))
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
mutex_unlock(&c->sb_lock);
down_write(&c->gc_lock);
again:
bch2_gc_start(c);
bch2_mark_superblocks(c);
for (id = 0; id < BTREE_ID_NR; id++) {
ret = bch2_initial_gc_btree(c, id);
if (ret)
return ret;
goto err;
}
ret = bch2_journal_mark(c, journal);
if (ret)
return ret;
goto err;
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
return -EINVAL;
ret = -EINVAL;
goto err;
}
bch_info(c, "Fixed gens, restarting initial mark and sweep:");
@ -1085,21 +1088,9 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
bch2_mark_superblocks(c);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
return 0;
}
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
int ret;
down_write(&c->gc_lock);
ret = __bch2_initial_gc(c, journal);
err:
up_write(&c->gc_lock);
return ret;
}

View File

@ -46,8 +46,6 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
};
}
#define GC_POS_MIN gc_phase(0)
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
if (l.phase != r.phase)
@ -59,17 +57,23 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
return 0;
}
static inline struct gc_pos gc_pos_btree(enum btree_id id,
struct bpos pos, unsigned level)
{
return (struct gc_pos) {
.phase = GC_PHASE_BTREE_EXTENTS + id,
.pos = pos,
.level = level,
};
}
/*
* GC position of the pointers within a btree node: note, _not_ for &b->key
* itself, that lives in the parent node:
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
return (struct gc_pos) {
.phase = b->btree_id,
.pos = b->key.k.p,
.level = b->level,
};
return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
}
/*
@ -81,11 +85,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
*/
static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
{
return (struct gc_pos) {
.phase = (int) id,
.pos = POS_MAX,
.level = U8_MAX,
};
return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
}
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)

View File

@ -920,7 +920,7 @@ static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
char *out = buf, *end = buf + len;
out += scnprintf(out, end - out,
"error validating btree node %s "
"error validating btree node %s"
"at btree %u level %u/%u\n"
"pos %llu:%llu node offset %u",
write ? "before write " : "",
@ -1120,7 +1120,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey:\n%s\n%s", buf, invalid);
"invalid bkey:\n%s\n%s", invalid, buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),

View File

@ -34,11 +34,9 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
for_each_linked_btree_node(iter, b, linked)
for_each_btree_iter_with_node(iter, b, linked)
linked->lock_seq[b->level] += 2;
iter->lock_seq[b->level] += 2;
six_unlock_write(&b->lock);
}
@ -48,6 +46,8 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
struct btree_iter *linked;
unsigned readers = 0;
EBUG_ON(btree_node_read_locked(iter, b->level));
for_each_linked_btree_iter(iter, linked)
if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
@ -66,15 +66,30 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
&b->lock.state.counter);
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
/*
* Lock a btree node if we already have it locked on one of our linked
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_iter *iter,
struct btree *b, unsigned level,
enum btree_node_locked_type want)
{
struct btree_iter *linked;
struct btree *b = iter->l[level].b;
int want = btree_lock_want(iter, level);
int have = btree_node_locked_type(iter, level);
if (want == have)
return true;
for_each_linked_btree_iter(iter, linked)
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) >= want) {
six_lock_increment(&b->lock, want);
return true;
}
return false;
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
struct btree *b = iter->l[level].b;
int want = __btree_lock_want(iter, level);
if (!is_btree_node(iter, level))
return false;
@ -82,42 +97,83 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
if (race_fault())
return false;
if (have != BTREE_NODE_UNLOCKED
? six_trylock_convert(&b->lock, have, want)
: six_relock_type(&b->lock, want, iter->lock_seq[level]))
goto success;
if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) &&
!(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
btree_node_lock_increment(iter, b, level, want)))
return false;
for_each_linked_btree_iter(iter, linked)
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) == want &&
iter->lock_seq[level] == b->lock.state.seq) {
btree_node_unlock(iter, level);
six_lock_increment(&b->lock, want);
goto success;
}
return false;
success:
mark_btree_node_unlocked(iter, level);
mark_btree_node_locked(iter, level, want);
return true;
}
bool bch2_btree_iter_relock(struct btree_iter *iter)
static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
{
unsigned l;
struct btree *b = iter->l[level].b;
for (l = iter->level;
l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b;
l++)
if (!bch2_btree_node_relock(iter, l)) {
EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
if (!is_btree_node(iter, level))
return false;
if (race_fault())
return false;
if (btree_node_intent_locked(iter, level))
return true;
if (btree_node_locked(iter, level)
? six_lock_tryupgrade(&b->lock)
: six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level]))
goto success;
if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_unlock(iter, level);
goto success;
}
return false;
success:
mark_btree_node_intent_locked(iter, level);
return true;
}
static inline bool btree_iter_get_locks(struct btree_iter *iter,
bool upgrade)
{
unsigned l = iter->level;
int fail_idx = -1;
do {
if (!btree_iter_node(iter, l))
break;
if (!(upgrade
? bch2_btree_node_upgrade(iter, l)
: bch2_btree_node_relock(iter, l))) {
fail_idx = l;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
return false;
}
l++;
} while (l < iter->locks_want);
/*
* When we fail to get a lock, we have to ensure that any child nodes
* can't be relocked so bch2_btree_iter_traverse has to walk back up to
* the node that we failed to relock:
*/
while (fail_idx >= 0) {
btree_node_unlock(iter, fail_idx);
iter->l[fail_idx].b = BTREE_ITER_NOT_END;
--fail_idx;
}
if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
iter->uptodate = BTREE_ITER_NEED_PEEK;
return true;
bch2_btree_iter_verify_locks(iter);
return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}
/* Slowpath: */
@ -128,6 +184,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
{
struct bch_fs *c = iter->c;
struct btree_iter *linked;
bool ret = true;
/* Can't have children locked before ancestors: */
EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
@ -140,15 +197,11 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
EBUG_ON(type == SIX_LOCK_intent &&
iter->nodes_locked != iter->nodes_intent_locked);
for_each_linked_btree_iter(iter, linked)
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) == type) {
six_lock_increment(&b->lock, type);
return true;
}
if (btree_node_lock_increment(iter, b, level, type))
return true;
/*
* Must lock btree nodes in key order - this case hapens when locking
* Must lock btree nodes in key order - this case happens when locking
* the prev sibling in btree node merging:
*/
if (iter->nodes_locked &&
@ -160,6 +213,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
if (!linked->nodes_locked)
continue;
/* We have to lock btree nodes in key order: */
if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
ret = false;
/*
* Can't block taking an intent lock if we have _any_ nodes read
* locked:
@ -175,15 +232,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
linked->locks_want = max_t(unsigned,
linked->locks_want,
iter->locks_want);
return false;
linked->locks_want,
__fls(linked->nodes_locked) + 1);
btree_iter_get_locks(linked, true);
ret = false;
}
/* We have to lock btree nodes in key order: */
if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
return false;
/*
* Interior nodes must be locked before their descendants: if
* another iterator has possible descendants locked of the node
@ -194,82 +248,133 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
linked->locks_want = max_t(unsigned,
linked->locks_want,
iter->locks_want);
return false;
btree_iter_get_locks(linked, true);
ret = false;
}
}
__btree_node_lock_type(c, b, type);
return true;
if (ret)
__btree_node_lock_type(c, b, type);
return ret;
}
/* Btree iterator locking: */
static void btree_iter_drop_extra_locks(struct btree_iter *iter)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
unsigned l;
while (iter->nodes_locked &&
(l = __fls(iter->nodes_locked)) > iter->locks_want) {
if (l > iter->level) {
btree_node_unlock(iter, l);
} else {
if (btree_node_intent_locked(iter, l)) {
six_lock_downgrade(&iter->l[l].b->lock);
iter->nodes_intent_locked ^= 1 << l;
}
break;
}
if (iter->uptodate == BTREE_ITER_END) {
BUG_ON(iter->nodes_locked);
return;
}
for (l = 0; btree_iter_node(iter, l); l++) {
if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
!btree_node_locked(iter, l))
continue;
BUG_ON(btree_lock_want(iter, l) !=
btree_node_locked_type(iter, l));
}
}
#endif
bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
__flatten
static bool __bch2_btree_iter_relock(struct btree_iter *iter)
{
if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
return true;
if (iter->uptodate > BTREE_ITER_NEED_TRAVERSE)
return false;
return btree_iter_get_locks(iter, false);
}
bool bch2_btree_iter_relock(struct btree_iter *iter)
{
struct btree_iter *linked;
bool ret = true;
for_each_btree_iter(iter, linked)
ret &= __bch2_btree_iter_relock(linked);
return ret;
}
bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
unsigned new_locks_want)
{
struct btree_iter *linked;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
for_each_linked_btree_iter(iter, linked)
if (linked->locks_want > new_locks_want) {
linked->locks_want = max_t(unsigned, 1,
new_locks_want);
btree_iter_drop_extra_locks(linked);
}
EBUG_ON(iter->locks_want >= new_locks_want);
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
if (bch2_btree_iter_relock(iter))
if (btree_iter_get_locks(iter, true))
return true;
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors
* before us to avoid getting -EINTR later:
* Ancestor nodes must be locked before child nodes, so set locks_want
* on iterators that might lock ancestors before us to avoid getting
* -EINTR later:
*/
for_each_linked_btree_iter(iter, linked)
if (linked->btree_id == iter->btree_id &&
btree_iter_cmp(linked, iter) <= 0)
linked->locks_want = max_t(unsigned, linked->locks_want,
new_locks_want);
btree_iter_cmp(linked, iter) <= 0 &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_iter_get_locks(linked, true);
}
return false;
}
static void __bch2_btree_iter_unlock(struct btree_iter *iter)
void __bch2_btree_iter_downgrade(struct btree_iter *iter,
unsigned downgrade_to)
{
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
struct btree_iter *linked;
unsigned l;
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
/*
* We downgrade linked iterators as well because btree_iter_upgrade
* might have had to modify locks_want on linked iterators due to lock
* ordering:
*/
for_each_btree_iter(iter, linked) {
unsigned new_locks_want = downgrade_to ?:
(linked->flags & BTREE_ITER_INTENT ? 1 : 0);
if (linked->locks_want <= new_locks_want)
continue;
linked->locks_want = new_locks_want;
while (linked->nodes_locked &&
(l = __fls(linked->nodes_locked)) >= linked->locks_want) {
if (l > linked->level) {
btree_node_unlock(linked, l);
} else {
if (btree_node_intent_locked(linked, l)) {
six_lock_downgrade(&linked->l[l].b->lock);
linked->nodes_intent_locked ^= 1 << l;
}
break;
}
}
bch2_btree_iter_verify_locks(linked);
}
}
int bch2_btree_iter_unlock(struct btree_iter *iter)
{
struct btree_iter *linked;
for_each_linked_btree_iter(iter, linked)
for_each_btree_iter(iter, linked)
__bch2_btree_iter_unlock(linked);
__bch2_btree_iter_unlock(iter);
return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
@ -320,11 +425,8 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
if (iter->l[b->level].b == b)
__bch2_btree_iter_verify(iter, b);
for_each_linked_btree_node(iter, b, linked)
__bch2_btree_iter_verify(iter, b);
for_each_btree_iter_with_node(iter, b, linked)
__bch2_btree_iter_verify(linked, b);
}
#endif
@ -456,12 +558,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
if (iter->l[b->level].b == b)
__bch2_btree_node_iter_fix(iter, b,
&iter->l[b->level].iter, t,
where, clobber_u64s, new_u64s);
for_each_linked_btree_node(iter, b, linked)
for_each_btree_iter_with_node(iter, b, linked)
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->level].iter, t,
where, clobber_u64s, new_u64s);
@ -613,11 +710,12 @@ static inline void btree_iter_node_set(struct btree_iter *iter,
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
{
enum btree_node_locked_type t;
struct btree_iter *linked;
for_each_linked_btree_iter(iter, linked)
for_each_btree_iter(iter, linked)
if (btree_iter_pos_in_node(linked, b)) {
/*
* bch2_btree_iter_node_drop() has already been called -
@ -626,52 +724,28 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
*/
BUG_ON(btree_node_locked(linked, b->level));
/*
* If @linked wants this node read locked, we don't want
* to actually take the read lock now because it's not
* legal to hold read locks on other nodes while we take
* write locks, so the journal can make forward
* progress...
*
* Instead, btree_iter_node_set() sets things up so
* bch2_btree_node_relock() will succeed:
*/
if (btree_want_intent(linked, b->level)) {
six_lock_increment(&b->lock, SIX_LOCK_intent);
mark_btree_node_intent_locked(linked, b->level);
t = btree_lock_want(linked, b->level);
if (t != BTREE_NODE_UNLOCKED) {
six_lock_increment(&b->lock, t);
mark_btree_node_locked(linked, b->level, t);
}
btree_iter_node_set(linked, b);
}
if (!btree_iter_pos_in_node(iter, b)) {
six_unlock_intent(&b->lock);
return false;
}
mark_btree_node_intent_locked(iter, b->level);
btree_iter_node_set(iter, b);
return true;
}
void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
for_each_linked_btree_iter(iter, linked)
bch2_btree_iter_node_drop(linked, b);
six_unlock_intent(&b->lock);
}
void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
unsigned level = b->level;
if (iter->l[level].b == b) {
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
btree_node_unlock(iter, level);
iter->l[level].b = BTREE_ITER_NOT_END;
}
for_each_btree_iter(iter, linked)
if (linked->l[level].b == b) {
btree_node_unlock(linked, level);
linked->l[level].b = BTREE_ITER_NOT_END;
}
}
/*
@ -682,9 +756,8 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
for_each_linked_btree_node(iter, b, linked)
for_each_btree_iter_with_node(iter, b, linked)
__btree_iter_init(linked, b);
__btree_iter_init(iter, b);
}
static inline int btree_iter_lock_root(struct btree_iter *iter,
@ -713,7 +786,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
return 0;
}
lock_type = btree_lock_want(iter, iter->level);
lock_type = __btree_lock_want(iter, iter->level);
if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
iter, lock_type)))
return -EINTR;
@ -771,7 +844,7 @@ static inline int btree_iter_down(struct btree_iter *iter)
struct btree_iter_level *l = &iter->l[iter->level];
struct btree *b;
unsigned level = iter->level - 1;
enum six_lock_type lock_type = btree_lock_want(iter, level);
enum six_lock_type lock_type = __btree_lock_want(iter, level);
BKEY_PADDED(k) tmp;
BUG_ON(!btree_node_locked(iter, iter->level));
@ -799,6 +872,12 @@ static void btree_iter_up(struct btree_iter *iter)
btree_node_unlock(iter, iter->level++);
}
static void btree_iter_set_end(struct btree_iter *iter)
{
iter->uptodate = BTREE_ITER_END;
__bch2_btree_iter_unlock(iter);
}
int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
@ -871,7 +950,7 @@ io_error:
BUG_ON(ret != -EIO);
iter->flags |= BTREE_ITER_ERROR;
iter->l[iter->level].b = NULL;
iter->l[iter->level].b = BTREE_ITER_NOT_END;
goto out;
}
@ -888,9 +967,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
if (unlikely(!iter->l[iter->level].b))
if (unlikely(iter->uptodate == BTREE_ITER_END))
return 0;
BUG_ON(iter->level >= BTREE_MAX_DEPTH);
BUG_ON(!iter->l[iter->level].b);
iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
/* make sure we have all the intent locks we need - ugh */
@ -959,6 +1041,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
}
iter->uptodate = BTREE_ITER_NEED_PEEK;
bch2_btree_iter_verify_locks(iter);
return 0;
}
@ -966,13 +1049,15 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
if (__bch2_btree_iter_relock(iter))
return 0;
ret = __bch2_btree_iter_traverse(iter);
if (unlikely(ret))
ret = btree_iter_traverse_error(iter, ret);
BUG_ON(ret == -EINTR && !btree_iter_linked(iter));
return ret;
}
@ -984,18 +1069,29 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
int ret;
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
bch2_btree_iter_verify_locks(iter);
if (iter->uptodate == BTREE_ITER_UPTODATE)
return iter->l[iter->level].b;
if (unlikely(iter->uptodate == BTREE_ITER_END))
return NULL;
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ERR_PTR(ret);
b = iter->l[iter->level].b;
if (b) {
EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
iter->pos = b->key.k.p;
if (!b) {
btree_iter_set_end(iter);
return NULL;
}
BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
iter->pos = b->key.k.p;
iter->uptodate = BTREE_ITER_UPTODATE;
return b;
}
@ -1005,24 +1101,39 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
int ret;
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
bch2_btree_iter_verify_locks(iter);
btree_iter_up(iter);
if (!btree_iter_node(iter, iter->level))
if (!btree_iter_node(iter, iter->level)) {
btree_iter_set_end(iter);
return NULL;
}
/* parent node usually won't be locked: redo traversal if necessary */
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return NULL;
if (!bch2_btree_node_relock(iter, iter->level)) {
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return NULL;
}
b = iter->l[iter->level].b;
if (!b)
return b;
BUG_ON(!b);
if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
/* Haven't gotten to the end of the parent node: */
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
/*
* We don't really want to be unlocking here except we can't
* directly tell btree_iter_traverse() "traverse to this level"
* except by setting iter->level, so we have to unlock so we
* don't screw up our lock invariants:
*/
if (btree_node_read_locked(iter, iter->level))
btree_node_unlock(iter, iter->level);
/* ick: */
iter->pos = iter->btree_id == BTREE_ID_INODES
@ -1086,8 +1197,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
!btree_node_locked(iter, 0));
bch2_btree_iter_verify_locks(iter);
if (iter->uptodate == BTREE_ITER_UPTODATE) {
struct bkey_packed *k =
@ -1117,7 +1227,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
/* got to the end of the leaf, iterator needs to be traversed: */
iter->pos = l->b->key.k.p;
if (!bkey_cmp(iter->pos, POS_MAX)) {
iter->uptodate = BTREE_ITER_END;
btree_iter_set_end(iter);
return bkey_s_c_null;
}
@ -1144,7 +1254,7 @@ struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
iter->pos = l->b->key.k.p;
if (!bkey_cmp(iter->pos, POS_MAX)) {
iter->uptodate = BTREE_ITER_END;
btree_iter_set_end(iter);
return bkey_s_c_null;
}
@ -1163,6 +1273,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
bch2_btree_iter_verify_locks(iter);
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
k = bch2_btree_iter_peek(iter);
@ -1225,7 +1336,7 @@ recheck:
if (iter->flags & BTREE_ITER_IS_EXTENTS) {
if (n.p.offset == KEY_OFFSET_MAX) {
if (n.p.inode == KEY_INODE_MAX) {
iter->uptodate = BTREE_ITER_END;
btree_iter_set_end(iter);
return bkey_s_c_null;
}
@ -1259,8 +1370,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
!btree_node_locked(iter, 0));
bch2_btree_iter_verify_locks(iter);
if (iter->uptodate == BTREE_ITER_UPTODATE) {
struct bkey_s_c ret = { .k = &iter->k };
@ -1286,6 +1396,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
bch2_btree_iter_verify_locks(iter);
iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
@ -1347,13 +1462,11 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
if (!btree_iter_linked(iter))
return;
for_each_linked_btree_iter(iter, linked) {
for_each_linked_btree_iter(iter, linked)
if (linked->next == iter) {
linked->next = iter->next;
return;
}
}
BUG();
}
@ -1366,9 +1479,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
iter->next = new;
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
unsigned nr_iters = 1;
unsigned nr_iters = 0;
for_each_linked_btree_iter(iter, new)
for_each_btree_iter(iter, new)
nr_iters++;
BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);

View File

@ -28,40 +28,47 @@ static inline bool btree_iter_linked(const struct btree_iter *iter)
return iter->next != iter;
}
/**
* for_each_linked_btree_iter - iterate over all iterators linked with @_iter
*/
#define for_each_linked_btree_iter(_iter, _linked) \
for ((_linked) = (_iter)->next; \
(_linked) != (_iter); \
(_linked) = (_linked)->next)
static inline bool __iter_has_node(const struct btree_iter *iter,
const struct btree *b)
{
/*
* We don't compare the low bits of the lock sequence numbers because
* @iter might have taken a write lock on @b, and we don't want to skip
* the linked iterator if the sequence numbers were equal before taking
* that write lock. The lock sequence number is incremented by taking
* and releasing write locks and is even when unlocked:
*/
return iter->l[b->level].b == b &&
iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
}
static inline struct btree_iter *
__next_linked_btree_node(struct btree_iter *iter, struct btree *b,
struct btree_iter *linked)
__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
{
do {
linked = linked->next;
return linked->next != iter ? linked->next : NULL;
}
if (linked == iter)
return NULL;
/*
* We don't compare the low bits of the lock sequence numbers
* because @iter might have taken a write lock on @b, and we
* don't want to skip the linked iterator if the sequence
* numbers were equal before taking that write lock. The lock
* sequence number is incremented by taking and releasing write
* locks and is even when unlocked:
*/
} while (linked->l[b->level].b != b ||
linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
static inline struct btree_iter *
__next_iter_with_node(struct btree_iter *iter, struct btree *b,
struct btree_iter *linked)
{
while (linked && !__iter_has_node(linked, b))
linked = __next_linked_iter(iter, linked);
return linked;
}
/**
* for_each_linked_btree_node - iterate over all iterators linked with @_iter
* for_each_btree_iter - iterate over all iterators linked with @_iter,
* including @_iter
*/
#define for_each_btree_iter(_iter, _linked) \
for ((_linked) = (_iter); (_linked); \
(_linked) = __next_linked_iter(_iter, _linked))
/**
* for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
* that also point to @_b
*
* @_b is assumed to be locked by @_iter
@ -69,15 +76,27 @@ __next_linked_btree_node(struct btree_iter *iter, struct btree *b,
* Filters out iterators that don't have a valid btree_node iterator for @_b -
* i.e. iterators for which bch2_btree_node_relock() would not succeed.
*/
#define for_each_linked_btree_node(_iter, _b, _linked) \
#define for_each_btree_iter_with_node(_iter, _b, _linked) \
for ((_linked) = (_iter); \
((_linked) = __next_linked_btree_node(_iter, _b, _linked));)
((_linked) = __next_iter_with_node(_iter, _b, _linked)); \
(_linked) = __next_linked_iter(_iter, _linked))
/**
* for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
* _not_ including @_iter
*/
#define for_each_linked_btree_iter(_iter, _linked) \
for ((_linked) = (_iter)->next; \
(_linked) != (_iter); \
(_linked) = (_linked)->next)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
void bch2_btree_iter_verify_locks(struct btree_iter *);
#else
static inline void bch2_btree_iter_verify(struct btree_iter *iter,
struct btree *b) {}
struct btree *b) {}
static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
@ -85,22 +104,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct bkey_packed *, unsigned, unsigned);
int bch2_btree_iter_unlock(struct btree_iter *);
bool __bch2_btree_iter_set_locks_want(struct btree_iter *, unsigned);
static inline bool bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
unsigned new_locks_want)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (iter->locks_want == new_locks_want &&
iter->nodes_intent_locked == (1 << new_locks_want) - 1)
return true;
return __bch2_btree_iter_set_locks_want(iter, new_locks_want);
return iter->locks_want < new_locks_want
? __bch2_btree_iter_upgrade(iter, new_locks_want)
: iter->uptodate <= BTREE_ITER_NEED_PEEK;
}
bool bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
void bch2_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
{
if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
__bch2_btree_iter_downgrade(iter, 0);
}
void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);

View File

@ -75,16 +75,23 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
: SIX_LOCK_read;
}
static inline bool btree_want_intent(struct btree_iter *iter, int level)
static inline enum btree_node_locked_type
btree_lock_want(struct btree_iter *iter, int level)
{
return btree_lock_want(iter, level) == SIX_LOCK_intent;
if (level < iter->level)
return BTREE_NODE_UNLOCKED;
if (level < iter->locks_want)
return BTREE_NODE_INTENT_LOCKED;
if (level == iter->level)
return BTREE_NODE_READ_LOCKED;
return BTREE_NODE_UNLOCKED;
}
static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
@ -98,6 +105,14 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
mark_btree_node_unlocked(iter, level);
}
static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
{
switch (type) {
@ -150,8 +165,11 @@ bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
static inline bool bch2_btree_node_relock(struct btree_iter *iter,
unsigned level)
{
return likely(btree_lock_want(iter, level) ==
btree_node_locked_type(iter, level)) ||
EBUG_ON(btree_node_locked(iter, level) &&
btree_node_locked_type(iter, level) !=
__btree_lock_want(iter, level));
return likely(btree_node_locked(iter, level)) ||
__bch2_btree_node_relock(iter, level);
}

View File

@ -85,31 +85,49 @@ int __bch2_btree_insert_at(struct btree_insert *);
__VA_ARGS__ \
}})
enum {
__BTREE_INSERT_ATOMIC,
__BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
__BCH_HASH_SET_MUST_REPLACE,
};
/*
* Don't drop/retake locks: instead return -EINTR if need to upgrade to intent
* locks, -EAGAIN if need to wait on btree reserve
* Don't drop/retake locks before doing btree update, instead return -EINTR if
* we had to drop locks for any reason
*/
#define BTREE_INSERT_ATOMIC (1 << 0)
#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
/*
* Don't drop locks _after_ successfully updating btree:
*/
#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL (1 << 1)
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << 2)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
/*
* Insert is for journal replay: don't get journal reservations, or mark extents
* (bch_mark_key)
*/
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4)
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
/* Don't block on allocation failure (for new btree nodes: */
#define BTREE_INSERT_NOWAIT (1 << 5)
#define BTREE_INSERT_GC_LOCK_HELD (1 << 6)
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
#define BCH_HASH_SET_MUST_CREATE (1 << 7)
#define BCH_HASH_SET_MUST_REPLACE (1 << 8)
#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
int bch2_btree_delete_at(struct btree_iter *, unsigned);

View File

@ -223,8 +223,7 @@ found:
mutex_unlock(&c->btree_interior_update_lock);
}
static void __btree_node_free(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
trace_btree_node_free(c, b);
@ -237,21 +236,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
clear_btree_node_noevict(b);
btree_node_lock_type(c, b, SIX_LOCK_write);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
/*
* By using six_unlock_write() directly instead of
* bch2_btree_node_unlock_write(), we don't update the iterator's
* sequence numbers and cause future bch2_btree_node_relock() calls to
* fail:
*/
six_unlock_write(&b->lock);
}
void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
@ -264,7 +253,9 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
clear_btree_node_dirty(b);
__btree_node_free(c, b, NULL);
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
}
@ -283,9 +274,9 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
*/
btree_update_drop_new_node(c, b);
bch2_btree_iter_node_drop_linked(iter, b);
__btree_node_free(c, b, iter);
__bch2_btree_node_lock_write(b, iter);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
bch2_btree_iter_node_drop(iter, b);
}
@ -499,7 +490,9 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
bch2_btree_open_bucket_put(c, b);
}
__btree_node_free(c, b, NULL);
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
}
@ -1362,7 +1355,8 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
}
static void btree_split(struct btree_update *as, struct btree *b,
struct btree_iter *iter, struct keylist *keys)
struct btree_iter *iter, struct keylist *keys,
unsigned flags)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(iter, b);
@ -1425,7 +1419,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
if (parent) {
/* Split a non root node */
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
} else if (n3) {
bch2_btree_set_root(as, n3, iter);
} else {
@ -1491,9 +1485,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
btree_update_updated_node(as, b);
for_each_linked_btree_node(iter, b, linked)
for_each_btree_iter_with_node(iter, b, linked)
bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
bch2_btree_node_iter_peek(&iter->l[b->level].iter, b);
bch2_btree_iter_verify(iter, b);
}
@ -1511,7 +1504,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
struct btree_iter *iter, struct keylist *keys)
struct btree_iter *iter, struct keylist *keys,
unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
@ -1551,14 +1545,14 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
btree_node_interior_verify(b);
bch2_foreground_maybe_merge(c, iter, b->level);
bch2_foreground_maybe_merge(c, iter, b->level, flags);
return;
split:
btree_split(as, b, iter, keys);
btree_split(as, b, iter, keys, flags);
}
int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
unsigned btree_reserve_flags)
unsigned flags)
{
struct btree *b = iter->l[0].b;
struct btree_update *as;
@ -1570,16 +1564,17 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
* We already have a disk reservation and open buckets pinned; this
* allocation must not block:
*/
for_each_linked_btree_iter(iter, linked)
for_each_btree_iter(iter, linked)
if (linked->btree_id == BTREE_ID_EXTENTS)
btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
if (iter->btree_id == BTREE_ID_EXTENTS)
btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
flags |= BTREE_INSERT_USE_RESERVE;
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
if (!down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
return -EINTR;
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
@ -1591,39 +1586,43 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
* XXX: figure out how far we might need to split,
* instead of locking/reserving all the way to the root:
*/
if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
ret = -EINTR;
goto out;
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, b),
btree_reserve_flags, &cl);
btree_update_reserve_required(c, b), flags,
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN) {
BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return -EINTR;
ret = -EINTR;
}
goto out;
}
btree_split(as, b, iter, NULL);
btree_split(as, b, iter, NULL, flags);
bch2_btree_update_done(as);
bch2_btree_iter_set_locks_want(iter, 1);
/*
* We haven't successfully inserted yet, so don't downgrade all the way
* back to read locks;
*/
__bch2_btree_iter_downgrade(iter, 1);
out:
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
int __bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
unsigned level,
enum btree_node_sibling sib)
void __bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
unsigned level,
unsigned flags,
enum btree_node_sibling sib)
{
struct btree_update *as;
struct bkey_format_state new_s;
@ -1636,29 +1635,29 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c,
closure_init_stack(&cl);
retry:
if (!bch2_btree_node_relock(iter, level))
return 0;
BUG_ON(!btree_node_locked(iter, level));
b = iter->l[level].b;
parent = btree_node_parent(iter, b);
if (!parent)
return 0;
goto out;
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
return 0;
goto out;
/* XXX: can't be holding read locks */
m = bch2_btree_node_get_sibling(c, iter, b, sib);
m = bch2_btree_node_get_sibling(c, iter, b,
!(flags & BTREE_INSERT_NOUNLOCK), sib);
if (IS_ERR(m)) {
ret = PTR_ERR(m);
goto out;
goto err;
}
/* NULL means no sibling: */
if (!m) {
b->sib_u64s[sib] = U16_MAX;
return 0;
goto out;
}
if (sib == btree_prev_sib) {
@ -1688,33 +1687,26 @@ retry:
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
six_unlock_intent(&m->lock);
return 0;
}
/* We're changing btree topology, doesn't mix with gc: */
if (!down_read_trylock(&c->gc_lock)) {
six_unlock_intent(&m->lock);
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
up_read(&c->gc_lock);
ret = -EINTR;
goto out;
}
if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
/* We're changing btree topology, doesn't mix with gc: */
if (!down_read_trylock(&c->gc_lock))
goto err_cycle_gc_lock;
if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
ret = -EINTR;
goto out_unlock;
goto err_unlock;
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, parent) + 1,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
&cl);
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
goto out_unlock;
goto err_unlock;
}
trace_btree_merge(c, b);
@ -1744,7 +1736,7 @@ retry:
bch2_btree_node_write(c, n, SIX_LOCK_intent);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
bch2_btree_open_bucket_put(c, n);
bch2_btree_node_free_inmem(c, b, iter);
@ -1754,26 +1746,53 @@ retry:
bch2_btree_iter_verify(iter, n);
bch2_btree_update_done(as);
out_unlock:
if (ret != -EINTR && ret != -EAGAIN)
bch2_btree_iter_set_locks_want(iter, 1);
six_unlock_intent(&m->lock);
up_read(&c->gc_lock);
out:
if (ret == -EAGAIN || ret == -EINTR) {
bch2_btree_iter_unlock(iter);
ret = -EINTR;
}
/*
* Don't downgrade locks here: we're called after successful insert,
* and the caller will downgrade locks after a successful insert
* anyways (in case e.g. a split was required first)
*
* And we're also called when inserting into interior nodes in the
* split path, and downgrading to read locks in there is potentially
* confusing:
*/
closure_sync(&cl);
return;
if (ret == -EINTR) {
err_cycle_gc_lock:
six_unlock_intent(&m->lock);
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
up_read(&c->gc_lock);
ret = -EINTR;
goto err;
err_unlock:
six_unlock_intent(&m->lock);
up_read(&c->gc_lock);
err:
BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
if ((ret == -EAGAIN || ret == -EINTR) &&
!(flags & BTREE_INSERT_NOUNLOCK)) {
bch2_btree_iter_unlock(iter);
closure_sync(&cl);
ret = bch2_btree_iter_traverse(iter);
if (!ret)
goto retry;
if (ret)
goto out;
goto retry;
}
return ret;
goto out;
}
static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
@ -1806,7 +1825,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
} else {
bch2_btree_set_root(as, n, iter);
}
@ -1815,7 +1834,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_node_free_inmem(c, b, iter);
BUG_ON(!bch2_btree_iter_node_replace(iter, n));
bch2_btree_iter_node_replace(iter, n);
bch2_btree_update_done(as);
return 0;
@ -1830,7 +1849,6 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
__le64 seq, unsigned flags)
{
unsigned locks_want = iter->locks_want;
struct closure cl;
struct btree *b;
int ret;
@ -1839,7 +1857,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
closure_init_stack(&cl);
bch2_btree_iter_set_locks_want(iter, U8_MAX);
bch2_btree_iter_upgrade(iter, U8_MAX);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
if (!down_read_trylock(&c->gc_lock)) {
@ -1866,7 +1884,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
closure_sync(&cl);
}
bch2_btree_iter_set_locks_want(iter, locks_want);
bch2_btree_iter_downgrade(iter);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
up_read(&c->gc_lock);
@ -1920,7 +1938,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
}
bch2_keylist_add(&as->parent_keys, &new_key->k_i);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@ -1982,6 +2000,9 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
closure_init_stack(&cl);
if (!bch2_btree_iter_upgrade(iter, U8_MAX))
return -EINTR;
if (!down_read_trylock(&c->gc_lock)) {
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
@ -2041,6 +2062,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
goto err_free_update;
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
bch2_btree_iter_downgrade(iter);
err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);

View File

@ -146,35 +146,51 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *);
struct btree_iter *, struct keylist *,
unsigned);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
unsigned, enum btree_node_sibling);
void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
unsigned, unsigned, enum btree_node_sibling);
static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
struct btree_iter *iter,
unsigned level,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
struct btree *b;
/*
* iterators are inconsistent when they hit end of leaf, until
* traversed again
*
* XXX inconsistent how?
*/
if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
return;
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
return;
if (!bch2_btree_node_relock(iter, level))
return 0;
return;
b = iter->l[level].b;
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
return 0;
return;
return __bch2_foreground_maybe_merge(c, iter, level, sib);
__bch2_foreground_maybe_merge(c, iter, level, flags, sib);
}
static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
unsigned level)
unsigned level,
unsigned flags)
{
bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib);
bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib);
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
btree_prev_sib);
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
btree_next_sib);
}
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);

View File

@ -227,19 +227,36 @@ btree_insert_key_leaf(struct btree_insert *trans,
return ret;
}
#define trans_for_each_entry(trans, i) \
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
/*
* We sort transaction entries so that if multiple iterators point to the same
* leaf node they'll be adjacent:
*/
static bool same_leaf_as_prev(struct btree_insert *trans,
struct btree_insert_entry *i)
{
/*
* Because we sorted the transaction entries, if multiple iterators
* point to the same leaf node they'll always be adjacent now:
*/
return i != trans->entries &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
#define trans_for_each_entry(trans, i) \
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
struct btree_insert_entry *i)
{
struct btree *b = i->iter->l[0].b;
do {
i++;
} while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
return i;
}
#define trans_for_each_leaf(trans, i) \
for ((i) = (trans)->entries; \
(i) < (trans)->entries + (trans)->nr; \
(i) = trans_next_leaf(trans, i))
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
@ -262,19 +279,16 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(c, i->iter->l[0].b,
i->iter);
trans_for_each_leaf(trans, i)
bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
}
static void multi_unlock_write(struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
trans_for_each_leaf(trans, i)
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
}
static inline int btree_trans_cmp(struct btree_insert_entry l,
@ -285,6 +299,107 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
/* Normal update interface: */
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
static inline int do_btree_insert_at(struct btree_insert *trans,
struct btree_iter **split,
bool *cycle_gc_lock)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
unsigned u64s;
int ret;
trans_for_each_entry(trans, i)
BUG_ON(i->done);
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
return ret;
multi_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
goto out;
}
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
/*
* bch2_btree_node_insert_fits() must be called under write lock:
* with only an intent lock, another thread can still call
* bch2_btree_node_write(), converting an unwritten bset to a
* written one
*/
u64s += i->k->k.u64s + i->extra_res;
if (!bch2_btree_node_insert_fits(c,
i->iter->l[0].b, u64s)) {
ret = -EINTR;
*split = i->iter;
goto out;
}
}
if (journal_seq_verify(c) &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
trans_for_each_entry(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
trans_for_each_entry(trans, i) {
switch (btree_insert_key_leaf(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
case BTREE_INSERT_JOURNAL_RES_FULL:
case BTREE_INSERT_NEED_TRAVERSE:
case BTREE_INSERT_NEED_RESCHED:
ret = -EINTR;
break;
case BTREE_INSERT_BTREE_NODE_FULL:
ret = -EINTR;
*split = i->iter;
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
case BTREE_INSERT_NEED_GC_LOCK:
ret = -EINTR;
*cycle_gc_lock = true;
break;
default:
BUG();
}
/*
* If we did some work (i.e. inserted part of an extent),
* we have to do all the other updates as well:
*/
if (!trans->did_work && (ret || *split))
break;
}
out:
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
}
/**
* __bch_btree_insert_at - insert keys at given iterator positions
*
@ -300,194 +415,142 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *split = NULL;
struct btree_iter *linked, *split = NULL;
bool cycle_gc_lock = false;
unsigned u64s;
unsigned flags;
int ret;
for_each_btree_iter(trans->entries[0].iter, linked)
bch2_btree_iter_verify_locks(linked);
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
trans_for_each_entry(trans, i) {
BUG_ON(i->iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
BUG_ON(debug_check_bkeys(c) &&
bch2_bkey_invalid(c, i->iter->btree_id,
bkey_i_to_s_c(i->k)));
BUG_ON(i->iter->uptodate == BTREE_ITER_END);
}
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry_locks:
ret = -EINTR;
trans_for_each_entry(trans, i) {
if (!bch2_btree_iter_set_locks_want(i->iter, 1))
goto err;
if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) {
ret = bch2_btree_iter_traverse(i->iter);
if (ret)
goto err;
}
}
retry:
trans->did_work = false;
u64s = 0;
trans_for_each_entry(trans, i)
if (!i->done)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
goto err;
multi_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
goto unlock;
}
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
/*
* bch2_btree_node_insert_fits() must be called under write lock:
* with only an intent lock, another thread can still call
* bch2_btree_node_write(), converting an unwritten bset to a
* written one
*/
if (!i->done) {
u64s += i->k->k.u64s + i->extra_res;
if (!bch2_btree_node_insert_fits(c,
i->iter->l[0].b, u64s)) {
split = i->iter;
goto unlock;
}
}
}
ret = 0;
split = NULL;
cycle_gc_lock = false;
trans_for_each_entry(trans, i) {
if (i->done)
continue;
switch (btree_insert_key_leaf(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
case BTREE_INSERT_JOURNAL_RES_FULL:
case BTREE_INSERT_NEED_TRAVERSE:
if (!bch2_btree_iter_upgrade(i->iter, 1)) {
ret = -EINTR;
break;
case BTREE_INSERT_NEED_RESCHED:
ret = -EAGAIN;
break;
case BTREE_INSERT_BTREE_NODE_FULL:
split = i->iter;
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
case BTREE_INSERT_NEED_GC_LOCK:
cycle_gc_lock = true;
ret = -EINTR;
break;
default:
BUG();
goto err;
}
if (!trans->did_work && (ret || split))
break;
if (i->iter->flags & BTREE_ITER_ERROR) {
ret = -EIO;
goto err;
}
}
unlock:
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
if (split)
goto split;
if (ret)
ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
if (unlikely(ret))
goto err;
trans_for_each_leaf(trans, i)
bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
trans_for_each_entry(trans, i)
if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
goto out;
trans_for_each_entry(trans, i) {
/*
* iterators are inconsistent when they hit end of leaf, until
* traversed again
*/
if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(c, i->iter, 0);
}
bch2_btree_iter_downgrade(i->iter);
out:
/* make sure we didn't lose an error: */
if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_entry(trans, i)
BUG_ON(!i->done);
percpu_ref_put(&c->writes);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
/* make sure we didn't drop or screw up locks: */
for_each_btree_iter(trans->entries[0].iter, linked) {
bch2_btree_iter_verify_locks(linked);
BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
trans->did_work &&
linked->uptodate >= BTREE_ITER_NEED_RELOCK);
}
/* make sure we didn't lose an error: */
if (!ret)
trans_for_each_entry(trans, i)
BUG_ON(!i->done);
}
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
return ret;
split:
/*
* have to drop journal res before splitting, because splitting means
* allocating new btree nodes, and holding a journal reservation
* potentially blocks the allocator:
*/
ret = bch2_btree_split_leaf(c, split, trans->flags);
/*
* This can happen when we insert part of an extent - with an update
* with multiple keys, we don't want to redo the entire update - that's
* just too confusing:
*/
if (!ret &&
(trans->flags & BTREE_INSERT_ATOMIC) &&
trans->did_work)
ret = -EINTR;
if (ret)
goto err;
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
* and is overwriting won't have changed)
*/
goto retry_locks;
err:
flags = trans->flags;
/*
* BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
* update; if we haven't done anything yet it doesn't apply
*/
if (!trans->did_work)
flags &= ~BTREE_INSERT_NOUNLOCK;
if (split) {
ret = bch2_btree_split_leaf(c, split, flags);
/*
* if the split succeeded without dropping locks the insert will
* still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
* caller peeked() and is overwriting won't have changed)
*/
#if 0
/*
* XXX:
* split -> btree node merging (of parent node) might still drop
* locks when we're not passing it BTREE_INSERT_NOUNLOCK
*/
if (!ret && !trans->did_work)
goto retry;
#endif
/*
* don't care if we got ENOSPC because we told split it
* couldn't block:
*/
if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
ret = -EINTR;
}
if (cycle_gc_lock) {
down_read(&c->gc_lock);
if (!down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
bch2_btree_iter_unlock(trans->entries[0].iter);
down_read(&c->gc_lock);
}
up_read(&c->gc_lock);
}
if (ret == -EINTR) {
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
trans_for_each_entry(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
goto out;
}
BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
}
/*
* BTREE_ITER_ATOMIC means we have to return -EINTR if we
* dropped locks:
*/
if (!(trans->flags & BTREE_INSERT_ATOMIC))
if (!(flags & BTREE_INSERT_ATOMIC))
goto retry;
}
@ -549,7 +612,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
BTREE_INSERT_ENTRY(&iter, k));
BTREE_INSERT_ENTRY(&iter, k));
bch2_btree_iter_unlock(&iter);
return ret;
@ -584,6 +647,11 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
if (bkey_cmp(iter.pos, end) >= 0)
break;
if (k.k->type == KEY_TYPE_DISCARD) {
bch2_btree_iter_next(&iter);
continue;
}
bkey_init(&delete.k);
/*
@ -615,8 +683,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
}
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete));
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete));
if (ret)
break;


@ -358,8 +358,9 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
old.data_type != new.data_type) {
BUG_ON(!c);
bch2_fs_inconsistent(c,
"different types of data in same bucket: %u, %u",
old.data_type, new.data_type);
"different types of data in same bucket: %s, %s",
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
}
dev_usage = this_cpu_ptr(ca->usage_percpu);


@ -109,14 +109,6 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
return true;
}
static const unsigned bch_crc_bytes[] = {
[BCH_CSUM_NONE] = 0,
[BCH_CSUM_CRC32C] = 4,
[BCH_CSUM_CRC64] = 8,
[BCH_CSUM_CHACHA20_POLY1305_80] = 10,
[BCH_CSUM_CHACHA20_POLY1305_128] = 16,
};
/* returns true if not equal */
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
{


@ -12,7 +12,8 @@
unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
unsigned len = bkey_val_bytes(d.k) -
offsetof(struct bch_dirent, d_name);
while (len && !d.v->d_name[len - 1])
--len;
@ -22,7 +23,8 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
static unsigned dirent_val_u64s(unsigned len)
{
return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64));
return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
sizeof(u64));
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@ -98,7 +100,7 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
return "value too big";
if (len > NAME_MAX)
if (len > BCH_NAME_MAX)
return "dirent name too big";
if (memchr(d.v->d_name, '/', len))
@ -141,9 +143,14 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
if (name->len > BCH_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
BUG_ON(u64s > U8_MAX);
dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
if (!dirent)
return NULL;
return ERR_PTR(-ENOMEM);
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
@ -153,7 +160,8 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
memcpy(dirent->v.d_name, name->name, name->len);
memset(dirent->v.d_name + name->len, 0,
bkey_val_bytes(&dirent->k) -
(sizeof(struct bch_dirent) + name->len));
offsetof(struct bch_dirent, d_name) -
name->len);
EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
@ -169,8 +177,8 @@ int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
int ret;
dirent = dirent_create_key(type, name, dst_inum);
if (!dirent)
return -ENOMEM;
if (IS_ERR(dirent))
return PTR_ERR(dirent);
ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
journal_seq, &dirent->k_i, flags);
@ -204,7 +212,7 @@ int bch2_dirent_rename(struct bch_fs *c,
struct bpos src_pos = bch2_dirent_pos(src_dir, src_name);
struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
bool need_whiteout;
int ret = -ENOMEM;
int ret;
bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@ -218,15 +226,19 @@ int bch2_dirent_rename(struct bch_fs *c,
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(0, src_name, 0);
if (!new_src)
if (IS_ERR(new_src)) {
ret = PTR_ERR(new_src);
goto err;
}
} else {
new_src = (void *) &delete;
}
new_dst = dirent_create_key(0, dst_name, 0);
if (!new_dst)
if (IS_ERR(new_dst)) {
ret = PTR_ERR(new_dst);
goto err;
}
retry:
/*
* Note that on -EINTR/dropped locks we're not restarting the lookup


@ -257,12 +257,12 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
int ret;
mutex_lock(&h->inode->ei_update_lock);
if (h->new_i_size != U64_MAX)
i_size_write(&h->inode->v, h->new_i_size);
i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
if (!ret && h->new_i_size != U64_MAX)
i_size_write(&h->inode->v, h->new_i_size);
mutex_unlock(&h->inode->ei_update_lock);
bch2_quota_reservation_put(c, h->inode, &h->quota_res);
@ -348,17 +348,25 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
return BTREE_INSERT_NEED_TRAVERSE;
}
BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
/* truncate in progress? */
if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)
goto no_i_size_update;
h->inode_u.bi_size = offset;
do_pack = true;
inode->ei_inode.bi_size = offset;
if (h->op->is_dio)
i_size_write(&inode->v, offset);
spin_lock(&inode->v.i_lock);
if (offset > inode->v.i_size) {
if (h->op->is_dio)
i_size_write(&inode->v, offset);
else
BUG();
}
spin_unlock(&inode->v.i_lock);
}
no_i_size_update:
if (sectors) {
if (!h->need_inode_update) {
h->need_inode_update = true;
@ -1457,8 +1465,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
copied = 0;
}
spin_lock(&inode->v.i_lock);
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
spin_unlock(&inode->v.i_lock);
if (copied) {
if (!PageUptodate(page))
@ -1563,8 +1573,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
inode->ei_last_dirtied = (unsigned long) current;
spin_lock(&inode->v.i_lock);
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
spin_unlock(&inode->v.i_lock);
if (copied < len &&
((offset + copied) & (PAGE_SIZE - 1))) {
@ -2047,10 +2059,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret;
ret = filemap_write_and_wait_range(inode->v.i_mapping, start, end);
ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
goto out;
ret = sync_inode_metadata(&inode->v, 1);
if (ret)
return ret;
out:
if (c->opts.journal_flush_disabled)
return 0;
@ -2149,25 +2168,61 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
from, from + PAGE_SIZE);
}
static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
int ret;
ret = filemap_write_and_wait_range(mapping,
inode->ei_inode.bi_size, S64_MAX);
if (ret)
return ret;
truncate_setsize(&inode->v, iattr->ia_size);
setattr_copy(&inode->v, iattr);
mutex_lock(&inode->ei_update_lock);
inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
mutex_unlock(&inode->ei_update_lock);
return ret;
}
int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
bool shrink = iattr->ia_size <= inode->v.i_size;
struct i_sectors_hook i_sectors_hook =
i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
bool shrink;
int ret = 0;
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
truncate_setsize(&inode->v, iattr->ia_size);
BUG_ON(inode->v.i_size < inode->ei_inode.bi_size);
shrink = iattr->ia_size <= inode->v.i_size;
if (!shrink) {
ret = bch2_extend(inode, iattr);
goto err_put_pagecache;
}
ret = bch2_truncate_page(inode, iattr->ia_size);
if (unlikely(ret))
goto err_put_pagecache;
/* sync appends.. */
/* XXX what protects inode->i_size? */
if (iattr->ia_size > inode->ei_inode.bi_size)
ret = filemap_write_and_wait_range(mapping,
inode->ei_inode.bi_size, S64_MAX);
inode->ei_inode.bi_size,
iattr->ia_size - 1);
else if (iattr->ia_size & (PAGE_SIZE - 1))
ret = filemap_write_and_wait_range(mapping,
round_down(iattr->ia_size, PAGE_SIZE),
iattr->ia_size - 1);
if (ret)
goto err_put_pagecache;
@ -2175,41 +2230,31 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
goto err_put_pagecache;
/*
* There might be persistent reservations (from fallocate())
* above i_size, which bch2_inode_truncate() will discard - we're
* only supposed to discard them if we're doing a real truncate
* here (new i_size < current i_size):
*/
if (shrink) {
ret = bch2_truncate_page(inode, iattr->ia_size);
if (unlikely(ret))
goto err;
truncate_setsize(&inode->v, iattr->ia_size);
ret = bch2_inode_truncate(c, inode->v.i_ino,
round_up(iattr->ia_size, PAGE_SIZE) >> 9,
&i_sectors_hook.hook,
&inode->ei_journal_seq);
if (unlikely(ret))
goto err;
}
ret = bch2_inode_truncate(c, inode->v.i_ino,
round_up(iattr->ia_size, PAGE_SIZE) >> 9,
&i_sectors_hook.hook,
&inode->ei_journal_seq);
if (unlikely(ret))
goto err_put_sectors_dirty;
setattr_copy(&inode->v, iattr);
inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
err:
/*
* On error - in particular, bch2_truncate_page() error - don't clear
* I_SIZE_DIRTY, as we've left data above i_size!:
*/
if (ret)
i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
out:
ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err_put_pagecache:
pagecache_block_put(&mapping->add_lock);
return ret;
err_put_sectors_dirty:
/*
* On error - in particular, bch2_truncate_page() error - don't clear
* I_SIZE_DIRTY, as we've left data above i_size!:
*/
i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
goto out;
}
/* fallocate: */
@ -2389,7 +2434,6 @@ btree_iter_err:
if (ret)
goto err_put_sectors_dirty;
i_size_write(&inode->v, new_size);
i_sectors_hook.new_i_size = new_size;
err_put_sectors_dirty:
ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;


@ -106,6 +106,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
break;
}
BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size);
if (set) {
ret = set(inode, &inode_u, p);
if (ret)
@ -114,6 +116,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size &&
!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
inode_u.bi_size > i_size_read(&inode->v));
inode_u.bi_mode = inode->v.i_mode;
inode_u.bi_uid = i_uid_read(&inode->v);
inode_u.bi_gid = i_gid_read(&inode->v);
@ -129,11 +135,17 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
ret = bch2_btree_insert_at(c, NULL, NULL,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
if (!ret) {
/*
* the btree node lock protects inode->ei_inode, not
* ei_update_lock; this is important for inode updates via
* bchfs_write_index_update
*/
inode->ei_inode = inode_u;
inode->ei_qid = bch_qid(&inode_u);
}
@ -1107,7 +1119,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
buf->f_namelen = NAME_MAX;
buf->f_namelen = BCH_NAME_MAX;
return 0;
}


@ -75,6 +75,19 @@ static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
}
static inline bool journal_entry_empty(struct jset *j)
{
struct jset_entry *i;
if (j->seq != j->last_seq)
return false;
vstruct_for_each(j, i)
if (i->type || i->u64s)
return false;
return true;
}
static enum {
JOURNAL_ENTRY_ERROR,
JOURNAL_ENTRY_INUSE,
@ -129,6 +142,11 @@ static enum {
/* XXX: why set this here, and not in bch2_journal_write()? */
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
if (journal_entry_empty(buf->data))
clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
else
set_bit(JOURNAL_NOT_EMPTY, &j->flags);
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
@ -884,8 +902,18 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
wait_event(j->wait, journal_flush_write(j));
/* do we need to write another journal entry? */
if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
c->btree_roots_dirty)
bch2_journal_meta(j);
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_NOT_EMPTY, &j->flags));
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
}


@ -13,37 +13,6 @@
#include <trace/events/bcachefs.h>
static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
enum btree_id id)
{
struct jset_entry *entry;
for_each_jset_entry_type(entry, j, type)
if (entry->btree_id == id)
return entry;
return NULL;
}
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j,
enum btree_id id, unsigned *level)
{
struct bkey_i *k;
struct jset_entry *entry =
bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id);
if (!entry)
return NULL;
if (!entry->u64s)
return ERR_PTR(-EINVAL);
k = entry->start;
*level = entry->level;
*level = entry->level;
return k;
}
struct journal_list {
struct closure cl;
struct mutex lock;
@ -717,6 +686,37 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
{
struct journal *j = &c->journal;
struct journal_entry_pin_list *p;
u64 seq, nr = end_seq - last_seq + 1;
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
atomic64_set(&j->seq, end_seq);
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = end_seq + 1;
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
return 0;
}
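Aside, an illustrative sketch rather than part of the diff (the wrapper and the sequence numbers are made up): bch2_journal_set_seq() sizes the pin fifo to cover one entry per open journal sequence, so for example:

static int example_set_seq(struct bch_fs *c)
{
	/* recovering a journal that spans sequence numbers 100..103: */
	int ret = bch2_journal_set_seq(c, 100, 103);

	/*
	 * on success the journal has four open entries:
	 * j->pin.front == 100, j->pin.back == 104, j->seq == 103,
	 * with the fifo resized to roundup_pow_of_two(4) == 4 if needed
	 */
	return ret;
}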
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
@ -724,10 +724,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay *i;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq, seq;
u64 cur_seq, end_seq;
unsigned iter;
size_t entries = 0;
u64 nr, keys = 0;
size_t keys = 0, entries = 0;
bool degraded = false;
int ret = 0;
@ -783,43 +782,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
}
i = list_last_entry(list, struct journal_replay, list);
nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
fsck_err_on(c->sb.clean && (keys || nr > 1), c,
"filesystem marked clean but journal not empty (%llu keys in %llu entries)",
keys, nr);
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
j->pin.front = le64_to_cpu(i->j.last_seq);
j->pin.back = le64_to_cpu(i->j.seq) + 1;
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
ret = bch2_journal_set_seq(c,
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq));
if (ret)
return ret;
mutex_lock(&j->blacklist_lock);
@ -842,6 +811,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
@ -863,10 +834,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
entries++;
}
bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
fsck_err:
return ret;
@ -950,7 +924,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
ret = bch2_journal_flush_all_pins(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
err:
bch2_journal_entries_free(list);
return ret;


@ -1,9 +1,6 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
@ -37,6 +34,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_entry_sectors(struct journal *);


@ -337,34 +337,22 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
return ret;
}
int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin *pin;
u64 pin_seq;
bool flush;
if (!test_bit(JOURNAL_STARTED, &j->flags))
return 0;
again:
wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
if (pin) {
/* flushing a journal pin might cause a new one to be added: */
return;
while (1) {
wait_event(j->wait, journal_flush_done(j, seq_to_flush,
&pin, &pin_seq));
if (!pin)
break;
pin->flush(j, pin, pin_seq);
goto again;
}
spin_lock(&j->lock);
flush = journal_last_seq(j) != j->last_seq_ondisk ||
(seq_to_flush == U64_MAX && c->btree_roots_dirty);
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
int bch2_journal_flush_all_pins(struct journal *j)
{
return bch2_journal_flush_pins(j, U64_MAX);
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
@ -383,7 +371,9 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = iter;
spin_unlock(&j->lock);
ret = bch2_journal_flush_pins(j, seq);
bch2_journal_flush_pins(j, seq);
ret = bch2_journal_error(j);
if (ret)
return ret;
@ -404,7 +394,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
}
spin_unlock(&j->lock);
bch2_replicas_gc_end(c, ret);
ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;


@ -29,8 +29,13 @@ void bch2_journal_pin_add_if_older(struct journal *,
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
void bch2_journal_flush_pins(struct journal *, u64);
static inline void bch2_journal_flush_all_pins(struct journal *j)
{
bch2_journal_flush_pins(j, U64_MAX);
}
int bch2_journal_flush_device_pins(struct journal *, int);
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */


@ -117,6 +117,7 @@ enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_NOT_EMPTY,
};
/* Embedded in struct bch_fs */


@ -126,7 +126,13 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
retry:
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
/*
* we might have found a btree node key we
* needed to update, and then tried to update it
* but got -EINTR after upgrading the iter, but
* then raced and the node is now gone:
*/
bch2_btree_iter_downgrade(&iter);
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
bkey_i_to_s_c(&b->key));
@ -141,11 +147,6 @@ retry:
if (ret)
goto err;
if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(&iter);
@ -160,7 +161,7 @@ retry:
ret = 0;
out:
bch2_replicas_gc_end(c, ret);
ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;


@ -137,6 +137,9 @@ enum opt_type {
BCH_OPT(degraded, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(discard, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \

libbcachefs/recovery.c Normal file

@ -0,0 +1,346 @@
#include "bcachefs.h"
#include "alloc.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
#include "quota.h"
#include "recovery.h"
#include "super-io.h"
#include <linux/stat.h>
struct bkey_i *btree_root_find(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct jset *j,
enum btree_id id, unsigned *level)
{
struct bkey_i *k;
struct jset_entry *entry, *start, *end;
if (clean) {
start = clean->start;
end = vstruct_end(&clean->field);
} else {
start = j->start;
end = vstruct_last(j);
}
for (entry = start; entry < end; entry = vstruct_next(entry))
if (entry->type == BCH_JSET_ENTRY_btree_root &&
entry->btree_id == id)
goto found;
return NULL;
found:
if (!entry->u64s)
return ERR_PTR(-EINVAL);
k = entry->start;
*level = entry->level;
return k;
}
static int verify_superblock_clean(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct jset *j)
{
unsigned i;
int ret = 0;
if (!clean || !j)
return 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
le64_to_cpu(j->seq)))
bch2_fs_mark_clean(c, false);
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
"superblock read clock doesn't match journal after clean shutdown");
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock read clock doesn't match journal after clean shutdown");
for (i = 0; i < BTREE_ID_NR; i++) {
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
k1 = btree_root_find(c, clean, NULL, i, &l1);
k2 = btree_root_find(c, NULL, j, i, &l2);
if (!k1 && !k2)
continue;
mustfix_fsck_err_on(!k1 || !k2 ||
IS_ERR(k1) ||
IS_ERR(k2) ||
k1->k.u64s != k2->k.u64s ||
memcmp(k1, k2, bkey_bytes(k1)) ||
l1 != l2, c,
"superblock btree root doesn't match journal after clean shutdown");
}
fsck_err:
return ret;
}
static bool journal_empty(struct list_head *journal)
{
struct journal_replay *i;
struct jset_entry *entry;
if (list_empty(journal))
return true;
i = list_last_entry(journal, struct journal_replay, list);
if (i->j.last_seq != i->j.seq)
return false;
list_for_each_entry(i, journal, list) {
vstruct_for_each(&i->j, entry) {
if (entry->type == BCH_JSET_ENTRY_btree_root)
continue;
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
!entry->u64s)
continue;
return false;
}
}
return true;
}
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
LIST_HEAD(journal);
struct jset *j = NULL;
unsigned i;
int ret;
mutex_lock(&c->sb_lock);
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
if (c->sb.clean)
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
if (sb_clean) {
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
GFP_KERNEL);
if (!clean) {
ret = -ENOMEM;
mutex_unlock(&c->sb_lock);
goto err;
}
}
mutex_unlock(&c->sb_lock);
if (clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
if (!clean || !c->opts.nofsck) {
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
} else {
ret = bch2_journal_set_seq(c,
le64_to_cpu(clean->journal_seq),
le64_to_cpu(clean->journal_seq));
BUG_ON(ret);
}
ret = verify_superblock_clean(c, clean, j);
if (ret)
goto err;
fsck_err_on(clean && !journal_empty(&journal), c,
"filesystem marked clean but journal not empty");
if (clean) {
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
} else {
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
}
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
struct bkey_i *k;
k = btree_root_find(c, clean, j, i, &level);
if (!k)
continue;
err = "invalid btree root pointer";
if (IS_ERR(k))
goto err;
err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level)) {
if (i != BTREE_ID_ALLOC)
goto err;
mustfix_fsck_err(c, "error reading btree root");
}
}
for (i = 0; i < BTREE_ID_NR; i++)
if (!c->btree_roots[i].b)
bch2_btree_root_alloc(c, i);
err = "error reading allocation information";
ret = bch2_alloc_read(c, &journal);
if (ret)
goto err;
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
if (c->opts.noreplay)
goto out;
/*
* Mark dirty before journal replay, fsck:
* XXX: after a clean shutdown, this could be done lazily only when fsck
* finds an error
*/
bch2_fs_mark_clean(c, false);
/*
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
goto err;
bch_verbose(c, "starting journal replay:");
err = "journal replay failed";
ret = bch2_journal_replay(c, &journal);
if (ret)
goto err;
bch_verbose(c, "journal replay done");
if (c->opts.norecovery)
goto out;
bch_verbose(c, "starting fsck:");
err = "error in fsck";
ret = bch2_fsck(c, !c->opts.nofsck);
if (ret)
goto err;
bch_verbose(c, "fsck done");
if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
bch_verbose(c, "quotas done");
}
out:
bch2_journal_entries_free(&journal);
kfree(clean);
return ret;
err:
fsck_err:
BUG_ON(!ret);
goto out;
}
int bch2_fs_initialize(struct bch_fs *c)
{
struct bch_inode_unpacked inode;
struct bkey_inode_buf packed_inode;
const char *err = "cannot allocate memory";
struct bch_dev *ca;
LIST_HEAD(journal);
unsigned i;
int ret;
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i)
if (bch2_dev_journal_alloc(ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal);
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
goto err;
bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
err = "error creating root directory";
if (bch2_btree_insert(c, BTREE_ID_INODES,
&packed_inode.inode.k_i,
NULL, NULL, NULL, 0))
goto err;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
}
err = "error writing first journal entry";
if (bch2_journal_meta(&c->journal))
goto err;
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
err:
BUG_ON(!ret);
return ret;
}

libbcachefs/recovery.h Normal file

@ -0,0 +1,7 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
#endif /* _BCACHEFS_RECOVERY_H */


@ -215,10 +215,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
return 0;
err:
mutex_unlock(&c->sb_lock);
if (new_gc)
kfree(new_gc);
if (new_r)
kfree(new_r);
kfree(new_gc);
kfree(new_r);
return ret;
}
@ -265,10 +263,9 @@ int bch2_mark_bkey_replicas(struct bch_fs *c,
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
struct bch_replicas_cpu *new_r, *old_r;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@ -276,29 +273,31 @@ int bch2_replicas_gc_end(struct bch_fs *c, int err)
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas_gc, NULL);
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(new_r, rcu);
if (ret)
goto err;
}
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
out:
mutex_unlock(&c->sb_lock);
return ret;
err:
kfree_rcu(new_r, rcu);
goto out;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)


@ -237,6 +237,7 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
{
struct bkey_s_c k;
bch2_btree_iter_copy(iter, start);
bch2_btree_iter_next_slot(iter);
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {


@ -4,6 +4,7 @@
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "replicas.h"
#include "quota.h"
#include "super-io.h"
@ -89,6 +90,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
struct bch_sb *new_sb;
struct bio *bio;
if (sb->sb && sb->page_order >= order)
return 0;
if (sb->have_layout) {
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
@ -849,6 +853,84 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
.validate = bch2_sb_validate_crypt,
};
/* BCH_SB_FIELD_clean: */
void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
{
struct bch_sb_field_clean *sb_clean;
unsigned u64s = sizeof(*sb_clean) / sizeof(u64);
struct jset_entry *entry;
struct btree_root *r;
mutex_lock(&c->sb_lock);
if (clean == BCH_SB_CLEAN(c->disk_sb.sb))
goto out;
SET_BCH_SB_CLEAN(c->disk_sb.sb, clean);
if (!clean)
goto write_super;
mutex_lock(&c->btree_root_lock);
for (r = c->btree_roots;
r < c->btree_roots + BTREE_ID_NR;
r++)
if (r->alive)
u64s += jset_u64s(r->key.u64s);
sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
if (!sb_clean) {
bch_err(c, "error resizing superblock while setting filesystem clean");
goto out;
}
sb_clean->flags = 0;
sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1;
entry = sb_clean->start;
memset(entry, 0,
vstruct_end(&sb_clean->field) - (void *) entry);
for (r = c->btree_roots;
r < c->btree_roots + BTREE_ID_NR;
r++)
if (r->alive) {
entry->u64s = r->key.u64s;
entry->btree_id = r - c->btree_roots;
entry->level = r->level;
entry->type = BCH_JSET_ENTRY_btree_root;
bkey_copy(&entry->start[0], &r->key);
entry = vstruct_next(entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
}
BUG_ON(entry != vstruct_end(&sb_clean->field));
mutex_unlock(&c->btree_root_lock);
write_super:
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
}
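Aside, an illustrative sketch, not part of the diff (the function name is invented): the jset entries packed above can be walked back out of the clean section the same way recovery does, e.g.:

static void example_print_clean_roots(struct bch_sb_field_clean *clean)
{
	struct jset_entry *entry;

	/* walk the variable length entries written by bch2_fs_mark_clean() */
	for (entry = clean->start;
	     entry < (struct jset_entry *) vstruct_end(&clean->field);
	     entry = vstruct_next(entry))
		if (entry->type == BCH_JSET_ENTRY_btree_root)
			pr_info("btree %u root, level %u",
				entry->btree_id, entry->level);
}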
static const char *bch2_sb_validate_clean(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
if (vstruct_bytes(&clean->field) < sizeof(*clean))
return "invalid field crypt: wrong size";
return NULL;
}
static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
.validate = bch2_sb_validate_clean,
};
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
#define x(f, nr) \
[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,


@ -131,6 +131,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
/* BCH_SB_FIELD_clean: */
void bch2_fs_mark_clean(struct bch_fs *, bool);
size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
struct bch_sb_field *);


@ -10,7 +10,6 @@
#include "alloc.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "chardev.h"
@ -26,14 +25,13 @@
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@ -201,18 +199,6 @@ int bch2_congested(void *data, int bdi_bits)
* - allocator depends on the journal (when it rewrites prios and gens)
*/
static void bch_fs_mark_clean(struct bch_fs *c)
{
if (!bch2_journal_error(&c->journal) &&
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
}
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
@ -229,7 +215,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
bch2_journal_flush_pins(&c->journal, U64_MAX - 1);
bch2_journal_flush_all_pins(&c->journal);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
@ -246,9 +232,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_verify_flushed(c);
bch2_fs_journal_stop(&c->journal);
/*
@ -257,6 +240,8 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_flush_all_writes(c);
else
bch2_btree_verify_flushed(c);
/*
* After stopping journal:
@ -275,12 +260,10 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
void bch2_fs_read_only(struct bch_fs *c)
{
if (c->state != BCH_FS_STARTING &&
c->state != BCH_FS_RW)
if (c->state == BCH_FS_RO)
return;
if (test_bit(BCH_FS_ERROR, &c->flags))
return;
BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
/*
* Block new foreground-end write operations from starting - any new
@ -311,13 +294,18 @@ void bch2_fs_read_only(struct bch_fs *c)
__bch2_fs_read_only(c);
bch_fs_mark_clean(c);
wait_event(bch_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
c->state = BCH_FS_RO;
if (!bch2_journal_error(&c->journal) &&
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_fs_mark_clean(c, true);
if (c->state != BCH_FS_STOPPING)
c->state = BCH_FS_RO;
}
static void bch2_fs_read_only_work(struct work_struct *work)
@ -352,10 +340,11 @@ const char *bch2_fs_read_write(struct bch_fs *c)
const char *err = NULL;
unsigned i;
if (c->state != BCH_FS_STARTING &&
c->state != BCH_FS_RO)
if (c->state == BCH_FS_RW)
return NULL;
bch2_fs_mark_clean(c, false);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@ -446,11 +435,6 @@ void bch2_fs_stop(struct bch_fs *c)
struct bch_dev *ca;
unsigned i;
mutex_lock(&c->state_lock);
BUG_ON(c->state == BCH_FS_STOPPING);
c->state = BCH_FS_STOPPING;
mutex_unlock(&c->state_lock);
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
@ -475,11 +459,9 @@ void bch2_fs_stop(struct bch_fs *c)
closure_debug_destroy(&c->cl);
mutex_lock(&c->state_lock);
__bch2_fs_read_only(c);
bch2_fs_read_only(c);
mutex_unlock(&c->state_lock);
bch_fs_mark_clean(c);
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
@ -695,9 +677,7 @@ const char *bch2_fs_start(struct bch_fs *c)
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
struct bch_dev *ca;
LIST_HEAD(journal);
struct jset *j;
time64_t now;
time64_t now = ktime_get_seconds();
unsigned i;
int ret = -EINVAL;
@ -706,157 +686,26 @@ const char *bch2_fs_start(struct bch_fs *c)
BUG_ON(c->state != BCH_FS_STARTING);
mutex_lock(&c->sb_lock);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
mi = bch2_sb_get_members(c->disk_sb.sb);
for_each_online_member(ca, c, i)
mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
if (ret)
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
struct bkey_i *k;
k = bch2_journal_find_btree_root(c, j, i, &level);
if (!k)
continue;
err = "invalid btree root pointer";
if (IS_ERR(k))
goto err;
err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level)) {
if (i != BTREE_ID_ALLOC)
goto err;
mustfix_fsck_err(c, "error reading btree root");
}
}
for (i = 0; i < BTREE_ID_NR; i++)
if (!c->btree_roots[i].b)
bch2_btree_root_alloc(c, i);
err = "error reading allocation information";
ret = bch2_alloc_read(c, &journal);
if (ret)
goto err;
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
if (c->opts.noreplay)
goto recovery_done;
/*
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
goto err;
bch_verbose(c, "starting journal replay:");
err = "journal replay failed";
ret = bch2_journal_replay(c, &journal);
if (ret)
goto err;
bch_verbose(c, "journal replay done");
if (c->opts.norecovery)
goto recovery_done;
bch_verbose(c, "starting fsck:");
err = "error in fsck";
ret = bch2_fsck(c, !c->opts.nofsck);
if (ret)
goto err;
bch_verbose(c, "fsck done");
if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
bch_verbose(c, "quotas done");
}
} else {
struct bch_inode_unpacked inode;
struct bkey_inode_buf packed_inode;
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i)
if (bch2_dev_journal_alloc(ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal);
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
goto err;
bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
err = "error creating root directory";
if (bch2_btree_insert(c, BTREE_ID_INODES,
&packed_inode.inode.k_i,
NULL, NULL, NULL, 0))
goto err;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
}
err = "error writing first journal entry";
if (bch2_journal_meta(&c->journal))
goto err;
}
recovery_done:
err = "dynamic fault";
if (bch2_fs_init_fault("fs_start"))
goto err;
@ -869,28 +718,13 @@ recovery_done:
goto err;
}
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
now = ktime_get_seconds();
for_each_member_device(ca, c, i)
mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_STARTED, &c->flags);
err = NULL;
out:
mutex_unlock(&c->state_lock);
bch2_journal_entries_free(&journal);
return err;
err:
fsck_err:
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
@ -1091,6 +925,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
if (opt_defined(c->opts, discard))
ca->mi.discard = opt_get(c->opts, discard);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
@ -1454,7 +1291,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* must flush all existing journal entries, they might have
* (overwritten) keys that point to the device we're removing:
*/
ret = bch2_journal_flush_all_pins(&c->journal);
bch2_journal_flush_all_pins(&c->journal);
ret = bch2_journal_error(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
@ -1615,6 +1453,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@ -1646,6 +1485,15 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
mi->members[ca->dev_idx].last_mount =
cpu_to_le64(ktime_get_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
mutex_unlock(&c->state_lock);
return 0;
err:


@ -27,6 +27,7 @@
#include "rebalance.h"
#include "replicas.h"
#include "super-io.h"
#include "tests.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@ -192,6 +193,10 @@ rw_attribute(pd_controllers_update_seconds);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
#define BCH_DEBUG_PARAM(name, description) \
rw_attribute(name);
@ -446,7 +451,25 @@ STORE(__bch2_fs)
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
}
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
char *test = strsep(&p, " \t\n");
char *nr_str = strsep(&p, " \t\n");
char *threads_str = strsep(&p, " \t\n");
unsigned threads;
u64 nr;
int ret = -EINVAL;
if (threads_str &&
!(ret = kstrtouint(threads_str, 10, &threads)) &&
!(ret = bch2_strtoull_h(nr_str, &nr)))
bch2_btree_perf_test(c, test, nr, threads);
else
size = ret;
kfree(tmp);
}
#endif
return size;
}
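Aside, an illustrative note rather than part of the diff (the test name and values are made up): writing "rand_insert 1M 4" to the new perf_test attribute is parsed by the code above into a direct call roughly equivalent to:

	/* nr is parsed with bch2_strtoull_h(), so "1M" means 1 << 20 */
	bch2_btree_perf_test(c, "rand_insert", 1ULL << 20, 4);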
@ -477,6 +500,10 @@ struct attribute *bch2_fs_files[] = {
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
#ifdef CONFIG_BCACHEFS_TESTS
&sysfs_perf_test,
#endif
NULL
};

libbcachefs/tests.c Normal file

@ -0,0 +1,289 @@
#ifdef CONFIG_BCACHEFS_TESTS
#include "bcachefs.h"
#include "btree_update.h"
#include "tests.h"
#include "linux/kthread.h"
#include "linux/random.h"
static void test_delete(struct bch_fs *c, u64 nr)
{
struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&iter);
BUG_ON(ret);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
BTREE_INSERT_ENTRY(&iter, &k.k_i));
BUG_ON(ret);
pr_info("deleting once");
ret = bch2_btree_delete_at(&iter, 0);
BUG_ON(ret);
pr_info("deleting twice");
ret = bch2_btree_delete_at(&iter, 0);
BUG_ON(ret);
bch2_btree_iter_unlock(&iter);
}
static u64 test_rand(void)
{
u64 v;
#if 0
v = prandom_u32();
#else
prandom_bytes(&v, sizeof(v));
#endif
return v;
}
static void rand_insert(struct bch_fs *c, u64 nr)
{
struct bkey_i_cookie k;
int ret;
u64 i;
for (i = 0; i < nr; i++) {
bkey_cookie_init(&k.k_i);
k.k.p.offset = test_rand();
ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
NULL, NULL, NULL, 0);
BUG_ON(ret);
}
}
static void rand_lookup(struct bch_fs *c, u64 nr)
{
u64 i;
for (i = 0; i < nr; i++) {
struct btree_iter iter;
struct bkey_s_c k;
bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
POS(0, test_rand()), 0);
k = bch2_btree_iter_peek(&iter);
bch2_btree_iter_unlock(&iter);
}
}
static void rand_mixed(struct bch_fs *c, u64 nr)
{
int ret;
u64 i;
for (i = 0; i < nr; i++) {
struct btree_iter iter;
struct bkey_s_c k;
bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
POS(0, test_rand()), 0);
k = bch2_btree_iter_peek(&iter);
if (!(i & 3) && k.k) {
struct bkey_i_cookie k;
bkey_cookie_init(&k.k_i);
k.k.p = iter.pos;
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
BTREE_INSERT_ENTRY(&iter, &k.k_i));
BUG_ON(ret);
}
bch2_btree_iter_unlock(&iter);
}
}
static void rand_delete(struct bch_fs *c, u64 nr)
{
struct bkey_i k;
int ret;
u64 i;
for (i = 0; i < nr; i++) {
bkey_init(&k.k);
k.k.p.offset = test_rand();
ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
NULL, NULL, NULL, 0);
BUG_ON(ret);
}
}
static void seq_insert(struct bch_fs *c, u64 nr)
{
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_cookie insert;
int ret;
u64 i = 0;
bkey_cookie_init(&insert.k_i);
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
insert.k.p = iter.pos;
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
BTREE_INSERT_ENTRY(&iter, &insert.k_i));
BUG_ON(ret);
if (++i == nr)
break;
}
bch2_btree_iter_unlock(&iter);
}
static void seq_lookup(struct bch_fs *c, u64 nr)
{
struct btree_iter iter;
struct bkey_s_c k;
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k)
;
bch2_btree_iter_unlock(&iter);
}
static void seq_overwrite(struct bch_fs *c, u64 nr)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
BTREE_ITER_INTENT, k) {
struct bkey_i_cookie u;
bkey_reassemble(&u.k_i, k);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
BTREE_INSERT_ENTRY(&iter, &u.k_i));
BUG_ON(ret);
}
bch2_btree_iter_unlock(&iter);
}
static void seq_delete(struct bch_fs *c, u64 nr)
{
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
POS_MIN, POS_MAX,
ZERO_VERSION, NULL, NULL, NULL);
BUG_ON(ret);
}
typedef void (*perf_test_fn)(struct bch_fs *, u64);
struct test_job {
struct bch_fs *c;
u64 nr;
unsigned nr_threads;
perf_test_fn fn;
atomic_t ready;
wait_queue_head_t ready_wait;
atomic_t done;
struct completion done_completion;
u64 start;
u64 finish;
};
static int btree_perf_test_thread(void *data)
{
struct test_job *j = data;
if (atomic_dec_and_test(&j->ready)) {
wake_up(&j->ready_wait);
j->start = sched_clock();
} else {
wait_event(j->ready_wait, !atomic_read(&j->ready));
}
j->fn(j->c, j->nr / j->nr_threads);
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
complete(&j->done_completion);
}
return 0;
}
void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
char name_buf[20], nr_buf[20], per_sec_buf[20];
unsigned i;
u64 time;
atomic_set(&j.ready, nr_threads);
init_waitqueue_head(&j.ready_wait);
atomic_set(&j.done, nr_threads);
init_completion(&j.done_completion);
#define perf_test(_test) \
if (!strcmp(testname, #_test)) j.fn = _test
perf_test(rand_insert);
perf_test(rand_lookup);
perf_test(rand_mixed);
perf_test(rand_delete);
perf_test(seq_insert);
perf_test(seq_lookup);
perf_test(seq_overwrite);
perf_test(seq_delete);
/* a unit test, not a perf test: */
perf_test(test_delete);
if (!j.fn) {
pr_err("unknown test %s", testname);
return;
}
//pr_info("running test %s:", testname);
if (nr_threads == 1)
btree_perf_test_thread(&j);
else
for (i = 0; i < nr_threads; i++)
kthread_run(btree_perf_test_thread, &j,
"bcachefs perf test[%u]", i);
while (wait_for_completion_interruptible(&j.done_completion))
;
time = j.finish - j.start;
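/*
 * time is wall clock for the whole run and the nr iterations were split
 * across nr_threads workers, so time * nr_threads / nr approximates
 * thread-time per iteration while nr * NSEC_PER_SEC / time is the
 * aggregate rate per second.
 */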
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
bch2_hprint(nr_buf, nr);
bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time);
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
name_buf, nr_buf, nr_threads,
time / NSEC_PER_SEC,
time * nr_threads / nr,
per_sec_buf);
}
#endif /* CONFIG_BCACHEFS_TESTS */
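For orientation, a minimal sketch of invoking the harness from kernel code; run_insert_benchmark() is a hypothetical caller, and the workload name and sizes are illustrative rather than taken from this commit:

static void run_insert_benchmark(struct bch_fs *c)
{
	/* "rand_insert" resolves through the perf_test() table above */
	bch2_btree_perf_test(c, "rand_insert", 1ULL << 20, 4);
}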

libbcachefs/tests.h
View File

@ -0,0 +1,14 @@
#ifndef _BCACHEFS_TEST_H
#define _BCACHEFS_TEST_H

struct bch_fs;

#ifdef CONFIG_BCACHEFS_TESTS

void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);

#else

#endif /* CONFIG_BCACHEFS_TESTS */

#endif /* _BCACHEFS_TEST_H */

View File

@ -27,55 +27,73 @@
#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
static const char si_units[] = "?kMGTPEZY";
static int __bch2_strtoh(const char *cp, u64 *res,
u64 t_max, bool t_signed)
{
bool positive = *cp != '-';
unsigned u;
u64 v = 0;
if (*cp == '+' || *cp == '-')
cp++;
if (!isdigit(*cp))
return -EINVAL;
do {
if (v > U64_MAX / 10)
return -ERANGE;
v *= 10;
if (v > U64_MAX - (*cp - '0'))
return -ERANGE;
v += *cp - '0';
cp++;
} while (isdigit(*cp));
for (u = 1; u < ARRAY_SIZE(si_units); u++)
if (*cp == si_units[u]) {
cp++;
goto got_unit;
}
u = 0;
got_unit:
if (*cp == '\n')
cp++;
if (*cp)
return -EINVAL;
if (fls64(v) + u * 10 > 64)
return -ERANGE;
v <<= u * 10;
if (positive) {
if (v > t_max)
return -ERANGE;
} else {
if (v && !t_signed)
return -ERANGE;
if (v > t_max + 1)
return -ERANGE;
v = -v;
}
*res = v;
return 0;
}
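As a sketch of the accepted syntax (values illustrative; bch2_strtoull_h() is one of the STRTO_H() instantiations below): an optional sign, decimal digits, at most one SI suffix from si_units scaling by powers of 1024, and an optional trailing newline.

unsigned long long v;

BUG_ON(bch2_strtoull_h("16M", &v) || v != (16ULL << 20));
BUG_ON(bch2_strtoull_h("1k\n", &v) || v != 1024);	/* trailing newline ok */
BUG_ON(bch2_strtoull_h("16 M", &v) != -EINVAL);		/* no embedded spaces */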
#define STRTO_H(name, type) \
int bch2_ ## name ## _h(const char *cp, type *res) \
{ \
int u = 0; \
char *e; \
type i = simple_ ## name(cp, &e, 10); \
\
switch (tolower(*e)) { \
default: \
return -EINVAL; \
case 'y': \
case 'z': \
u++; \
case 'e': \
u++; \
case 'p': \
u++; \
case 't': \
u++; \
case 'g': \
u++; \
case 'm': \
u++; \
case 'k': \
u++; \
if (e++ == cp) \
return -EINVAL; \
case '\n': \
case '\0': \
if (*e == '\n') \
e++; \
} \
\
if (*e) \
return -EINVAL; \
\
while (u--) { \
if ((type) ~0 > 0 && \
(type) ~0 / 1024 <= i) \
return -EINVAL; \
if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
(i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
return -EINVAL; \
i *= 1024; \
} \
\
*res = i; \
return 0; \
} \
u64 v; \
int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
ANYSINT_MAX(type) != ((type) ~0ULL)); \
*res = v; \
return ret; \
}
STRTO_H(strtoint, int)
STRTO_H(strtouint, unsigned int)
@ -84,7 +102,6 @@ STRTO_H(strtoull, unsigned long long)
ssize_t bch2_hprint(char *buf, s64 v)
{
static const char units[] = "?kMGTPEZY";
char dec[4] = "";
int u, t = 0;
@ -103,7 +120,7 @@ ssize_t bch2_hprint(char *buf, s64 v)
if (v < 100 && v > -100)
scnprintf(dec, sizeof(dec), ".%i", t / 103);
return sprintf(buf, "%lli%s%c", v, dec, units[u]);
return sprintf(buf, "%lli%s%c", v, dec, si_units[u]);
}
ssize_t bch2_scnprint_string_list(char *buf, size_t size,

View File

@ -15,7 +15,7 @@
static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
return DIV_ROUND_UP(sizeof(struct bch_xattr) +
return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
name_len + val_len, sizeof(u64));
}