Update bcachefs sources to 22fa8fc32e6a bcachefs: rcu_pending now works in userspace

This commit is contained in:
Kent Overstreet 2024-08-23 15:38:12 -04:00
parent 6f938e0399
commit b422b19f63
50 changed files with 1320 additions and 638 deletions

View File

@ -1 +1 @@
62439c6f1a6dba3fca1e57f352745d6e36dd1e31
22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d

2
Cargo.lock generated
View File

@ -73,7 +73,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bcachefs-tools"
version = "1.11.1"
version = "1.12.0"
dependencies = [
"anyhow",
"bch_bindgen",

View File

@ -1,6 +1,6 @@
[package]
name = "bcachefs-tools"
version = "1.11.1"
version = "1.12.0"
authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>", "Kent Overstreet <kent.overstreet@linux.dev>" ]
edition = "2021"
rust-version = "1.70"

View File

@ -1,4 +1,4 @@
VERSION=1.11.1
VERSION=1.12.0
PREFIX?=/usr/local
LIBEXECDIR?=$(PREFIX)/libexec

View File

@ -41,6 +41,7 @@
#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math.h>
#include <linux/slab.h>
#include <linux/types.h>
struct genradix_root;
@ -48,10 +49,63 @@ struct genradix_root;
/* log2 of the size, in bytes, of one tree node: */
#define GENRADIX_NODE_SHIFT 9
/* Size of one tree node, in bytes: */
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
/* Number of child pointers that fit in one node: */
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
/* log2 of GENRADIX_ARY, i.e. offset bits consumed by each interior level: */
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
/*
 * Mask covering the low bits of a struct genradix_root pointer; those bits
 * hold the tree depth, the remaining bits the root node address (see
 * genradix_root_to_depth() and genradix_root_to_node()):
 */
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
/*
 * Number of offset bits covered by a tree of the given depth:
 * GENRADIX_NODE_SHIFT bits for the position inside a leaf, plus
 * GENRADIX_ARY_SHIFT bits for every level above it.
 */
static inline int genradix_depth_shift(unsigned depth)
{
	return GENRADIX_ARY_SHIFT * depth + GENRADIX_NODE_SHIFT;
}
/*
 * Size, in bytes, of the data that a tree of the given depth can hold.
 *
 * NOTE(review): this is 1UL shifted left by genradix_depth_shift(depth); the
 * shift is undefined once that count reaches the width of unsigned long, so
 * callers presumably only pass depths below that limit - confirm.
 */
static inline size_t genradix_depth_size(unsigned depth)
{
	int shift = genradix_depth_shift(depth);

	return 1UL << shift;
}
/*
 * Extract the tree depth stored in the low bits (GENRADIX_DEPTH_MASK) of a
 * tagged root pointer.
 */
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
	unsigned long tagged = (unsigned long) r;

	return tagged & GENRADIX_DEPTH_MASK;
}
/*
 * Strip the depth bits (GENRADIX_DEPTH_MASK) from a tagged root pointer,
 * leaving the address of the root node.
 */
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
	unsigned long tagged = (unsigned long) r;

	return (void *) (tagged & ~GENRADIX_DEPTH_MASK);
}
/*
 * Untyped radix tree handle: a single tagged root pointer. The low bits of
 * @root carry the tree depth and the remaining bits the address of the root
 * node (decoded by genradix_root_to_depth() and genradix_root_to_node()).
 */
struct __genradix {
struct genradix_root *root;
};
/*
 * One tree node: the same GENRADIX_NODE_SIZE bytes of storage, viewed either
 * as an array of GENRADIX_ARY child pointers or as raw object data.
 */
struct genradix_node {
union {
/* Interior node: */
struct genradix_node *children[GENRADIX_ARY];
/* Leaf: */
u8 data[GENRADIX_NODE_SIZE];
};
};
/*
 * Allocate one node of GENRADIX_NODE_SIZE bytes with kzalloc() using
 * @gfp_mask. The kzalloc() result is returned as-is, so it is whatever
 * kzalloc() yields on failure.
 */
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
	struct genradix_node *node = kzalloc(GENRADIX_NODE_SIZE, gfp_mask);

	return node;
}
/* Release @node with kfree(). */
static inline void genradix_free_node(struct genradix_node *node)
{
kfree(node);
}
/*
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
/*
 * Inlined lookup: return a pointer to the byte at @offset within @radix, or
 * NULL if the tree is too shallow to cover @offset or a node on the path is
 * missing. Never allocates.
 *
 * The root is sampled once with READ_ONCE(); depth and root node are both
 * decoded from that single value.
 */
static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
{
	struct genradix_root *root = READ_ONCE(radix->root);
	struct genradix_node *node = genradix_root_to_node(root);
	unsigned depth = genradix_root_to_depth(root);
	unsigned bits = genradix_depth_shift(depth);

	/*
	 * NOTE(review): the bound is recomputed through genradix_depth_shift()
	 * (which returns int) instead of reusing the unsigned 'bits' -
	 * presumably so the comparison stays signed; confirm against ilog2().
	 */
	if (unlikely(ilog2(offset) >= genradix_depth_shift(depth)))
		return NULL;

	for (;;) {
		if (!node)
			return NULL;

		/* Reached leaf level: what is left of offset indexes the data */
		if (bits <= GENRADIX_NODE_SHIFT)
			return &node->data[offset];

		/* Descend one level: top bits pick the child, the rest carry on */
		bits -= GENRADIX_ARY_SHIFT;
		node = node->children[offset >> bits];
		offset &= (1UL << bits) - 1;
	}
}
/*
 * genradix_ptr_inlined - inlined lookup of entry @_idx in @_radix
 *
 * Converts @_idx to a byte offset with __genradix_idx_to_offset(), walks the
 * tree with __genradix_ptr_inlined() and applies __genradix_cast() to the
 * result. The result is NULL when the walk finds no node for that offset.
 */
#define genradix_ptr_inlined(_radix, _idx) \
(__genradix_cast(_radix) \
__genradix_ptr_inlined(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
void *__genradix_ptr(struct __genradix *, size_t);
/**
@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
__genradix_ptr(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
void *__genradix_ptr_alloc(struct __genradix *, size_t,
struct genradix_node **, gfp_t);
/*
 * genradix_ptr_alloc_inlined - look up entry @_idx, allocating on a miss
 *
 * Tries the inlined lookup first; only when that yields NULL is
 * __genradix_ptr_alloc() called, with NULL for its node argument and @_gfp.
 *
 * NOTE: @_radix and @_idx appear in both arms of the ?: and are therefore
 * evaluated a second time when the lookup misses - do not pass expressions
 * with side effects.
 */
#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \
(__genradix_cast(_radix) \
(__genradix_ptr_inlined(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)) ?: \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
NULL, _gfp)))
/*
 * genradix_ptr_alloc_preallocated_inlined - like genradix_ptr_alloc_inlined(),
 * but forwards @_new_node to __genradix_ptr_alloc() instead of NULL.
 *
 * NOTE(review): @_new_node is presumably a caller-preallocated node the
 * allocator may consume - confirm against __genradix_ptr_alloc().
 *
 * NOTE: @_radix and @_idx appear in both arms of the ?: and are therefore
 * evaluated a second time when the lookup misses - do not pass expressions
 * with side effects.
 */
#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
(__genradix_cast(_radix) \
(__genradix_ptr_inlined(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)) ?: \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
_new_node, _gfp)))
/**
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
_gfp))
NULL, _gfp))
/*
 * genradix_ptr_alloc_preallocated - call __genradix_ptr_alloc() for entry
 * @_idx of @_radix, forwarding @_new_node and @_gfp, and cast the result with
 * __genradix_cast().
 *
 * NOTE(review): @_new_node is presumably a caller-preallocated node the
 * allocator may consume - confirm against __genradix_ptr_alloc().
 */
#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
_new_node, _gfp))
struct genradix_iter {
size_t offset;

View File

@ -3,6 +3,7 @@
#define _TOOLS_LINUX_MM_H
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/types.h>
struct sysinfo {

View File

@ -18,6 +18,7 @@
#define alloc_hooks(_do, ...) _do
#define ARCH_KMALLOC_MINALIGN 16
#define ARCH_SLAB_MINALIGN 16
#define KMALLOC_MAX_SIZE SIZE_MAX
#define MAX_PAGE_ORDER 10
@ -102,6 +103,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
((size) != 0 && (n) > SIZE_MAX / (size) \
? NULL : kmalloc((n) * (size), flags))
#define kvmalloc_array_noprof(...) kvmalloc_array(__VA_ARGS__)
/*
 * kcalloc - kmalloc_array() with __GFP_ZERO added to @flags.
 * @flags is parenthesized so that __GFP_ZERO is OR-ed into the whole argument
 * even when the caller passes a lower-precedence expression such as a ?:.
 */
#define kcalloc(n, size, flags) kmalloc_array(n, size, (flags) | __GFP_ZERO)
/*
 * kfree - release memory through free().
 * @p is parenthesized so the (void *) cast applies to the whole argument
 * expression rather than only to its first operand.
 */
#define kfree(p) free((void *) (p))

View File

@ -1,6 +1,12 @@
#ifndef __TOOLS_LINUX_SRCU_H
#define __TOOLS_LINUX_SRCU_H
#include <linux/rcupdate.h>
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2
typedef void (*rcu_callback_t)(struct rcu_head *head);
struct srcu_struct {
};
@ -26,10 +32,35 @@ static inline unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
return 0;
}
#undef poll_state_synchronize_rcu
/*
 * Stub: reports false for every @cookie; the argument is not examined.
 */
static inline bool poll_state_synchronize_rcu(unsigned long cookie)
{
	(void) cookie;

	return false;
}
#undef start_poll_synchronize_rcu
/*
 * Stub: always returns 0.
 *
 * Declared with (void) rather than an empty parameter list so the definition
 * also serves as a prototype and calls with arguments are rejected.
 */
static inline unsigned long start_poll_synchronize_rcu(void)
{
	return 0;
}
/*
 * Stub: always returns 0.
 *
 * Declared with (void) rather than an empty parameter list so the definition
 * also serves as a prototype and calls with arguments are rejected.
 */
static inline unsigned long get_state_synchronize_rcu(void)
{
	return 0;
}
/*
 * Stubs with empty bodies: each accepts the srcu_struct and does nothing.
 */
static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) {}
static inline void srcu_barrier(struct srcu_struct *ssp) {}
static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
/*
 * Invoke @func on @rhp immediately, before returning to the caller.
 * @ssp is not used.
 */
static inline void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			     rcu_callback_t func)
{
	(void) ssp;

	(*func)(rhp);
}
static inline int init_srcu_struct(struct srcu_struct *ssp)
{
return 0;

View File

@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/jiffies.h>
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
@ -240,71 +241,73 @@ fsck_err:
int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
struct bch_alloc_v4 a;
int ret = 0;
bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k),
bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
c, alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v),
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
c, alloc_v4_backpointers_start_bad,
"invalid backpointers_start");
bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type,
bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
c, alloc_key_data_type_bad,
"invalid data type (got %u should be %u)",
a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
a.data_type, alloc_data_type(a, a.data_type));
for (unsigned i = 0; i < 2; i++)
bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
c, alloc_key_io_time_bad,
"invalid io_time[%s]: %llu, max %llu",
i == READ ? "read" : "write",
a.v->io_time[i], LRU_TIME_MAX);
a.io_time[i], LRU_TIME_MAX);
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
offsetof(struct bch_alloc_v4, stripe_sectors)
? a.v->stripe_sectors
? a.stripe_sectors
: 0;
switch (a.v->data_type) {
switch (a.data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
bkey_fsck_err_on(stripe_sectors ||
a.v->dirty_sectors ||
a.v->cached_sectors ||
a.v->stripe,
a.dirty_sectors ||
a.cached_sectors ||
a.stripe,
c, alloc_key_empty_but_have_data,
"empty data type free but have data %u.%u.%u %u",
stripe_sectors,
a.v->dirty_sectors,
a.v->cached_sectors,
a.v->stripe);
a.dirty_sectors,
a.cached_sectors,
a.stripe);
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
bkey_fsck_err_on(!a.v->dirty_sectors &&
bkey_fsck_err_on(!a.dirty_sectors &&
!stripe_sectors,
c, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
bch2_data_type_str(a.v->data_type));
bch2_data_type_str(a.data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
a.v->dirty_sectors ||
bkey_fsck_err_on(!a.cached_sectors ||
a.dirty_sectors ||
stripe_sectors ||
a.v->stripe,
a.stripe,
c, alloc_key_cached_inconsistency,
"data type inconsistency");
bkey_fsck_err_on(!a.v->io_time[READ] &&
bkey_fsck_err_on(!a.io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
@ -1872,26 +1875,26 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
percpu_ref_put(&ca->io_ref);
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_dev_do_discards(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
return;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
goto put_ioref;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
goto put_write_ref;
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
put_ioref:
percpu_ref_put(&ca->io_ref);
put_write_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@ -2181,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
if (last_updated + HZ * 10 < jiffies) {
if (time_after(jiffies, last_updated + HZ * 10)) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
last_updated = jiffies;

View File

@ -69,6 +69,7 @@ struct bch_alloc_v4 {
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
/* end of fields in original version of alloc_v4 */
__u64 fragmentation_lru;
__u32 stripe_sectors;
__u32 pad;

View File

@ -677,7 +677,8 @@ struct bch_sb_field_ext {
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11))
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,

View File

@ -885,66 +885,18 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
static void bch2_bset_fix_lookup_table(struct btree *b,
static void rw_aux_tree_insert_entry(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
unsigned clobber_u64s,
unsigned new_u64s)
unsigned idx)
{
int shift = new_u64s - clobber_u64s;
unsigned l, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
/* returns first entry >= where */
l = rw_aux_tree_bsearch(b, t, where);
if (!l) /* never delete first entry */
l++;
else if (l < t->size &&
where < t->end_offset &&
rw_aux_tree(b, t)[l].offset == where)
rw_aux_tree_set(b, t, l++, _where);
/* l now > where */
for (j = l;
j < t->size &&
rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
j++)
;
if (j < t->size &&
rw_aux_tree(b, t)[j].offset + shift ==
rw_aux_tree(b, t)[l - 1].offset)
j++;
memmove(&rw_aux_tree(b, t)[l],
&rw_aux_tree(b, t)[j],
(void *) &rw_aux_tree(b, t)[t->size] -
(void *) &rw_aux_tree(b, t)[j]);
t->size -= j - l;
for (j = l; j < t->size; j++)
rw_aux_tree(b, t)[j].offset += shift;
EBUG_ON(l < t->size &&
rw_aux_tree(b, t)[l].offset ==
rw_aux_tree(b, t)[l - 1].offset);
EBUG_ON(!idx || idx > t->size);
struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
struct bkey_packed *end = idx < t->size
? rw_aux_to_bkey(b, t, idx)
: btree_bkey_last(b, t);
if (t->size < bset_rw_tree_capacity(b, t) &&
(l < t->size
? rw_aux_tree(b, t)[l].offset
: t->end_offset) -
rw_aux_tree(b, t)[l - 1].offset >
L1_CACHE_BYTES / sizeof(u64)) {
struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
struct bkey_packed *end = l < t->size
? rw_aux_to_bkey(b, t, l)
: btree_bkey_last(b, t);
(void *) end - (void *) start > L1_CACHE_BYTES) {
struct bkey_packed *k = start;
while (1) {
@ -953,17 +905,73 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
break;
if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
memmove(&rw_aux_tree(b, t)[l + 1],
&rw_aux_tree(b, t)[l],
memmove(&rw_aux_tree(b, t)[idx + 1],
&rw_aux_tree(b, t)[idx],
(void *) &rw_aux_tree(b, t)[t->size] -
(void *) &rw_aux_tree(b, t)[l]);
(void *) &rw_aux_tree(b, t)[idx]);
t->size++;
rw_aux_tree_set(b, t, l, k);
rw_aux_tree_set(b, t, idx, k);
break;
}
}
}
}
/*
 * Fix up the read-write auxiliary lookup table of @t after @clobber_u64s u64s
 * at @_where were replaced by @new_u64s u64s.
 *
 * Does nothing when @t has no rw aux tree. Otherwise: an entry that pointed at
 * the modified position is re-pointed, offsets of later entries are shifted
 * by the size difference, one entry is dropped if the shift would make it
 * equal to its predecessor, and rw_aux_tree_insert_entry() is then called for
 * the affected slot. The table is verified on every path that touched it.
 */
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
unsigned clobber_u64s,
unsigned new_u64s)
{
/* net size change at @_where, in u64s (negative when shrinking): */
int shift = new_u64s - clobber_u64s;
unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
/*
 * Modified position lies past the last table entry, so no existing entry
 * needs adjusting.
 * NOTE(review): indexes t->size - 1, so this assumes t->size >= 1 - confirm.
 */
if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
rw_aux_tree_insert_entry(b, t, t->size);
goto verify;
}
/* returns first entry >= where */
idx = rw_aux_tree_bsearch(b, t, where);
if (rw_aux_tree(b, t)[idx].offset == where) {
if (!idx) { /* never delete first entry */
idx++;
} else if (where < t->end_offset) {
/* re-point the entry at the key now at @_where, then move past it */
rw_aux_tree_set(b, t, idx++, _where);
} else {
/*
 * Entry sits at the end offset: t->size is decremented, which drops
 * the last table slot, and the decremented size is passed on as the
 * slot index.
 */
EBUG_ON(where != t->end_offset);
rw_aux_tree_insert_entry(b, t, --t->size);
goto verify;
}
}
/* from here on, every entry at idx or later is strictly past @where */
EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
/* after shifting, entry idx would equal entry idx - 1: remove entry idx */
if (idx < t->size &&
rw_aux_tree(b, t)[idx].offset + shift ==
rw_aux_tree(b, t)[idx - 1].offset) {
memmove(&rw_aux_tree(b, t)[idx],
&rw_aux_tree(b, t)[idx + 1],
(void *) &rw_aux_tree(b, t)[t->size] -
(void *) &rw_aux_tree(b, t)[idx + 1]);
t->size -= 1;
}
/* apply the size change to every remaining entry past @where */
for (j = idx; j < t->size; j++)
rw_aux_tree(b, t)[j].offset += shift;
EBUG_ON(idx < t->size &&
rw_aux_tree(b, t)[idx].offset ==
rw_aux_tree(b, t)[idx - 1].offset);
rw_aux_tree_insert_entry(b, t, idx);
verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}

View File

@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return b;
}
void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@ -661,9 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
/*
@ -735,7 +743,12 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
memalloc_nofs_restore(flags);
int ret = bch2_trans_relock(trans);
if (unlikely(ret)) {
bch2_btree_node_to_freelist(c, b);
return ERR_PTR(ret);
}
return b;
err:
mutex_lock(&bc->lock);
@ -764,7 +777,6 @@ err:
}
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
@ -856,6 +868,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
bch2_btree_node_read(trans, b, sync);
int ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
if (!sync)
return NULL;

View File

@ -12,6 +12,8 @@ struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,

View File

@ -587,6 +587,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
_btree_id, _pos, _flags, KEY_TYPE_##_type))
/*
 * bkey_val_copy - copy a key's value into a fixed-size destination object
 * @_dst_v:	pointer to the destination; sizeof(*@_dst_v) bounds the copy
 * @_src_k:	object with .k (passed to bkey_val_bytes()) and .v (the value)
 *
 * Copies min(sizeof(*@_dst_v), bkey_val_bytes(@_src_k.k)) bytes from
 * @_src_k.v and zero-fills whatever remains of the destination, so a value
 * shorter than the destination type reads back as zeroes in the missing
 * tail.
 *
 * Both arguments are evaluated exactly once. The temporaries use a _bvc_
 * prefix so that caller variables with short names (e.g. 'b') appearing in
 * the arguments are not shadowed.
 */
#define bkey_val_copy(_dst_v, _src_k)					\
do {									\
	void *_bvc_dst = (_dst_v);					\
	__typeof__(_src_k) _bvc_src = (_src_k);				\
	unsigned _bvc_bytes = min_t(unsigned, sizeof(*(_dst_v)),	\
				    bkey_val_bytes(_bvc_src.k));	\
									\
	memcpy(_bvc_dst, _bvc_src.v, _bvc_bytes);			\
	if (_bvc_bytes < sizeof(*(_dst_v)))				\
		memset((char *) _bvc_dst + _bvc_bytes, 0,		\
		       sizeof(*(_dst_v)) - _bvc_bytes);			\
} while (0)
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned flags, unsigned type,

View File

@ -79,130 +79,41 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
static void bkey_cached_evict(struct btree_key_cache *c,
static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params);
if (ret) {
memset(&ck->key, ~0, sizeof(ck->key));
atomic_long_dec(&c->nr_keys);
}
return ret;
}
/*
 * Callback handed to rcu_pending_init() for the key cache's pending lists:
 * decrement the per-cpu nr_pending counter and return the bkey_cached that
 * embeds @rcu to the bch2_key_cache slab.
 *
 * The owning bch_fs is recovered from @pending->srcu, which is treated as a
 * pointer to the filesystem's btree_trans_barrier member.
 */
static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
	struct bkey_cached *cached = container_of(rcu, struct bkey_cached, rcu);
	struct bch_fs *fs = container_of(pending->srcu, struct bch_fs,
					 btree_trans_barrier);

	this_cpu_dec(*fs->btree_key_cache.nr_pending);
	kmem_cache_free(bch2_key_cache, cached);
}
static void bkey_cached_free(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
} else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
}
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bkey_cached *pos;
bc->nr_freed_nonpcpu++;
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
list_move(&ck->list, &pos->list);
return;
}
}
list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
if (!ck->c.lock.readers) {
#ifdef __KERNEL__
struct btree_key_cache_freelist *f;
bool freed = false;
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr < ARRAY_SIZE(f->objs)) {
f->objs[f->nr++] = ck;
freed = true;
}
preempt_enable();
if (!freed) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (f->nr > ARRAY_SIZE(f->objs) / 2) {
struct bkey_cached *ck2 = f->objs[--f->nr];
__bkey_cached_move_to_freelist_ordered(bc, ck2);
}
preempt_enable();
__bkey_cached_move_to_freelist_ordered(bc, ck);
mutex_unlock(&bc->lock);
}
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
static void bkey_cached_free_fast(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
list_del_init(&ck->list);
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
bkey_cached_move_to_freelist(bc, ck);
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
bool pcpu_readers = ck->c.lock.readers != NULL;
rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
this_cpu_inc(*bc->nr_pending);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
@ -224,74 +135,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
if (!pcpu_readers) {
#ifdef __KERNEL__
struct btree_key_cache_freelist *f;
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr)
ck = f->objs[--f->nr];
preempt_enable();
if (!ck) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (!list_empty(&bc->freed_nonpcpu) &&
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
ck = f->nr ? f->objs[--f->nr] : NULL;
preempt_enable();
mutex_unlock(&bc->lock);
}
#else
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
} else {
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
if (ck) {
ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
if (unlikely(ret)) {
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
if (unlikely(ret)) {
btree_node_unlock(trans, path, 0);
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
return ck;
}
struct bkey_cached *ck = container_of_or_null(
rcu_pending_dequeue(&bc->pending[pcpu_readers]),
struct bkey_cached, rcu);
if (ck)
goto lock;
ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
@ -302,15 +153,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
return ERR_PTR(ret);
}
if (!ck)
return NULL;
INIT_LIST_HEAD(&ck->list);
if (ck) {
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
goto lock;
}
ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
struct bkey_cached, rcu);
if (ck)
goto lock;
lock:
six_lock_intent(&ck->c.lock, NULL, NULL);
six_lock_write(&ck->c.lock, NULL, NULL);
return ck;
}
@ -322,21 +177,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
if (bkey_cached_evict(c, ck))
goto out;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
mutex_unlock(&c->lock);
return ck;
}
@ -415,7 +270,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
bkey_cached_free_fast(bc, ck);
bkey_cached_free(bc, ck);
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
return ret;
@ -611,8 +466,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
bkey_cached_free(&c->btree_key_cache, ck);
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@ -722,10 +581,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
bkey_cached_free_fast(bc, ck);
bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
path->should_be_locked = false;
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
@ -734,60 +594,41 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
unsigned start, flags;
unsigned iter, start;
int srcu_idx;
mutex_lock(&bc->lock);
bc->requested_to_free += sc->nr_to_scan;
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
* Scanning is expensive while a rehash is in progress - most elements
* will be on the new hashtable, if it's in progress
*
* A rehash could still start while we're scanning - that's ok, we'll
* still see most elements.
*/
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
bc->nr_freed_nonpcpu--;
bc->freed++;
if (unlikely(tbl->nest)) {
rcu_read_unlock();
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
return SHRINK_STOP;
}
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
bc->nr_freed_pcpu--;
bc->freed++;
}
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
start = bc->shrink_iter;
iter = bc->shrink_iter;
if (iter >= tbl->size)
iter = 0;
start = iter;
do {
struct rhash_head *pos, *next;
pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@ -797,29 +638,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->skipped_accessed++;
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
} else {
bkey_cached_evict(bc, ck);
} else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
bc->moved_to_freelist++;
bc->freed++;
freed++;
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
break;
goto out;
pos = next;
}
bc->shrink_iter++;
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
} while (scanned < nr && bc->shrink_iter != start);
iter++;
if (iter >= tbl->size)
iter = 0;
} while (scanned < nr && iter != start);
out:
bc->shrink_iter = iter;
rcu_read_unlock();
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
return freed;
}
@ -847,64 +690,39 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
struct bkey_cached *ck, *n;
struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
#ifdef __KERNEL__
int cpu;
#endif
shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
/*
* The loop is needed to guard against racing with rehash:
*/
while (atomic_long_read(&bc->nr_keys)) {
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (tbl)
if (tbl) {
if (tbl->nest) {
/* wait for in progress rehash */
rcu_read_unlock();
mutex_lock(&bc->table.mutex);
mutex_unlock(&bc->table.mutex);
rcu_read_lock();
continue;
}
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
bkey_cached_evict(bc, ck);
list_add(&ck->list, &items);
while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
ck = container_of(pos, struct bkey_cached, hash);
BUG_ON(!bkey_cached_evict(bc, ck));
kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
}
}
rcu_read_unlock();
}
#ifdef __KERNEL__
if (bc->pcpu_freed) {
for_each_possible_cpu(cpu) {
struct btree_key_cache_freelist *f =
per_cpu_ptr(bc->pcpu_freed, cpu);
for (i = 0; i < f->nr; i++) {
ck = f->objs[i];
list_add(&ck->list, &items);
}
}
}
#endif
BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
mutex_unlock(&bc->lock);
list_for_each_entry_safe(ck, n, &items, list) {
cond_resched();
list_del(&ck->list);
kfree(ck->k);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
test_bit(BCH_FS_was_rw, &c->flags))
@ -918,14 +736,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
free_percpu(bc->pcpu_freed);
rcu_pending_exit(&bc->pending[0]);
rcu_pending_exit(&bc->pending[1]);
free_percpu(bc->nr_pending);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed_pcpu);
INIT_LIST_HEAD(&c->freed_nonpcpu);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@ -933,11 +751,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
if (!bc->pcpu_freed)
bc->nr_pending = alloc_percpu(size_t);
if (!bc->nr_pending)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@ -959,45 +779,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
printbuf_tabstop_push(out, 24);
printbuf_tabstop_push(out, 12);
unsigned flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
prt_printf(out, "\nshrinker:\n");
prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
prt_newline(out);
prt_printf(out, "shrinker:\n");
prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
prt_printf(out, "freed:\t%lu\r\n", bc->freed);
prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
struct bkey_cached *ck;
unsigned iter = 0;
list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
if (++iter > 10)
break;
}
iter = 0;
list_for_each_entry(ck, &bc->freed_pcpu, list) {
prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
if (++iter > 10)
break;
}
mutex_unlock(&bc->lock);
memalloc_flags_restore(flags);
prt_newline(out);
prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
}
void bch2_btree_key_cache_exit(void)

View File

@ -2,33 +2,25 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
struct btree_key_cache_freelist {
struct bkey_cached *objs[16];
unsigned nr;
};
#include "rcu_pending.h"
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
size_t nr_freed_pcpu;
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
/* 0: non pcpu reader locks, 1: pcpu reader locks */
struct rcu_pending pending[2];
size_t __percpu *nr_pending;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
/* shrinker stats */
unsigned long requested_to_free;
unsigned long freed;
unsigned long moved_to_freelist;
unsigned long skipped_dirty;
unsigned long skipped_accessed;
unsigned long skipped_lock_fail;

View File

@ -218,13 +218,11 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
bool lock_may_not_fail,
unsigned long ip)
{
int ret;
trans->lock_may_not_fail = lock_may_not_fail;
trans->lock_must_abort = false;
trans->locking = b;
ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
@ -284,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
bch2_trans_verify_not_unlocked(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||

View File

@ -386,17 +386,16 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
unsigned long btree_trans_barrier_seq;
u16 u64s;
struct bkey_cached_key key;
struct rhash_head hash;
struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)

View File

@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
: 0;
int ret;
b = bch2_btree_node_mem_alloc(trans, interior_node);
if (IS_ERR(b))
return b;
BUG_ON(b->ob.nr);
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
obs = a->ob;
bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock);
goto mem_alloc;
goto out;
}
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
@ -341,7 +346,7 @@ retry:
c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
return ERR_PTR(ret);
goto err;
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@ -360,19 +365,16 @@ retry:
bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(trans, interior_node);
out:
bkey_copy(&b->key, &tmp.k);
b->ob = obs;
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
BUG_ON(b->ob.nr);
bkey_copy(&b->key, &tmp.k);
b->ob = obs;
return b;
err:
bch2_btree_node_to_freelist(c, b);
return ERR_PTR(ret);
}
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
@ -729,6 +731,18 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
err:
/*
* Ensure transaction is unlocked before using btree_node_lock_nopath()
* (the use of which is always suspect, we need to work on removing this
* in the future)
*
* It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
* calls bch2_path_upgrade(), before we call path_make_mut(), so we may
* rarely end up with a locked path besides the one we have here:
*/
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
/*
* We have to be careful because another thread might be getting ready
* to free as->b and calling btree_update_reparent() on us - we'll
@ -748,18 +762,6 @@ err:
* we're in journal error state:
*/
/*
* Ensure transaction is unlocked before using
* btree_node_lock_nopath() (the use of which is always suspect,
* we need to work on removing this in the future)
*
* It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
* calls bch2_path_upgrade(), before we call path_make_mut(), so
* we may rarely end up with a locked path besides the one we
* have here:
*/
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
}
new_hash = bch2_btree_node_mem_alloc(trans, false);
ret = PTR_ERR_OR_ZERO(new_hash);
if (ret)
goto err;
}
path->intent_ref++;
@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
commit_flags, skip_triggers);
--path->intent_ref;
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&new_hash->c.lock);
six_unlock_intent(&new_hash->c.lock);
}
if (new_hash)
bch2_btree_node_to_freelist(c, new_hash);
err:
closure_sync(&cl);
bch2_btree_cache_cannibalize_unlock(trans);
return ret;
@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
b = bch2_btree_node_mem_alloc(trans, false);
bch2_btree_cache_cannibalize_unlock(trans);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
return ret;
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
b->c.level = level;
@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
}
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)

View File

@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
bch2_trans_verify_not_unlocked(trans);
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,

View File

@ -699,7 +699,8 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
enum btree_iter_update_trigger_flags flags)
enum btree_iter_update_trigger_flags flags,
s64 *replicas_sectors)
{
bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
s64 replicas_sectors = 0;
int ret = 0;
struct disk_accounting_pos acc_replicas_key = {
@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret)
return ret;
} else if (!p.has_ec) {
replicas_sectors += disk_sectors;
*replicas_sectors += disk_sectors;
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs) {
ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
if (ret)
return ret;
}
@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_snapshot,
.snapshot.id = k.k->p.snapshot,
};
ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc);
ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
if (ret)
return ret;
}
@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_btree,
.btree.id = btree_id,
};
ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc);
ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
if (ret)
return ret;
} else {
@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans,
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
replicas_sectors,
*replicas_sectors,
};
ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
if (ret)
return ret;
}
if (bch2_bkey_rebalance_opts(k)) {
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_rebalance_work,
};
ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
if (ret)
return ret;
}
return 0;
}
@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
@ -858,22 +850,54 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes))
return 0;
if (flags & BTREE_TRIGGER_transactional) {
struct bch_fs *c = trans->c;
int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
(int) bch2_bkey_needs_rebalance(c, old);
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
if (mod) {
if (old.k->type) {
int ret = __trigger_extent(trans, btree, level, old,
flags & ~BTREE_TRIGGER_insert,
&old_replicas_sectors);
if (ret)
return ret;
}
if (new.k->type) {
int ret = __trigger_extent(trans, btree, level, new.s_c,
flags & ~BTREE_TRIGGER_overwrite,
&new_replicas_sectors);
if (ret)
return ret;
}
int need_rebalance_delta = 0;
s64 need_rebalance_sectors_delta = 0;
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta -= s != 0;
need_rebalance_sectors_delta -= s;
s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta += s != 0;
need_rebalance_sectors_delta += s;
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
new.k->p, mod > 0);
new.k->p, need_rebalance_delta > 0);
if (ret)
return ret;
}
if (need_rebalance_sectors_delta) {
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_rebalance_work,
};
int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
flags & BTREE_TRIGGER_gc);
if (ret)
return ret;
}
}
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
return 0;
}

View File

@ -24,7 +24,7 @@ struct bucket_array {
u16 first_bucket;
size_t nbuckets;
size_t nbuckets_minus_first;
struct bucket b[];
struct bucket b[] __counted_by(nbuckets);
};
struct bucket_gens {

View File

@ -4,12 +4,12 @@
#include <linux/slab.h>
#include "darray.h"
int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
void *data = kvmalloc_array(new_size, element_size, gfp);
void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
if (!data)
return -ENOMEM;

View File

@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
static inline int __darray_resize(darray_char *d, size_t element_size,
size_t new_size, gfp_t gfp)
{
return unlikely(new_size > d->size)
? __bch2_darray_resize(d, element_size, new_size, gfp)
: 0;
}
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
#define __darray_resize(_d, _element_size, _new_size, _gfp) \
(unlikely((_new_size) > (_d)->size) \
? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
: 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
{
return __darray_resize(d, t_size, d->nr + more, gfp);
}
#define darray_make_room_gfp(_d, _more, _gfp) \
__darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)

View File

@ -79,6 +79,8 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
bkey_for_each_ptr(ptrs, ptr2) {
if (ptr2 == ptr)
break;
bucket = PTR_BUCKET_POS(ca, ptr2);
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
}
return false;

View File

@ -145,7 +145,6 @@
x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_freeing_inode) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \

View File

@ -1379,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
return r != NULL;
}
/*
 * Sum the compressed size of the pointers in @k that do not match the
 * requested @compression and/or @target options.
 *
 * Both passes are optional (skipped when the corresponding option is 0) and
 * both add to the same total, so a non-cached pointer that fails both checks
 * is counted once per pass.
 */
static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
unsigned target, unsigned compression)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
if (compression) {
unsigned compression_type = bch2_compression_opt_to_type(compression);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
 * Any incompressible or unwritten pointer cancels the whole
 * compression pass: discard what was counted so far.
 */
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
p.ptr.unwritten) {
sectors = 0;
goto incompressible;
}
/* cached pointers are never counted */
if (!p.ptr.cached && p.crc.compression_type != compression_type)
sectors += p.crc.compressed_size;
}
}
incompressible:
/* Target pass: only when a target is set and it accepts user data */
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
sectors += p.crc.compressed_size;
}
return sectors;
}
/*
 * Number of sectors of @k that still need rebalancing according to the
 * rebalance options stored in the key itself; 0 if the key carries none.
 */
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);

	if (!opts)
		return 0;

	return __bch2_bkey_sectors_need_rebalance(c, k, opts->target, opts->compression);
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
struct bch_io_opts *opts)
{

View File

@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
struct bch_io_opts *);

View File

@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
return folio_has_private(folio)
? (struct bch_folio *) folio_get_private(folio)
: NULL;
return folio_get_private(folio);
}
static inline struct bch_folio *bch2_folio(struct folio *folio)

View File

@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
mutex_lock(&c->sb_lock);
strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
mutex_unlock(&c->sb_lock);
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
mnt_drop_write_file(file);
return ret;

View File

@ -193,13 +193,19 @@ repeat:
inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
if (inode) {
spin_lock(&inode->v.i_lock);
if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
spin_unlock(&inode->v.i_lock);
return NULL;
}
if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
if (trans)
if (!trans) {
__wait_on_freeing_inode(&inode->v);
} else {
bch2_trans_unlock(trans);
__wait_on_freeing_inode(&inode->v);
if (trans) {
trace_and_count(c, trans_restart_freeing_inode, trans, _THIS_IP_);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_freeing_inode));
int ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
goto repeat;
}
@ -212,11 +218,14 @@ repeat:
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
if (test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
spin_lock(&inode->v.i_lock);
bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
spin_unlock(&inode->v.i_lock);
if (remove) {
int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
&inode->hash, bch2_vfs_inodes_params);
BUG_ON(ret);
clear_bit(EI_INODE_HASHED, &inode->ei_flags);
inode->v.i_hash.pprev = NULL;
}
}
@ -226,6 +235,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
struct bch_inode_info *inode)
{
struct bch_inode_info *old = inode;
set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
&inode->hash,
@ -233,8 +244,8 @@ retry:
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
if (!old)
goto retry;
if (IS_ERR(old))
return old;
clear_bit(EI_INODE_HASHED, &inode->ei_flags);
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
@ -249,9 +260,8 @@ retry:
*/
set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
inode = old;
return old;
} else {
set_bit(EI_INODE_HASHED, &inode->ei_flags);
inode_fake_hash(&inode->v);
inode_sb_list_add(&inode->v);
@ -259,9 +269,8 @@ retry:
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
}
return inode;
}
}
#define memalloc_flags_do(_flags, _do) \
@ -333,14 +342,7 @@ static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *tr
bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
struct bch_inode_info *ret = bch2_inode_hash_insert(trans->c, trans, inode);
if (IS_ERR(ret)) {
inode->v.i_state |= I_NEW;
set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
}
return ret;
return bch2_inode_hash_insert(trans->c, trans, inode);
}
@ -1656,6 +1658,10 @@ static void bch2_evict_inode(struct inode *vinode)
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
/*
* evict() has waited for outstanding writeback, we'll do no more IO
* through this inode: it's safe to remove from VFS inode hashtable here
*/
bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);

View File

@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
ret = -BCH_ERR_fsck_repair_unimplemented;
ret = 0;
goto err;
}
@ -2216,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c)
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
inode_walker_exit(&inode);
bch_err_fn(c, ret);
return ret;
}
@ -2469,8 +2470,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
: bch2_inode_unpack(inode_k, &inode);
if (ret) {
/* Should have been caught in dirents pass */
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
bch_err(c, "error looking up parent directory: %i", ret);
bch_err_msg(c, ret, "error looking up parent directory");
break;
}

View File

@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
}
if (!had_entries)
j->last_empty_seq = cur_seq;
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);

View File

@ -1950,7 +1950,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
if (error ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
time_before(jiffies, j->last_flush_write +
msecs_to_jiffies(c->opts.journal_flush_delay)) &&
test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);

View File

@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal;
u64 sum = 0;
unsigned nr;
unsigned i;
struct u64_range *b;
@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
for (i = 0; i < nr; i++) {
b[i].start = le64_to_cpu(journal->d[i].start);
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
if (b[i].end <= b[i].start) {
prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
le64_to_cpu(journal->d[i].start),
le64_to_cpu(journal->d[i].nr));
goto err;
}
sum += le64_to_cpu(journal->d[i].nr);
}
sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
}
}
if (sum > UINT_MAX) {
prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
goto err;
}
ret = 0;
err:
kfree(b);

View File

@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
bch2_trans_unlock_long(ctxt.trans);
move_buckets_wait(&ctxt, buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}

650
libbcachefs/rcu_pending.c Normal file
View File

@ -0,0 +1,650 @@
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
#include <linux/generic-radix-tree.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/vmalloc.h>
#include "rcu_pending.h"
#include "darray.h"
#include "util.h"
/*
 * Iterate @_i (a pointer to element) over every element of the array @_a.
 *
 * @_a must be a real array, not a pointer: the bound comes from ARRAY_SIZE().
 * @_a is evaluated more than once, so it must be free of side effects.
 */
#define static_array_for_each(_a, _i)			\
	for (typeof(&(_a)[0]) _i = (_a);		\
	     _i < (_a) + ARRAY_SIZE(_a);		\
	     _i++)
/*
 * Small integer values that may be stored in rcu_pending->process in place of
 * a real callback pointer; __process_finished_items() switches on them to
 * select built-in handling (bulk kvfree, or calling each rcu_head's own func).
 */
enum rcu_pending_special {
RCU_PENDING_KVFREE = 1,
RCU_PENDING_CALL_RCU = 2,
};
/* The same values cast to the process callback type, for assignment/comparison: */
#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
/*
 * Snapshot the current grace period state: SRCU flavour when @ssp is given,
 * plain RCU when it is NULL.
 */
static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
	if (ssp)
		return get_state_synchronize_srcu(ssp);

	return get_state_synchronize_rcu();
}
/*
 * Like __get_state_synchronize_rcu(), but uses the start_poll variants of the
 * RCU/SRCU API: SRCU when @ssp is given, plain RCU when it is NULL.
 */
static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
	if (ssp)
		return start_poll_synchronize_srcu(ssp);

	return start_poll_synchronize_rcu();
}
/*
 * Poll whether the grace period identified by @cookie has completed, using
 * SRCU when @ssp is given and plain RCU when it is NULL.
 */
static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
{
	if (ssp)
		return poll_state_synchronize_srcu(ssp, cookie);

	return poll_state_synchronize_rcu(cookie);
}
/*
 * Issue a barrier on the matching RCU flavour: srcu_barrier() when @ssp is
 * given, rcu_barrier() when it is NULL.
 *
 * Written as plain statements: a void function must not "return" an
 * expression, even one of void type.
 */
static inline void __rcu_barrier(struct srcu_struct *ssp)
{
	if (ssp)
		srcu_barrier(ssp);
	else
		rcu_barrier();
}
/*
 * Queue @func to run on @rhp after a grace period of the matching flavour:
 * call_srcu() when @ssp is given, call_rcu() when it is NULL.
 */
static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			      rcu_callback_t func)
{
	if (!ssp) {
		call_rcu(rhp, func);
		return;
	}

	call_srcu(ssp, rhp, func);
}
/* Objects queued on one CPU that are all waiting on the same grace period: */
struct rcu_pending_seq {
/*
 * We're using a radix tree like a vector - we're just pushing elements
 * onto the end; we're using a radix tree instead of an actual vector to
 * avoid reallocation overhead
 */
GENRADIX(struct rcu_head *) objs;
/* number of entries pushed onto @objs so far */
size_t nr;
/*
 * Next slot to fill in the current radix tree node; reset to NULL when the
 * end of a node is reached, so the next enqueue looks up/allocates a slot
 */
struct rcu_head **cursor;
/* grace period cookie the objects in @objs are waiting on */
unsigned long seq;
};
/*
 * Singly linked list of rcu_heads waiting on one grace period; used by
 * rcu_pending_enqueue_list() when an object is not put in a radix tree.
 */
struct rcu_pending_list {
/* first and last element; both NULL when the list is empty */
struct rcu_head *head;
struct rcu_head *tail;
/* grace period cookie the list is waiting on */
unsigned long seq;
};
/* Per-CPU state of a struct rcu_pending: */
struct rcu_pending_pcpu {
/* owning rcu_pending; provides the srcu struct and process callback */
struct rcu_pending *parent;
/* protects the fields below */
spinlock_t lock;
/* CPU the work item is scheduled on (see rcu_pending_rcu_cb()) */
int cpu;
/*
 * We can't bound the number of unprocessed gp sequence numbers, and we
 * can't efficiently merge radix trees for expired grace periods, so we
 * need darray/vector:
 */
DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
/*
 * Linked-list fallback: one list per active grace period, plus a final
 * entry (index NUM_ACTIVE_RCU_POLL_OLDSTATE) collecting expired objects:
 */
struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
/* rcu_head used to get rcu_pending_rcu_cb() called after a grace period */
struct rcu_head cb;
/* cleared by rcu_pending_rcu_cb() once nothing is pending and @cb is not re-queued */
bool cb_armed;
/* runs rcu_pending_work() to process finished items */
struct work_struct work;
};
static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
{
if (p->objs.nr)
return true;
static_array_for_each(p->lists, i)
if (i->head)
return true;
return false;
}
/*
 * Append every element of @l2 to the end of @l1 and leave @l2 empty.
 *
 * @l1's tail is unconditionally replaced by @l2's tail.
 */
static void rcu_pending_list_merge(struct rcu_pending_list *l1,
				   struct rcu_pending_list *l2)
{
	if (l1->head) {
		/* the layout of rcu_head's link differs outside the kernel */
#ifdef __KERNEL__
		l1->tail->next = l2->head;
#else
		l1->tail->next.next = (void *) l2->head;
#endif
	} else {
		l1->head = l2->head;
	}

	l1->tail = l2->tail;

	l2->tail = NULL;
	l2->head = NULL;
}
/*
 * Append the single element @n to the end of @l, terminating the list at @n.
 */
static void rcu_pending_list_add(struct rcu_pending_list *l,
				 struct rcu_head *n)
{
	if (l->head) {
		/* the layout of rcu_head's link differs outside the kernel */
#ifdef __KERNEL__
		l->tail->next = n;
#else
		l->tail->next.next = (void *) n;
#endif
	} else {
		l->head = n;
	}

	l->tail = n;

#ifdef __KERNEL__
	n->next = NULL;
#else
	n->next.next = NULL;
#endif
}
/*
 * Move every non-empty per-grace-period list whose grace period has elapsed
 * onto the expired list (the last entry of @p->lists).
 */
static void merge_expired_lists(struct rcu_pending_pcpu *p)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];

	for (unsigned idx = 0; idx < NUM_ACTIVE_RCU_POLL_OLDSTATE; idx++) {
		struct rcu_pending_list *l = &p->lists[idx];

		if (!l->head)
			continue;
		if (!__poll_state_synchronize_rcu(p->parent->srcu, l->seq))
			continue;

		rcu_pending_list_merge(expired, l);
	}
}
#ifndef __KERNEL__
/*
 * Non-kernel stand-in for kfree_bulk(): free each of the @nr pointers stored
 * in the array @p.
 */
static inline void kfree_bulk(size_t nr, void ** p)
{
	/*
	 * Step through the array: every slot holds a different object, so
	 * freeing *p without advancing would free the first one @nr times and
	 * leak the rest.
	 */
	while (nr--)
		kfree(*p++);
}
/*
 * Non-kernel build only: stand-in for local_irq_save() that just assigns 0 to
 * @flags, so the variable later handed to the unlock/restore calls is
 * initialized.
 */
#define local_irq_save(flags) \
do { \
flags = 0; \
} while (0)
#endif
/*
 * Detach everything on @p whose grace period has elapsed and process it
 * according to @pending->process.
 *
 * Called with @p->lock held and @flags holding the saved irq state. The lock
 * is dropped (and the irq state restored) before any object is processed, and
 * it is NOT retaken before returning.
 *
 * At most one radix tree is consumed per call - the one at p->objs.data[0],
 * and only if its grace period has elapsed - together with every linked-list
 * entry that has expired.
 */
static noinline void __process_finished_items(struct rcu_pending *pending,
struct rcu_pending_pcpu *p,
unsigned long flags)
{
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
struct rcu_pending_seq objs = {};
struct rcu_head *list = NULL;
/* Take a private copy of the first radix tree if its grace period is over: */
if (p->objs.nr &&
__poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
objs = p->objs.data[0];
darray_remove_item(&p->objs, p->objs.data);
}
/* Collect expired linked lists and detach them as one chain: */
merge_expired_lists(p);
list = expired->head;
expired->head = expired->tail = NULL;
/* From here on we only touch the detached objects, so drop the lock: */
spin_unlock_irqrestore(&p->lock, flags);
switch ((ulong) pending->process) {
case RCU_PENDING_KVFREE:
/*
 * Radix tree entries are the pointers to free; hand them to
 * kfree_bulk() in runs of at most one radix tree node's worth
 */
for (size_t i = 0; i < objs.nr; ) {
size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
i += nr_this_node;
}
genradix_free(&objs.objs);
while (list) {
struct rcu_head *obj = list;
/* fetch the next element before @obj may be freed below */
#ifdef __KERNEL__
list = obj->next;
#else
list = (void *) obj->next.next;
#endif
/*
 * low bit of pointer indicates whether rcu_head needs
 * to be freed - kvfree_rcu_mightsleep()
 */
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
/* on the list path, ->func holds the (possibly tagged) pointer to free */
void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
kvfree(ptr);
bool free_head = ((unsigned long) obj->func) & 1UL;
if (free_head)
kfree(obj);
}
break;
case RCU_PENDING_CALL_RCU:
/* each rcu_head carries its own callback in ->func */
for (size_t i = 0; i < objs.nr; i++) {
struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
obj->func(obj);
}
genradix_free(&objs.objs);
while (list) {
struct rcu_head *obj = list;
/* fetch the next element before handing @obj to its callback */
#ifdef __KERNEL__
list = obj->next;
#else
list = (void *) obj->next.next;
#endif
obj->func(obj);
}
break;
default:
/* @pending->process is a real callback: call it for every object */
for (size_t i = 0; i < objs.nr; i++)
pending->process(pending, *genradix_ptr(&objs.objs, i));
genradix_free(&objs.objs);
while (list) {
struct rcu_head *obj = list;
/* fetch the next element before handing @obj to the callback */
#ifdef __KERNEL__
list = obj->next;
#else
list = (void *) obj->next.next;
#endif
pending->process(pending, obj);
}
break;
}
}
/*
 * Check whether @p has anything whose grace period has elapsed and, if so,
 * process it.
 *
 * Called with @p->lock held. Returns true if items were processed, in which
 * case the lock has been dropped by __process_finished_items(); returns false
 * with the lock still held otherwise.
 */
static bool process_finished_items(struct rcu_pending *pending,
				   struct rcu_pending_pcpu *p,
				   unsigned long flags)
{
	bool finished = false;

	/*
	 * XXX: we should grab the gp seq once and avoid multiple function
	 * calls, this is called from __rcu_pending_enqueue() fastpath in
	 * may_sleep==true mode
	 */
	if (p->objs.nr &&
	    __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq))
		finished = true;
	else if (p->lists[0].head &&
		 __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq))
		finished = true;
	else if (p->lists[1].head &&
		 __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq))
		finished = true;
	else if (p->lists[2].head)
		finished = true;

	if (!finished)
		return false;

	__process_finished_items(pending, p, flags);
	return true;
}
/*
 * Work item: keep processing finished items on this per-CPU structure until
 * there are none left.
 */
static void rcu_pending_work(struct work_struct *work)
{
	struct rcu_pending_pcpu *p =
		container_of(work, struct rcu_pending_pcpu, work);
	struct rcu_pending *parent = p->parent;
	unsigned long flags;

	for (;;) {
		/*
		 * process_finished_items() returns true with the lock already
		 * dropped, so it must be retaken on every pass
		 */
		spin_lock_irqsave(&p->lock, flags);
		if (!process_finished_items(parent, p, flags))
			break;
	}

	/* nothing was processed on the last pass: the lock is still ours */
	spin_unlock_irqrestore(&p->lock, flags);
}
static void rcu_pending_rcu_cb(struct rcu_head *rcu)
{
struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
schedule_work_on(p->cpu, &p->work);
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
if (__rcu_pending_has_pending(p)) {
spin_unlock_irqrestore(&p->lock, flags);
__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
} else {
p->cb_armed = false;
spin_unlock_irqrestore(&p->lock, flags);
}
}
static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
{
darray_for_each_reverse(p->objs, objs)
if (objs->seq == seq)
return objs;
if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
return NULL;
return &darray_last(p->objs);
}
/*
 * Queue an object on one of @p's linked lists rather than in a radix tree.
 *
 * @head: rcu_head to link; may be NULL only when @ptr is set, in which case
 *        one is allocated here
 * @ptr:  in kvfree mode, the pointer to free; stashed in @head->func
 * @flags: saved irq state for @p->lock
 *
 * Called with @p->lock held; the lock may be dropped and retaken if the
 * rcu_head has to be allocated with GFP_KERNEL.
 *
 * Returns true if a previously empty list was claimed for @seq, false if the
 * object was appended to an existing list for @seq, or was freed on the spot
 * because @seq had already elapsed while the lock was dropped.
 */
static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
struct rcu_head *head, void *ptr,
unsigned long *flags)
{
if (ptr) {
if (!head) {
/*
 * kvfree_rcu_mightsleep(): we weren't passed an
 * rcu_head, but we need one: use the low bit of the
 * pointer to free to flag that the head needs to be
 * freed as well:
 */
ptr = (void *)(((unsigned long) ptr)|1UL);
head = kmalloc(sizeof(*head), __GFP_NOWARN);
if (!head) {
spin_unlock_irqrestore(&p->lock, *flags);
head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
/*
 * dropped lock, did GFP_KERNEL allocation,
 * check for gp expiration
 */
if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
/* --ptr undoes the low-bit tag set above */
kvfree(--ptr);
kfree(head);
spin_lock_irqsave(&p->lock, *flags);
return false;
}
}
}
head->func = ptr;
}
again:
/* First choice: a list already waiting on the same grace period */
for (struct rcu_pending_list *i = p->lists;
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
if (i->seq == seq) {
rcu_pending_list_add(i, head);
return false;
}
}
/* Otherwise claim an empty list for this grace period */
for (struct rcu_pending_list *i = p->lists;
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
if (!i->head) {
i->seq = seq;
rcu_pending_list_add(i, head);
return true;
}
}
/* No list available: move expired lists out of the way and retry */
merge_expired_lists(p);
goto again;
}
/*
* __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
* pending->process) once grace period elapses.
*
* Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
* back to a linked list.
*
* - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
* process callback
*
* - If @ptr and @head are both not NULL, we're kvfree_rcu()
*
* - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
*
* - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
* expired items.
*/
static __always_inline void
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
void *ptr, bool may_sleep)
{
struct rcu_pending_pcpu *p;
struct rcu_pending_seq *objs;
struct genradix_node *new_node = NULL;
unsigned long seq, flags;
bool start_gp = false;
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
local_irq_save(flags);
p = this_cpu_ptr(pending->p);
spin_lock(&p->lock);
seq = __get_state_synchronize_rcu(pending->srcu);
restart:
if (may_sleep &&
unlikely(process_finished_items(pending, p, flags)))
goto check_expired;
/*
* In kvfree_rcu() mode, the radix tree is only for slab pointers so
* that we can do kfree_bulk() - vmalloc pointers always use the linked
* list:
*/
if (ptr && unlikely(is_vmalloc_addr(ptr)))
goto list_add;
objs = get_object_radix(p, seq);
if (unlikely(!objs))
goto list_add;
if (unlikely(!objs->cursor)) {
/*
* New radix tree nodes must be added under @p->lock because the
* tree root is in a darray that can be resized (typically,
* genradix supports concurrent unlocked allocation of new
* nodes) - hence preallocation and the retry loop:
*/
objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
if (unlikely(!objs->cursor)) {
if (may_sleep) {
spin_unlock_irqrestore(&p->lock, flags);
gfp_t gfp = GFP_KERNEL;
if (!head)
gfp |= __GFP_NOFAIL;
new_node = genradix_alloc_node(gfp);
if (!new_node)
may_sleep = false;
goto check_expired;
}
list_add:
start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
goto start_gp;
}
}
*objs->cursor++ = ptr ?: head;
/* zero cursor if we hit the end of a radix tree node: */
if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
objs->cursor = NULL;
start_gp = !objs->nr;
objs->nr++;
start_gp:
if (unlikely(start_gp)) {
/*
* We only have one callback (ideally, we would have one for
* every outstanding graceperiod) - so if our callback is
* already in flight, we may still have to start a grace period
* (since we used get_state() above, not start_poll())
*/
if (!p->cb_armed) {
p->cb_armed = true;
__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
} else {
__start_poll_synchronize_rcu(pending->srcu);
}
}
spin_unlock_irqrestore(&p->lock, flags);
free_node:
if (new_node)
genradix_free_node(new_node);
return;
check_expired:
if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
switch ((ulong) pending->process) {
case RCU_PENDING_KVFREE:
kvfree(ptr);
break;
case RCU_PENDING_CALL_RCU:
head->func(head);
break;
default:
pending->process(pending, head);
break;
}
goto free_node;
}
local_irq_save(flags);
p = this_cpu_ptr(pending->p);
spin_lock(&p->lock);
goto restart;
}
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
__rcu_pending_enqueue(pending, obj, NULL, true);
}
static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
{
struct rcu_head *ret = NULL;
spin_lock_irq(&p->lock);
darray_for_each(p->objs, objs)
if (objs->nr) {
ret = *genradix_ptr(&objs->objs, --objs->nr);
objs->cursor = NULL;
if (!objs->nr)
genradix_free(&objs->objs);
goto out;
}
static_array_for_each(p->lists, i)
if (i->head) {
ret = i->head;
#ifdef __KERNEL__
i->head = ret->next;
#else
i->head = (void *) ret->next.next;
#endif
if (!i->head)
i->tail = NULL;
goto out;
}
out:
spin_unlock_irq(&p->lock);
return ret;
}
/*
 * rcu_pending_dequeue: take one pending item back off @pending, looking only
 * at the per-cpu state of the CPU we're currently running on.
 *
 * Returns NULL if that CPU has nothing queued.
 */
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
{
	struct rcu_pending_pcpu *local = raw_cpu_ptr(pending->p);

	return rcu_pending_pcpu_dequeue(local);
}
/*
 * rcu_pending_dequeue_from_all: take one pending item back off @pending,
 * trying the current CPU first and then every possible CPU in turn.
 *
 * Returns NULL if nothing is queued anywhere.
 */
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
{
	struct rcu_head *found;
	int cpu;

	/* Fast path: our own CPU */
	found = rcu_pending_dequeue(pending);
	if (found)
		return found;

	for_each_possible_cpu(cpu) {
		found = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
		if (found)
			return found;
	}

	return NULL;
}
/*
 * Returns true if any possible CPU still has items queued on @pending, or
 * still has its RCU callback armed. Each CPU's state is sampled under its own
 * lock.
 */
static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *pcpu = per_cpu_ptr(pending->p, cpu);
		bool busy;

		spin_lock_irq(&pcpu->lock);
		busy = __rcu_pending_has_pending(pcpu) || pcpu->cb_armed;
		spin_unlock_irq(&pcpu->lock);

		if (busy)
			return true;
	}

	return false;
}
void rcu_pending_exit(struct rcu_pending *pending)
{
int cpu;
if (!pending->p)
return;
while (rcu_pending_has_pending_or_armed(pending)) {
__rcu_barrier(pending->srcu);
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
flush_work(&p->work);
}
}
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
flush_work(&p->work);
}
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
static_array_for_each(p->lists, i)
WARN_ON(i->head);
WARN_ON(p->objs.nr);
darray_exit(&p->objs);
}
free_percpu(pending->p);
}
/**
 * rcu_pending_init: - initialize a rcu_pending
 *
 * @pending:	Object to init
 * @srcu:	May optionally be used with an srcu_struct; if NULL, uses normal
 *		RCU flavor
 * @process:	Callback function invoked on objects once their RCU barriers
 *		have completed.
 *		NOTE(review): this comment used to say "if NULL, kvfree() is
 *		used", but the enqueue path selects kvfree mode by comparing
 *		against RCU_PENDING_KVFREE_FN - confirm which value callers
 *		are expected to pass.
 *
 * Allocates the per-cpu state and sets up the lock, object array and work item
 * of every possible CPU.
 *
 * Returns 0 on success, -ENOMEM if the per-cpu allocation fails (in which case
 * @pending->srcu and @pending->process are left untouched).
 */
int rcu_pending_init(struct rcu_pending *pending,
		     struct srcu_struct *srcu,
		     rcu_pending_process_fn process)
{
	int cpu;

	pending->p = alloc_percpu(struct rcu_pending_pcpu);
	if (!pending->p)
		return -ENOMEM;

	pending->srcu		= srcu;
	pending->process	= process;

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *pcpu = per_cpu_ptr(pending->p, cpu);

		spin_lock_init(&pcpu->lock);
		darray_init(&pcpu->objs);
		INIT_WORK(&pcpu->work, rcu_pending_work);
		pcpu->parent	= pending;
		pcpu->cpu	= cpu;
	}

	return 0;
}

27
libbcachefs/rcu_pending.h Normal file
View File

@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCU_PENDING_H
#define _LINUX_RCU_PENDING_H
#include <linux/rcupdate.h>
struct rcu_pending;
typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
struct rcu_pending_pcpu;
/*
 * A set of objects waiting for an RCU grace period to elapse before being
 * processed. Set up with rcu_pending_init(), torn down with rcu_pending_exit().
 */
struct rcu_pending {
/* Per-cpu queues; allocated by rcu_pending_init(), NULL if not initialized */
struct rcu_pending_pcpu __percpu *p;
/* Optional srcu_struct to wait on; NULL means the normal RCU flavor */
struct srcu_struct *srcu;
/* How objects are processed once their grace period has elapsed */
rcu_pending_process_fn process;
};
/* Queue @obj to be processed once a grace period has elapsed; may sleep */
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
/* Take one queued object back from the current CPU's queues, or NULL if empty */
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
/* As above, but falls back to searching every possible CPU's queues */
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
/* Wait for all queued objects to be processed, then free the per-cpu state */
void rcu_pending_exit(struct rcu_pending *pending);
/* Returns 0 on success, -ENOMEM if the per-cpu state could not be allocated */
int rcu_pending_init(struct rcu_pending *pending,
struct srcu_struct *srcu,
rcu_pending_process_fn process);
#endif /* _LINUX_RCU_PENDING_H */

View File

@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
return cmp_int(l->journal_seq, r->journal_seq);
/*
* Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
*
* journal_seq == 0 means that the key comes from early repair, and
* should be inserted last so as to avoid overflowing the journal
*/
return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}
int bch2_journal_replay(struct bch_fs *c)
@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c)
}
}
bch2_trans_unlock_long(trans);
/*
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:

View File

@ -81,8 +81,7 @@
x(trans_restart_write_buffer_flush, 75) \
x(trans_restart_split_race, 76) \
x(write_buffer_flush_slowpath, 77) \
x(write_buffer_flush_sync, 78) \
x(trans_restart_freeing_inode, 79)
x(write_buffer_flush_sync, 78)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,

View File

@ -74,6 +74,9 @@
BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(disk_accounting_inum, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)
@ -108,7 +111,10 @@
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
BCH_FSCK_ERR_accounting_replicas_not_marked, \
BCH_FSCK_ERR_bkey_version_in_future)
BCH_FSCK_ERR_bkey_version_in_future) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)
struct upgrade_downgrade_entry {
u64 recovery_passes;

View File

@ -23,7 +23,7 @@ enum bch_fsck_flags {
x(jset_past_bucket_end, 9, 0) \
x(jset_seq_blacklisted, 10, 0) \
x(journal_entries_missing, 11, 0) \
x(journal_entry_replicas_not_marked, 12, 0) \
x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
x(journal_entry_past_jset_end, 13, 0) \
x(journal_entry_replicas_data_mismatch, 14, 0) \
x(journal_entry_bkey_u64s_0, 15, 0) \

View File

@ -233,7 +233,7 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = 0444 };
{ .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
@ -722,6 +722,13 @@ SHOW(bch2_fs_time_stats)
/*
 * Write handler for the time_stats attributes: a write to one of them resets
 * the corresponding time stats. The written data itself is not looked at here.
 *
 * NOTE(review): kobj, attr and size are presumably parameters supplied by the
 * STORE() macro - confirm against its definition.
 */
STORE(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
/* Expands to one "is it this attribute? then reset it" check per time stat: */
#define x(name) \
if (attr == &sysfs_time_stat_##name) \
bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
BCH_TIME_STATS()
#undef x
/* Report the whole write as consumed */
return size;
}
SYSFS_OPS(bch2_fs_time_stats);

View File

@ -387,7 +387,7 @@ again:
seen = buf->buf.nr;
char *n = memchr(buf->buf.data, '\n', seen);
if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
spin_unlock(&buf->lock);
return -ETIME;
}

View File

@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
}
}
/*
 * bch2_time_stats_reset: discard everything collected so far in @stats.
 *
 * Zeroes the struct from min_duration through to its end - the members ahead
 * of min_duration are left alone - and empties each CPU's buffer, if per-cpu
 * buffers have been allocated. Done under @stats->lock.
 */
void bch2_time_stats_reset(struct bch2_time_stats *stats)
{
	const unsigned stats_start = offsetof(struct bch2_time_stats, min_duration);
	int cpu;

	spin_lock_irq(&stats->lock);

	memset((void *) stats + stats_start, 0, sizeof(*stats) - stats_start);

	if (stats->buffer)
		for_each_possible_cpu(cpu)
			per_cpu_ptr(stats->buffer, cpu)->nr = 0;

	spin_unlock_irq(&stats->lock);
}
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);

View File

@ -70,6 +70,7 @@ struct time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
struct time_stat_buffer __percpu *buffer;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
@ -87,7 +88,6 @@ struct bch2_time_stats {
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
struct time_stat_buffer __percpu *buffer;
};
struct bch2_time_stats_quantiles {
@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
return false;
}
void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);

View File

@ -1316,12 +1316,6 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
DEFINE_EVENT(transaction_event, trans_restart_freeing_inode,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
);
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,

View File

@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
prt_printf(out, "\tsince mount\r\trecent\r\n");
prt_printf(out, "recent");
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20);

View File

@ -5,99 +5,31 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
struct genradix_node {
union {
/* Interior node: */
struct genradix_node *children[GENRADIX_ARY];
/* Leaf: */
u8 data[GENRADIX_NODE_SIZE];
};
};
static inline int genradix_depth_shift(unsigned depth)
{
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
/*
* Returns size (of data, in bytes) that a tree of a given depth holds:
*/
static inline size_t genradix_depth_size(unsigned depth)
{
return 1UL << genradix_depth_shift(depth);
}
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
struct genradix_root *r = READ_ONCE(radix->root);
struct genradix_node *n = genradix_root_to_node(r);
unsigned level = genradix_root_to_depth(r);
if (ilog2(offset) >= genradix_depth_shift(level))
return NULL;
while (1) {
if (!n)
return NULL;
if (!level)
break;
level--;
n = n->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
}
return &n->data[offset];
return __genradix_ptr_inlined(radix, offset);
}
EXPORT_SYMBOL(__genradix_ptr);
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}
static inline void genradix_free_node(struct genradix_node *node)
{
kfree(node);
}
/*
* Returns pointer to the specified byte @offset within @radix, allocating it if
* necessary - newly allocated slots are always zeroed out:
*/
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
struct genradix_node **preallocated,
gfp_t gfp_mask)
{
struct genradix_root *v = READ_ONCE(radix->root);
struct genradix_node *n, *new_node = NULL;
unsigned level;
if (preallocated)
swap(new_node, *preallocated);
/* Increase tree depth if necessary: */
while (1) {
struct genradix_root *r = v, *new_root;
@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
size_t offset;
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
return -ENOMEM;
return 0;