mirror of
https://github.com/koverstreet/bcachefs-tools.git
synced 2025-02-22 00:00:03 +03:00
Update bcachefs sources to 22fa8fc32e6a bcachefs: rcu_pending now works in userspace
This commit is contained in:
parent
6f938e0399
commit
b422b19f63
@ -1 +1 @@
|
||||
62439c6f1a6dba3fca1e57f352745d6e36dd1e31
|
||||
22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d
|
||||
|
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -73,7 +73,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "bcachefs-tools"
|
||||
version = "1.11.1"
|
||||
version = "1.12.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bch_bindgen",
|
||||
|
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "bcachefs-tools"
|
||||
version = "1.11.1"
|
||||
version = "1.12.0"
|
||||
authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>", "Kent Overstreet <kent.overstreet@linux.dev>" ]
|
||||
edition = "2021"
|
||||
rust-version = "1.70"
|
||||
|
2
Makefile
2
Makefile
@ -1,4 +1,4 @@
|
||||
VERSION=1.11.1
|
||||
VERSION=1.12.0
|
||||
|
||||
PREFIX?=/usr/local
|
||||
LIBEXECDIR?=$(PREFIX)/libexec
|
||||
|
@ -41,6 +41,7 @@
|
||||
#include <linux/limits.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/math.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
struct genradix_root;
|
||||
@ -48,10 +49,63 @@ struct genradix_root;
|
||||
#define GENRADIX_NODE_SHIFT 9
|
||||
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
|
||||
|
||||
/* Fan-out of an interior node: how many child pointers fit in one node */
#define GENRADIX_ARY							\
	(GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))

/* ilog2() of the fan-out */
#define GENRADIX_ARY_SHIFT						\
	ilog2(GENRADIX_ARY)

/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH						\
	DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)

/*
 * Mask applied to the root pointer value: genradix_root_to_depth() keeps
 * these bits, genradix_root_to_node() clears them.
 */
#define GENRADIX_DEPTH_MASK						\
	((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
|
||||
|
||||
static inline int genradix_depth_shift(unsigned depth)
|
||||
{
|
||||
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
|
||||
}
|
||||
|
||||
/*
 * Size, in bytes of data, that a tree of the given depth can hold:
 * 1UL shifted left by genradix_depth_shift(@depth).
 */
static inline size_t genradix_depth_size(unsigned depth)
{
	int shift = genradix_depth_shift(depth);

	return 1UL << shift;
}
|
||||
|
||||
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
|
||||
{
|
||||
return (unsigned long) r & GENRADIX_DEPTH_MASK;
|
||||
}
|
||||
|
||||
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
|
||||
{
|
||||
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
|
||||
}
|
||||
|
||||
/*
 * Untyped radix tree handle: just the root pointer, which is decoded with
 * genradix_root_to_node() / genradix_root_to_depth().
 */
struct __genradix {
	struct genradix_root		*root;
};
|
||||
|
||||
struct genradix_node {
|
||||
union {
|
||||
/* Interior node: */
|
||||
struct genradix_node *children[GENRADIX_ARY];
|
||||
|
||||
/* Leaf: */
|
||||
u8 data[GENRADIX_NODE_SIZE];
|
||||
};
|
||||
};
|
||||
|
||||
/*
 * Allocates one node of GENRADIX_NODE_SIZE bytes with kzalloc() using
 * @gfp_mask; returns whatever kzalloc() returns.
 */
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
	struct genradix_node *node = kzalloc(GENRADIX_NODE_SIZE, gfp_mask);

	return node;
}
|
||||
|
||||
/* Releases a node with kfree(); counterpart of genradix_alloc_node(). */
static inline void genradix_free_node(struct genradix_node *node)
{
	kfree(node);
}
|
||||
|
||||
/*
|
||||
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
|
||||
*/
|
||||
@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
|
||||
#define __genradix_idx_to_offset(_radix, _idx) \
|
||||
__idx_to_offset(_idx, __genradix_obj_size(_radix))
|
||||
|
||||
static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
|
||||
{
|
||||
struct genradix_root *r = READ_ONCE(radix->root);
|
||||
struct genradix_node *n = genradix_root_to_node(r);
|
||||
unsigned level = genradix_root_to_depth(r);
|
||||
unsigned shift = genradix_depth_shift(level);
|
||||
|
||||
if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
|
||||
return NULL;
|
||||
|
||||
while (n && shift > GENRADIX_NODE_SHIFT) {
|
||||
shift -= GENRADIX_ARY_SHIFT;
|
||||
n = n->children[offset >> shift];
|
||||
offset &= (1UL << shift) - 1;
|
||||
}
|
||||
|
||||
return n ? &n->data[offset] : NULL;
|
||||
}
|
||||
|
||||
/*
 * genradix_ptr_inlined - typed wrapper around __genradix_ptr_inlined():
 * converts @_idx to a byte offset and passes the result through
 * __genradix_cast(). Yields NULL when the lookup does.
 */
#define genradix_ptr_inlined(_radix, _idx)				\
	(__genradix_cast(_radix)					\
	 __genradix_ptr_inlined(&(_radix)->tree,			\
				__genradix_idx_to_offset(_radix, _idx)))
|
||||
|
||||
void *__genradix_ptr(struct __genradix *, size_t);
|
||||
|
||||
/**
|
||||
@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
|
||||
__genradix_ptr(&(_radix)->tree, \
|
||||
__genradix_idx_to_offset(_radix, _idx)))
|
||||
|
||||
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
|
||||
void *__genradix_ptr_alloc(struct __genradix *, size_t,
|
||||
struct genradix_node **, gfp_t);
|
||||
|
||||
/*
 * genradix_ptr_alloc_inlined - try the inlined lookup first and fall back to
 * __genradix_ptr_alloc() (with no preallocated node) only when it yields NULL.
 *
 * Note: @_radix and @_idx appear in both arms of the ?: so they are expanded
 * more than once.
 */
#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp)			\
	(__genradix_cast(_radix)					\
	 (__genradix_ptr_inlined(&(_radix)->tree,			\
				 __genradix_idx_to_offset(_radix, _idx)) ?: \
	  __genradix_ptr_alloc(&(_radix)->tree,				\
			       __genradix_idx_to_offset(_radix, _idx),	\
			       NULL, _gfp)))
|
||||
|
||||
/*
 * genradix_ptr_alloc_preallocated_inlined - like genradix_ptr_alloc_inlined(),
 * but forwards @_new_node to __genradix_ptr_alloc() on the fallback path.
 *
 * Note: @_radix and @_idx appear in both arms of the ?: so they are expanded
 * more than once.
 */
#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
	(__genradix_cast(_radix)					\
	 (__genradix_ptr_inlined(&(_radix)->tree,			\
				 __genradix_idx_to_offset(_radix, _idx)) ?: \
	  __genradix_ptr_alloc(&(_radix)->tree,				\
			       __genradix_idx_to_offset(_radix, _idx),	\
			       _new_node, _gfp)))
|
||||
|
||||
/**
|
||||
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
|
||||
@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
|
||||
(__genradix_cast(_radix) \
|
||||
__genradix_ptr_alloc(&(_radix)->tree, \
|
||||
__genradix_idx_to_offset(_radix, _idx), \
|
||||
_gfp))
|
||||
NULL, _gfp))
|
||||
|
||||
/*
 * genradix_ptr_alloc_preallocated - typed wrapper that calls
 * __genradix_ptr_alloc() directly, forwarding @_new_node and @_gfp.
 */
#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)	\
	(__genradix_cast(_radix)					\
	 __genradix_ptr_alloc(&(_radix)->tree,				\
			      __genradix_idx_to_offset(_radix, _idx),	\
			      _new_node, _gfp))
|
||||
|
||||
struct genradix_iter {
|
||||
size_t offset;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#define _TOOLS_LINUX_MM_H
|
||||
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
struct sysinfo {
|
||||
|
@ -18,6 +18,7 @@
|
||||
#define alloc_hooks(_do, ...) _do
|
||||
|
||||
#define ARCH_KMALLOC_MINALIGN 16
|
||||
#define ARCH_SLAB_MINALIGN 16
|
||||
#define KMALLOC_MAX_SIZE SIZE_MAX
|
||||
|
||||
#define MAX_PAGE_ORDER 10
|
||||
@ -102,6 +103,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
|
||||
((size) != 0 && (n) > SIZE_MAX / (size) \
|
||||
? NULL : kmalloc((n) * (size), flags))
|
||||
|
||||
#define kvmalloc_array_noprof(...) kvmalloc_array(__VA_ARGS__)
|
||||
|
||||
/*
 * kcalloc - kmalloc_array() with __GFP_ZERO or'ed into @flags.
 * @flags is parenthesized so that a low-precedence argument (e.g. a
 * conditional expression) is or'ed as a whole.
 */
#define kcalloc(n, size, flags)		kmalloc_array(n, size, (flags)|__GFP_ZERO)
|
||||
|
||||
/*
 * kfree - free() wrapper. The argument is parenthesized so the cast applies
 * to the whole expression, not just its first operand.
 */
#define kfree(p)			free((void *) (p))
|
||||
|
@ -1,6 +1,12 @@
|
||||
#ifndef __TOOLS_LINUX_SRCU_H
|
||||
#define __TOOLS_LINUX_SRCU_H
|
||||
|
||||
#include <linux/rcupdate.h>
|
||||
|
||||
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2
|
||||
|
||||
typedef void (*rcu_callback_t)(struct rcu_head *head);
|
||||
|
||||
struct srcu_struct {
|
||||
};
|
||||
|
||||
@ -26,10 +32,35 @@ static inline unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Stub: ignores @cookie and always returns false.
 * NOTE(review): presumably read by callers as "grace period not yet
 * elapsed" - confirm.
 */
#undef poll_state_synchronize_rcu
static inline bool poll_state_synchronize_rcu(unsigned long cookie)
{
	(void) cookie;

	return false;
}
|
||||
|
||||
/*
 * Stub: always returns the cookie 0.
 * Declared with (void) so the definition is a real prototype and stray
 * arguments are diagnosed.
 */
#undef start_poll_synchronize_rcu
static inline unsigned long start_poll_synchronize_rcu(void)
{
	return 0;
}
|
||||
|
||||
/*
 * Stub: always returns the cookie 0.
 * Declared with (void) so the definition is a real prototype and stray
 * arguments are diagnosed.
 */
static inline unsigned long get_state_synchronize_rcu(void)
{
	return 0;
}
|
||||
|
||||
/* The following three stubs have empty bodies: they do nothing with @ssp. */

static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
}

static inline void srcu_barrier(struct srcu_struct *ssp)
{
}

static inline void cleanup_srcu_struct(struct srcu_struct *ssp)
{
}
|
||||
|
||||
static inline void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
|
||||
rcu_callback_t func)
|
||||
{
|
||||
func(rhp);
|
||||
}
|
||||
|
||||
static inline int init_srcu_struct(struct srcu_struct *ssp)
|
||||
{
|
||||
return 0;
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/jiffies.h>
|
||||
|
||||
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
|
||||
|
||||
@ -240,71 +241,73 @@ fsck_err:
|
||||
int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
|
||||
enum bch_validate_flags flags)
|
||||
{
|
||||
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
|
||||
struct bch_alloc_v4 a;
|
||||
int ret = 0;
|
||||
|
||||
bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k),
|
||||
bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
|
||||
|
||||
bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
|
||||
c, alloc_v4_val_size_bad,
|
||||
"bad val size (%u > %zu)",
|
||||
alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
|
||||
alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
|
||||
|
||||
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
|
||||
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v),
|
||||
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
|
||||
BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
|
||||
c, alloc_v4_backpointers_start_bad,
|
||||
"invalid backpointers_start");
|
||||
|
||||
bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type,
|
||||
bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
|
||||
c, alloc_key_data_type_bad,
|
||||
"invalid data type (got %u should be %u)",
|
||||
a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
|
||||
a.data_type, alloc_data_type(a, a.data_type));
|
||||
|
||||
for (unsigned i = 0; i < 2; i++)
|
||||
bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
|
||||
bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
|
||||
c, alloc_key_io_time_bad,
|
||||
"invalid io_time[%s]: %llu, max %llu",
|
||||
i == READ ? "read" : "write",
|
||||
a.v->io_time[i], LRU_TIME_MAX);
|
||||
a.io_time[i], LRU_TIME_MAX);
|
||||
|
||||
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
|
||||
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
|
||||
offsetof(struct bch_alloc_v4, stripe_sectors)
|
||||
? a.v->stripe_sectors
|
||||
? a.stripe_sectors
|
||||
: 0;
|
||||
|
||||
switch (a.v->data_type) {
|
||||
switch (a.data_type) {
|
||||
case BCH_DATA_free:
|
||||
case BCH_DATA_need_gc_gens:
|
||||
case BCH_DATA_need_discard:
|
||||
bkey_fsck_err_on(stripe_sectors ||
|
||||
a.v->dirty_sectors ||
|
||||
a.v->cached_sectors ||
|
||||
a.v->stripe,
|
||||
a.dirty_sectors ||
|
||||
a.cached_sectors ||
|
||||
a.stripe,
|
||||
c, alloc_key_empty_but_have_data,
|
||||
"empty data type free but have data %u.%u.%u %u",
|
||||
stripe_sectors,
|
||||
a.v->dirty_sectors,
|
||||
a.v->cached_sectors,
|
||||
a.v->stripe);
|
||||
a.dirty_sectors,
|
||||
a.cached_sectors,
|
||||
a.stripe);
|
||||
break;
|
||||
case BCH_DATA_sb:
|
||||
case BCH_DATA_journal:
|
||||
case BCH_DATA_btree:
|
||||
case BCH_DATA_user:
|
||||
case BCH_DATA_parity:
|
||||
bkey_fsck_err_on(!a.v->dirty_sectors &&
|
||||
bkey_fsck_err_on(!a.dirty_sectors &&
|
||||
!stripe_sectors,
|
||||
c, alloc_key_dirty_sectors_0,
|
||||
"data_type %s but dirty_sectors==0",
|
||||
bch2_data_type_str(a.v->data_type));
|
||||
bch2_data_type_str(a.data_type));
|
||||
break;
|
||||
case BCH_DATA_cached:
|
||||
bkey_fsck_err_on(!a.v->cached_sectors ||
|
||||
a.v->dirty_sectors ||
|
||||
bkey_fsck_err_on(!a.cached_sectors ||
|
||||
a.dirty_sectors ||
|
||||
stripe_sectors ||
|
||||
a.v->stripe,
|
||||
a.stripe,
|
||||
c, alloc_key_cached_inconsistency,
|
||||
"data type inconsistency");
|
||||
|
||||
bkey_fsck_err_on(!a.v->io_time[READ] &&
|
||||
bkey_fsck_err_on(!a.io_time[READ] &&
|
||||
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
|
||||
c, alloc_key_cached_but_read_time_zero,
|
||||
"cached bucket with read_time == 0");
|
||||
@ -1872,26 +1875,26 @@ static void bch2_do_discards_work(struct work_struct *work)
|
||||
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
|
||||
bch2_err_str(ret));
|
||||
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
|
||||
}
|
||||
|
||||
void bch2_dev_do_discards(struct bch_dev *ca)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
|
||||
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
|
||||
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
|
||||
return;
|
||||
|
||||
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
|
||||
goto put_ioref;
|
||||
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
|
||||
goto put_write_ref;
|
||||
|
||||
if (queue_work(c->write_ref_wq, &ca->discard_work))
|
||||
return;
|
||||
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
|
||||
put_ioref:
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
put_write_ref:
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
|
||||
}
|
||||
|
||||
void bch2_do_discards(struct bch_fs *c)
|
||||
@ -2181,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
|
||||
* freespace/need_discard/need_gc_gens btrees as needed:
|
||||
*/
|
||||
while (1) {
|
||||
if (last_updated + HZ * 10 < jiffies) {
|
||||
if (time_after(jiffies, last_updated + HZ * 10)) {
|
||||
bch_info(ca, "%s: currently at %llu/%llu",
|
||||
__func__, iter.pos.offset, ca->mi.nbuckets);
|
||||
last_updated = jiffies;
|
||||
|
@ -69,6 +69,7 @@ struct bch_alloc_v4 {
|
||||
__u64 io_time[2];
|
||||
__u32 stripe;
|
||||
__u32 nr_external_backpointers;
|
||||
/* end of fields in original version of alloc_v4 */
|
||||
__u64 fragmentation_lru;
|
||||
__u32 stripe_sectors;
|
||||
__u32 pad;
|
||||
|
@ -677,7 +677,8 @@ struct bch_sb_field_ext {
|
||||
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
|
||||
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
|
||||
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
|
||||
x(disk_accounting_inum, BCH_VERSION(1, 11))
|
||||
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
|
||||
x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
|
||||
|
||||
enum bcachefs_metadata_version {
|
||||
bcachefs_metadata_version_min = 9,
|
||||
|
@ -885,66 +885,18 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
|
||||
|
||||
/* Insert */
|
||||
|
||||
static void bch2_bset_fix_lookup_table(struct btree *b,
|
||||
static void rw_aux_tree_insert_entry(struct btree *b,
|
||||
struct bset_tree *t,
|
||||
struct bkey_packed *_where,
|
||||
unsigned clobber_u64s,
|
||||
unsigned new_u64s)
|
||||
unsigned idx)
|
||||
{
|
||||
int shift = new_u64s - clobber_u64s;
|
||||
unsigned l, j, where = __btree_node_key_to_offset(b, _where);
|
||||
|
||||
EBUG_ON(bset_has_ro_aux_tree(t));
|
||||
|
||||
if (!bset_has_rw_aux_tree(t))
|
||||
return;
|
||||
|
||||
/* returns first entry >= where */
|
||||
l = rw_aux_tree_bsearch(b, t, where);
|
||||
|
||||
if (!l) /* never delete first entry */
|
||||
l++;
|
||||
else if (l < t->size &&
|
||||
where < t->end_offset &&
|
||||
rw_aux_tree(b, t)[l].offset == where)
|
||||
rw_aux_tree_set(b, t, l++, _where);
|
||||
|
||||
/* l now > where */
|
||||
|
||||
for (j = l;
|
||||
j < t->size &&
|
||||
rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
|
||||
j++)
|
||||
;
|
||||
|
||||
if (j < t->size &&
|
||||
rw_aux_tree(b, t)[j].offset + shift ==
|
||||
rw_aux_tree(b, t)[l - 1].offset)
|
||||
j++;
|
||||
|
||||
memmove(&rw_aux_tree(b, t)[l],
|
||||
&rw_aux_tree(b, t)[j],
|
||||
(void *) &rw_aux_tree(b, t)[t->size] -
|
||||
(void *) &rw_aux_tree(b, t)[j]);
|
||||
t->size -= j - l;
|
||||
|
||||
for (j = l; j < t->size; j++)
|
||||
rw_aux_tree(b, t)[j].offset += shift;
|
||||
|
||||
EBUG_ON(l < t->size &&
|
||||
rw_aux_tree(b, t)[l].offset ==
|
||||
rw_aux_tree(b, t)[l - 1].offset);
|
||||
EBUG_ON(!idx || idx > t->size);
|
||||
struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
|
||||
struct bkey_packed *end = idx < t->size
|
||||
? rw_aux_to_bkey(b, t, idx)
|
||||
: btree_bkey_last(b, t);
|
||||
|
||||
if (t->size < bset_rw_tree_capacity(b, t) &&
|
||||
(l < t->size
|
||||
? rw_aux_tree(b, t)[l].offset
|
||||
: t->end_offset) -
|
||||
rw_aux_tree(b, t)[l - 1].offset >
|
||||
L1_CACHE_BYTES / sizeof(u64)) {
|
||||
struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
|
||||
struct bkey_packed *end = l < t->size
|
||||
? rw_aux_to_bkey(b, t, l)
|
||||
: btree_bkey_last(b, t);
|
||||
(void *) end - (void *) start > L1_CACHE_BYTES) {
|
||||
struct bkey_packed *k = start;
|
||||
|
||||
while (1) {
|
||||
@ -953,17 +905,73 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
|
||||
break;
|
||||
|
||||
if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
|
||||
memmove(&rw_aux_tree(b, t)[l + 1],
|
||||
&rw_aux_tree(b, t)[l],
|
||||
memmove(&rw_aux_tree(b, t)[idx + 1],
|
||||
&rw_aux_tree(b, t)[idx],
|
||||
(void *) &rw_aux_tree(b, t)[t->size] -
|
||||
(void *) &rw_aux_tree(b, t)[l]);
|
||||
(void *) &rw_aux_tree(b, t)[idx]);
|
||||
t->size++;
|
||||
rw_aux_tree_set(b, t, l, k);
|
||||
rw_aux_tree_set(b, t, idx, k);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void bch2_bset_fix_lookup_table(struct btree *b,
|
||||
struct bset_tree *t,
|
||||
struct bkey_packed *_where,
|
||||
unsigned clobber_u64s,
|
||||
unsigned new_u64s)
|
||||
{
|
||||
int shift = new_u64s - clobber_u64s;
|
||||
unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
|
||||
|
||||
EBUG_ON(bset_has_ro_aux_tree(t));
|
||||
|
||||
if (!bset_has_rw_aux_tree(t))
|
||||
return;
|
||||
|
||||
if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
|
||||
rw_aux_tree_insert_entry(b, t, t->size);
|
||||
goto verify;
|
||||
}
|
||||
|
||||
/* returns first entry >= where */
|
||||
idx = rw_aux_tree_bsearch(b, t, where);
|
||||
|
||||
if (rw_aux_tree(b, t)[idx].offset == where) {
|
||||
if (!idx) { /* never delete first entry */
|
||||
idx++;
|
||||
} else if (where < t->end_offset) {
|
||||
rw_aux_tree_set(b, t, idx++, _where);
|
||||
} else {
|
||||
EBUG_ON(where != t->end_offset);
|
||||
rw_aux_tree_insert_entry(b, t, --t->size);
|
||||
goto verify;
|
||||
}
|
||||
}
|
||||
|
||||
EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
|
||||
if (idx < t->size &&
|
||||
rw_aux_tree(b, t)[idx].offset + shift ==
|
||||
rw_aux_tree(b, t)[idx - 1].offset) {
|
||||
memmove(&rw_aux_tree(b, t)[idx],
|
||||
&rw_aux_tree(b, t)[idx + 1],
|
||||
(void *) &rw_aux_tree(b, t)[t->size] -
|
||||
(void *) &rw_aux_tree(b, t)[idx + 1]);
|
||||
t->size -= 1;
|
||||
}
|
||||
|
||||
for (j = idx; j < t->size; j++)
|
||||
rw_aux_tree(b, t)[j].offset += shift;
|
||||
|
||||
EBUG_ON(idx < t->size &&
|
||||
rw_aux_tree(b, t)[idx].offset ==
|
||||
rw_aux_tree(b, t)[idx - 1].offset);
|
||||
|
||||
rw_aux_tree_insert_entry(b, t, idx);
|
||||
|
||||
verify:
|
||||
bch2_bset_verify_rw_aux_tree(b, t);
|
||||
bset_aux_tree_verify(b);
|
||||
}
|
||||
|
@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
|
||||
return b;
|
||||
}
|
||||
|
||||
void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
list_move(&b->list, &c->btree_cache.freeable);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
six_unlock_write(&b->c.lock);
|
||||
six_unlock_intent(&b->c.lock);
|
||||
}
|
||||
|
||||
/* Btree in memory cache - hash table */
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
||||
@ -661,9 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
|
||||
: &bc->freed_nonpcpu;
|
||||
struct btree *b, *b2;
|
||||
u64 start_time = local_clock();
|
||||
unsigned flags;
|
||||
|
||||
flags = memalloc_nofs_save();
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
/*
|
||||
@ -735,7 +743,12 @@ out:
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
||||
start_time);
|
||||
|
||||
memalloc_nofs_restore(flags);
|
||||
int ret = bch2_trans_relock(trans);
|
||||
if (unlikely(ret)) {
|
||||
bch2_btree_node_to_freelist(c, b);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return b;
|
||||
err:
|
||||
mutex_lock(&bc->lock);
|
||||
@ -764,7 +777,6 @@ err:
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
memalloc_nofs_restore(flags);
|
||||
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
|
||||
}
|
||||
|
||||
@ -856,6 +868,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
|
||||
|
||||
bch2_btree_node_read(trans, b, sync);
|
||||
|
||||
int ret = bch2_trans_relock(trans);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
if (!sync)
|
||||
return NULL;
|
||||
|
||||
|
@ -12,6 +12,8 @@ struct btree_iter;
|
||||
|
||||
void bch2_recalc_btree_reserve(struct bch_fs *);
|
||||
|
||||
void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
|
||||
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
|
||||
|
@ -587,6 +587,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
|
||||
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
|
||||
_btree_id, _pos, _flags, KEY_TYPE_##_type))
|
||||
|
||||
/*
 * bkey_val_copy - copy a key's value into a fixed-size destination
 * @_dst_v:	pointer to the destination value
 * @_src_k:	source key; its .k and .v members are used
 *
 * Copies min(sizeof(*_dst_v), bkey_val_bytes(_src_k.k)) bytes from _src_k.v
 * and zero-fills the rest of *_dst_v when the source value is shorter.
 *
 * Arguments are parenthesized and the scratch variable has a macro-specific
 * name, so caller expressions (including ones that mention a variable named
 * 'b') expand correctly. Both arguments are still expanded more than once:
 * do not pass expressions with side effects.
 */
#define bkey_val_copy(_dst_v, _src_k)					\
do {									\
	unsigned _bkey_val_copy_bytes =					\
		min_t(unsigned, sizeof(*(_dst_v)),			\
		      bkey_val_bytes((_src_k).k));			\
	memcpy((_dst_v), (_src_k).v, _bkey_val_copy_bytes);		\
	if (_bkey_val_copy_bytes < sizeof(*(_dst_v)))			\
		memset((void *) (_dst_v) + _bkey_val_copy_bytes, 0,	\
		       sizeof(*(_dst_v)) - _bkey_val_copy_bytes);	\
} while (0)
|
||||
|
||||
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
|
||||
unsigned btree_id, struct bpos pos,
|
||||
unsigned flags, unsigned type,
|
||||
|
@ -79,130 +79,41 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
|
||||
return true;
|
||||
}
|
||||
|
||||
static void bkey_cached_evict(struct btree_key_cache *c,
|
||||
static bool bkey_cached_evict(struct btree_key_cache *c,
|
||||
struct bkey_cached *ck)
|
||||
{
|
||||
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
|
||||
bch2_btree_key_cache_params));
|
||||
bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
|
||||
bch2_btree_key_cache_params);
|
||||
if (ret) {
|
||||
memset(&ck->key, ~0, sizeof(ck->key));
|
||||
|
||||
atomic_long_dec(&c->nr_keys);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
|
||||
{
|
||||
struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
|
||||
struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
|
||||
|
||||
this_cpu_dec(*c->btree_key_cache.nr_pending);
|
||||
kmem_cache_free(bch2_key_cache, ck);
|
||||
}
|
||||
|
||||
static void bkey_cached_free(struct btree_key_cache *bc,
|
||||
struct bkey_cached *ck)
|
||||
{
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
||||
|
||||
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
|
||||
|
||||
ck->btree_trans_barrier_seq =
|
||||
start_poll_synchronize_srcu(&c->btree_trans_barrier);
|
||||
|
||||
if (ck->c.lock.readers) {
|
||||
list_move_tail(&ck->list, &bc->freed_pcpu);
|
||||
bc->nr_freed_pcpu++;
|
||||
} else {
|
||||
list_move_tail(&ck->list, &bc->freed_nonpcpu);
|
||||
bc->nr_freed_nonpcpu++;
|
||||
}
|
||||
atomic_long_inc(&bc->nr_freed);
|
||||
|
||||
kfree(ck->k);
|
||||
ck->k = NULL;
|
||||
ck->u64s = 0;
|
||||
|
||||
six_unlock_write(&ck->c.lock);
|
||||
six_unlock_intent(&ck->c.lock);
|
||||
}
|
||||
|
||||
#ifdef __KERNEL__
|
||||
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
|
||||
struct bkey_cached *ck)
|
||||
{
|
||||
struct bkey_cached *pos;
|
||||
|
||||
bc->nr_freed_nonpcpu++;
|
||||
|
||||
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
|
||||
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
|
||||
pos->btree_trans_barrier_seq)) {
|
||||
list_move(&ck->list, &pos->list);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
list_move(&ck->list, &bc->freed_nonpcpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
|
||||
struct bkey_cached *ck)
|
||||
{
|
||||
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
|
||||
|
||||
if (!ck->c.lock.readers) {
|
||||
#ifdef __KERNEL__
|
||||
struct btree_key_cache_freelist *f;
|
||||
bool freed = false;
|
||||
|
||||
preempt_disable();
|
||||
f = this_cpu_ptr(bc->pcpu_freed);
|
||||
|
||||
if (f->nr < ARRAY_SIZE(f->objs)) {
|
||||
f->objs[f->nr++] = ck;
|
||||
freed = true;
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
if (!freed) {
|
||||
mutex_lock(&bc->lock);
|
||||
preempt_disable();
|
||||
f = this_cpu_ptr(bc->pcpu_freed);
|
||||
|
||||
while (f->nr > ARRAY_SIZE(f->objs) / 2) {
|
||||
struct bkey_cached *ck2 = f->objs[--f->nr];
|
||||
|
||||
__bkey_cached_move_to_freelist_ordered(bc, ck2);
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
__bkey_cached_move_to_freelist_ordered(bc, ck);
|
||||
mutex_unlock(&bc->lock);
|
||||
}
|
||||
#else
|
||||
mutex_lock(&bc->lock);
|
||||
list_move_tail(&ck->list, &bc->freed_nonpcpu);
|
||||
bc->nr_freed_nonpcpu++;
|
||||
mutex_unlock(&bc->lock);
|
||||
#endif
|
||||
} else {
|
||||
mutex_lock(&bc->lock);
|
||||
list_move_tail(&ck->list, &bc->freed_pcpu);
|
||||
bc->nr_freed_pcpu++;
|
||||
mutex_unlock(&bc->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static void bkey_cached_free_fast(struct btree_key_cache *bc,
|
||||
struct bkey_cached *ck)
|
||||
{
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
||||
|
||||
ck->btree_trans_barrier_seq =
|
||||
start_poll_synchronize_srcu(&c->btree_trans_barrier);
|
||||
|
||||
list_del_init(&ck->list);
|
||||
atomic_long_inc(&bc->nr_freed);
|
||||
|
||||
kfree(ck->k);
|
||||
ck->k = NULL;
|
||||
ck->u64s = 0;
|
||||
|
||||
bkey_cached_move_to_freelist(bc, ck);
|
||||
|
||||
six_unlock_write(&ck->c.lock);
|
||||
six_unlock_intent(&ck->c.lock);
|
||||
bool pcpu_readers = ck->c.lock.readers != NULL;
|
||||
rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
|
||||
this_cpu_inc(*bc->nr_pending);
|
||||
}
|
||||
|
||||
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
|
||||
@ -224,74 +135,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_key_cache *bc = &c->btree_key_cache;
|
||||
struct bkey_cached *ck = NULL;
|
||||
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
|
||||
int ret;
|
||||
|
||||
if (!pcpu_readers) {
|
||||
#ifdef __KERNEL__
|
||||
struct btree_key_cache_freelist *f;
|
||||
|
||||
preempt_disable();
|
||||
f = this_cpu_ptr(bc->pcpu_freed);
|
||||
if (f->nr)
|
||||
ck = f->objs[--f->nr];
|
||||
preempt_enable();
|
||||
|
||||
if (!ck) {
|
||||
mutex_lock(&bc->lock);
|
||||
preempt_disable();
|
||||
f = this_cpu_ptr(bc->pcpu_freed);
|
||||
|
||||
while (!list_empty(&bc->freed_nonpcpu) &&
|
||||
f->nr < ARRAY_SIZE(f->objs) / 2) {
|
||||
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
|
||||
list_del_init(&ck->list);
|
||||
bc->nr_freed_nonpcpu--;
|
||||
f->objs[f->nr++] = ck;
|
||||
}
|
||||
|
||||
ck = f->nr ? f->objs[--f->nr] : NULL;
|
||||
preempt_enable();
|
||||
mutex_unlock(&bc->lock);
|
||||
}
|
||||
#else
|
||||
mutex_lock(&bc->lock);
|
||||
if (!list_empty(&bc->freed_nonpcpu)) {
|
||||
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
|
||||
list_del_init(&ck->list);
|
||||
bc->nr_freed_nonpcpu--;
|
||||
}
|
||||
mutex_unlock(&bc->lock);
|
||||
#endif
|
||||
} else {
|
||||
mutex_lock(&bc->lock);
|
||||
if (!list_empty(&bc->freed_pcpu)) {
|
||||
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
|
||||
list_del_init(&ck->list);
|
||||
bc->nr_freed_pcpu--;
|
||||
}
|
||||
mutex_unlock(&bc->lock);
|
||||
}
|
||||
|
||||
if (ck) {
|
||||
ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
|
||||
if (unlikely(ret)) {
|
||||
bkey_cached_move_to_freelist(bc, ck);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
|
||||
|
||||
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
|
||||
if (unlikely(ret)) {
|
||||
btree_node_unlock(trans, path, 0);
|
||||
bkey_cached_move_to_freelist(bc, ck);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return ck;
|
||||
}
|
||||
struct bkey_cached *ck = container_of_or_null(
|
||||
rcu_pending_dequeue(&bc->pending[pcpu_readers]),
|
||||
struct bkey_cached, rcu);
|
||||
if (ck)
|
||||
goto lock;
|
||||
|
||||
ck = allocate_dropping_locks(trans, ret,
|
||||
__bkey_cached_alloc(key_u64s, _gfp));
|
||||
@ -302,15 +153,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
if (!ck)
|
||||
return NULL;
|
||||
|
||||
INIT_LIST_HEAD(&ck->list);
|
||||
if (ck) {
|
||||
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
|
||||
|
||||
ck->c.cached = true;
|
||||
BUG_ON(!six_trylock_intent(&ck->c.lock));
|
||||
BUG_ON(!six_trylock_write(&ck->c.lock));
|
||||
goto lock;
|
||||
}
|
||||
|
||||
ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
|
||||
struct bkey_cached, rcu);
|
||||
if (ck)
|
||||
goto lock;
|
||||
lock:
|
||||
six_lock_intent(&ck->c.lock, NULL, NULL);
|
||||
six_lock_write(&ck->c.lock, NULL, NULL);
|
||||
return ck;
|
||||
}
|
||||
|
||||
@ -322,21 +177,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
|
||||
struct bkey_cached *ck;
|
||||
unsigned i;
|
||||
|
||||
mutex_lock(&c->lock);
|
||||
rcu_read_lock();
|
||||
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
|
||||
for (i = 0; i < tbl->size; i++)
|
||||
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
|
||||
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
|
||||
bkey_cached_lock_for_evict(ck)) {
|
||||
bkey_cached_evict(c, ck);
|
||||
if (bkey_cached_evict(c, ck))
|
||||
goto out;
|
||||
six_unlock_write(&ck->c.lock);
|
||||
six_unlock_intent(&ck->c.lock);
|
||||
}
|
||||
}
|
||||
ck = NULL;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
mutex_unlock(&c->lock);
|
||||
return ck;
|
||||
}
|
||||
|
||||
@ -415,7 +270,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
|
||||
path->uptodate = BTREE_ITER_UPTODATE;
|
||||
return 0;
|
||||
err:
|
||||
bkey_cached_free_fast(bc, ck);
|
||||
bkey_cached_free(bc, ck);
|
||||
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
|
||||
|
||||
return ret;
|
||||
@ -611,8 +466,12 @@ evict:
|
||||
}
|
||||
|
||||
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
|
||||
bkey_cached_evict(&c->btree_key_cache, ck);
|
||||
bkey_cached_free_fast(&c->btree_key_cache, ck);
|
||||
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
|
||||
bkey_cached_free(&c->btree_key_cache, ck);
|
||||
} else {
|
||||
six_unlock_write(&ck->c.lock);
|
||||
six_unlock_intent(&ck->c.lock);
|
||||
}
|
||||
}
|
||||
out:
|
||||
bch2_trans_iter_exit(trans, &b_iter);
|
||||
@ -722,10 +581,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
|
||||
}
|
||||
|
||||
bkey_cached_evict(bc, ck);
|
||||
bkey_cached_free_fast(bc, ck);
|
||||
bkey_cached_free(bc, ck);
|
||||
|
||||
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
|
||||
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
|
||||
path->should_be_locked = false;
|
||||
}
|
||||
|
||||
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
|
||||
@ -734,60 +594,41 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
|
||||
struct bch_fs *c = shrink->private_data;
|
||||
struct btree_key_cache *bc = &c->btree_key_cache;
|
||||
struct bucket_table *tbl;
|
||||
struct bkey_cached *ck, *t;
|
||||
struct bkey_cached *ck;
|
||||
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
|
||||
unsigned start, flags;
|
||||
unsigned iter, start;
|
||||
int srcu_idx;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
bc->requested_to_free += sc->nr_to_scan;
|
||||
|
||||
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
|
||||
flags = memalloc_nofs_save();
|
||||
rcu_read_lock();
|
||||
|
||||
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
|
||||
|
||||
/*
|
||||
* Newest freed entries are at the end of the list - once we hit one
|
||||
* that's too new to be freed, we can bail out:
|
||||
* Scanning is expensive while a rehash is in progress - most elements
|
||||
* will be on the new hashtable, if it's in progress
|
||||
*
|
||||
* A rehash could still start while we're scanning - that's ok, we'll
|
||||
* still see most elements.
|
||||
*/
|
||||
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
|
||||
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
|
||||
ck->btree_trans_barrier_seq))
|
||||
break;
|
||||
|
||||
list_del(&ck->list);
|
||||
six_lock_exit(&ck->c.lock);
|
||||
kmem_cache_free(bch2_key_cache, ck);
|
||||
atomic_long_dec(&bc->nr_freed);
|
||||
bc->nr_freed_nonpcpu--;
|
||||
bc->freed++;
|
||||
if (unlikely(tbl->nest)) {
|
||||
rcu_read_unlock();
|
||||
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
|
||||
return SHRINK_STOP;
|
||||
}
|
||||
|
||||
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
|
||||
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
|
||||
ck->btree_trans_barrier_seq))
|
||||
break;
|
||||
|
||||
list_del(&ck->list);
|
||||
six_lock_exit(&ck->c.lock);
|
||||
kmem_cache_free(bch2_key_cache, ck);
|
||||
atomic_long_dec(&bc->nr_freed);
|
||||
bc->nr_freed_pcpu--;
|
||||
bc->freed++;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
|
||||
if (bc->shrink_iter >= tbl->size)
|
||||
bc->shrink_iter = 0;
|
||||
start = bc->shrink_iter;
|
||||
iter = bc->shrink_iter;
|
||||
if (iter >= tbl->size)
|
||||
iter = 0;
|
||||
start = iter;
|
||||
|
||||
do {
|
||||
struct rhash_head *pos, *next;
|
||||
|
||||
pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
|
||||
pos = rht_ptr_rcu(&tbl->buckets[iter]);
|
||||
|
||||
while (!rht_is_a_nulls(pos)) {
|
||||
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
|
||||
next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
|
||||
ck = container_of(pos, struct bkey_cached, hash);
|
||||
|
||||
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
||||
@ -797,29 +638,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
|
||||
bc->skipped_accessed++;
|
||||
} else if (!bkey_cached_lock_for_evict(ck)) {
|
||||
bc->skipped_lock_fail++;
|
||||
} else {
|
||||
bkey_cached_evict(bc, ck);
|
||||
} else if (bkey_cached_evict(bc, ck)) {
|
||||
bkey_cached_free(bc, ck);
|
||||
bc->moved_to_freelist++;
|
||||
bc->freed++;
|
||||
freed++;
|
||||
} else {
|
||||
six_unlock_write(&ck->c.lock);
|
||||
six_unlock_intent(&ck->c.lock);
|
||||
}
|
||||
|
||||
scanned++;
|
||||
if (scanned >= nr)
|
||||
break;
|
||||
goto out;
|
||||
|
||||
pos = next;
|
||||
}
|
||||
|
||||
bc->shrink_iter++;
|
||||
if (bc->shrink_iter >= tbl->size)
|
||||
bc->shrink_iter = 0;
|
||||
} while (scanned < nr && bc->shrink_iter != start);
|
||||
iter++;
|
||||
if (iter >= tbl->size)
|
||||
iter = 0;
|
||||
} while (scanned < nr && iter != start);
|
||||
out:
|
||||
bc->shrink_iter = iter;
|
||||
|
||||
rcu_read_unlock();
|
||||
memalloc_nofs_restore(flags);
|
||||
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
return freed;
|
||||
}
|
||||
@ -847,64 +690,39 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
|
||||
{
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
||||
struct bucket_table *tbl;
|
||||
struct bkey_cached *ck, *n;
|
||||
struct bkey_cached *ck;
|
||||
struct rhash_head *pos;
|
||||
LIST_HEAD(items);
|
||||
unsigned i;
|
||||
#ifdef __KERNEL__
|
||||
int cpu;
|
||||
#endif
|
||||
|
||||
shrinker_free(bc->shrink);
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
/*
|
||||
* The loop is needed to guard against racing with rehash:
|
||||
*/
|
||||
while (atomic_long_read(&bc->nr_keys)) {
|
||||
rcu_read_lock();
|
||||
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
|
||||
if (tbl)
|
||||
if (tbl) {
|
||||
if (tbl->nest) {
	/*
	 * A rehash is in progress: drop the RCU read lock, then
	 * take and release the table mutex to wait for it.
	 *
	 * The RCU read lock must NOT be re-taken here: the top of
	 * the enclosing while loop takes it again on retry, so
	 * locking here as well would leave one extra level of
	 * read-side nesting that is never released.
	 */
	rcu_read_unlock();
	mutex_lock(&bc->table.mutex);
	mutex_unlock(&bc->table.mutex);
	continue;
}
|
||||
for (i = 0; i < tbl->size; i++)
|
||||
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
|
||||
bkey_cached_evict(bc, ck);
|
||||
list_add(&ck->list, &items);
|
||||
while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
|
||||
ck = container_of(pos, struct bkey_cached, hash);
|
||||
BUG_ON(!bkey_cached_evict(bc, ck));
|
||||
kfree(ck->k);
|
||||
kmem_cache_free(bch2_key_cache, ck);
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
#ifdef __KERNEL__
|
||||
if (bc->pcpu_freed) {
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct btree_key_cache_freelist *f =
|
||||
per_cpu_ptr(bc->pcpu_freed, cpu);
|
||||
|
||||
for (i = 0; i < f->nr; i++) {
|
||||
ck = f->objs[i];
|
||||
list_add(&ck->list, &items);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
|
||||
BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
|
||||
|
||||
list_splice(&bc->freed_pcpu, &items);
|
||||
list_splice(&bc->freed_nonpcpu, &items);
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
list_for_each_entry_safe(ck, n, &items, list) {
|
||||
cond_resched();
|
||||
|
||||
list_del(&ck->list);
|
||||
kfree(ck->k);
|
||||
six_lock_exit(&ck->c.lock);
|
||||
kmem_cache_free(bch2_key_cache, ck);
|
||||
}
|
||||
|
||||
if (atomic_long_read(&bc->nr_dirty) &&
|
||||
!bch2_journal_error(&c->journal) &&
|
||||
test_bit(BCH_FS_was_rw, &c->flags))
|
||||
@ -918,14 +736,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
|
||||
if (bc->table_init_done)
|
||||
rhashtable_destroy(&bc->table);
|
||||
|
||||
free_percpu(bc->pcpu_freed);
|
||||
rcu_pending_exit(&bc->pending[0]);
|
||||
rcu_pending_exit(&bc->pending[1]);
|
||||
|
||||
free_percpu(bc->nr_pending);
|
||||
}
|
||||
|
||||
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
|
||||
{
|
||||
mutex_init(&c->lock);
|
||||
INIT_LIST_HEAD(&c->freed_pcpu);
|
||||
INIT_LIST_HEAD(&c->freed_nonpcpu);
|
||||
}
|
||||
|
||||
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
|
||||
@ -933,11 +751,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
||||
struct shrinker *shrink;
|
||||
|
||||
#ifdef __KERNEL__
|
||||
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
|
||||
if (!bc->pcpu_freed)
|
||||
bc->nr_pending = alloc_percpu(size_t);
|
||||
if (!bc->nr_pending)
|
||||
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
|
||||
|
||||
if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
|
||||
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
|
||||
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
|
||||
#endif
|
||||
|
||||
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
|
||||
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
|
||||
@ -959,45 +779,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
|
||||
|
||||
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
|
||||
{
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
||||
|
||||
printbuf_tabstop_push(out, 24);
|
||||
printbuf_tabstop_push(out, 12);
|
||||
|
||||
unsigned flags = memalloc_nofs_save();
|
||||
mutex_lock(&bc->lock);
|
||||
prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
|
||||
prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
|
||||
prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
|
||||
prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
|
||||
prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
|
||||
|
||||
prt_printf(out, "\nshrinker:\n");
|
||||
prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
|
||||
prt_newline(out);
|
||||
prt_printf(out, "shrinker:\n");
|
||||
prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
|
||||
prt_printf(out, "freed:\t%lu\r\n", bc->freed);
|
||||
prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
|
||||
prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
|
||||
prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
|
||||
prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
|
||||
|
||||
prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
|
||||
|
||||
struct bkey_cached *ck;
|
||||
unsigned iter = 0;
|
||||
list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
|
||||
prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
|
||||
if (++iter > 10)
|
||||
break;
|
||||
}
|
||||
|
||||
iter = 0;
|
||||
list_for_each_entry(ck, &bc->freed_pcpu, list) {
|
||||
prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
|
||||
if (++iter > 10)
|
||||
break;
|
||||
}
|
||||
mutex_unlock(&bc->lock);
|
||||
memalloc_flags_restore(flags);
|
||||
prt_newline(out);
|
||||
prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
|
||||
}
|
||||
|
||||
void bch2_btree_key_cache_exit(void)
|
||||
|
@ -2,33 +2,25 @@
|
||||
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
|
||||
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
|
||||
|
||||
struct btree_key_cache_freelist {
|
||||
struct bkey_cached *objs[16];
|
||||
unsigned nr;
|
||||
};
|
||||
#include "rcu_pending.h"
|
||||
|
||||
struct btree_key_cache {
|
||||
struct mutex lock;
|
||||
struct rhashtable table;
|
||||
bool table_init_done;
|
||||
|
||||
struct list_head freed_pcpu;
|
||||
size_t nr_freed_pcpu;
|
||||
struct list_head freed_nonpcpu;
|
||||
size_t nr_freed_nonpcpu;
|
||||
|
||||
struct shrinker *shrink;
|
||||
unsigned shrink_iter;
|
||||
struct btree_key_cache_freelist __percpu *pcpu_freed;
|
||||
|
||||
atomic_long_t nr_freed;
|
||||
/* 0: non pcpu reader locks, 1: pcpu reader locks */
|
||||
struct rcu_pending pending[2];
|
||||
size_t __percpu *nr_pending;
|
||||
|
||||
atomic_long_t nr_keys;
|
||||
atomic_long_t nr_dirty;
|
||||
|
||||
/* shrinker stats */
|
||||
unsigned long requested_to_free;
|
||||
unsigned long freed;
|
||||
unsigned long moved_to_freelist;
|
||||
unsigned long skipped_dirty;
|
||||
unsigned long skipped_accessed;
|
||||
unsigned long skipped_lock_fail;
|
||||
|
@ -218,13 +218,11 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
|
||||
bool lock_may_not_fail,
|
||||
unsigned long ip)
|
||||
{
|
||||
int ret;
|
||||
|
||||
trans->lock_may_not_fail = lock_may_not_fail;
|
||||
trans->lock_must_abort = false;
|
||||
trans->locking = b;
|
||||
|
||||
ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
|
||||
int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
|
||||
bch2_six_check_for_deadlock, trans, ip);
|
||||
WRITE_ONCE(trans->locking, NULL);
|
||||
WRITE_ONCE(trans->locking_wait.start_time, 0);
|
||||
@ -284,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
|
||||
int ret = 0;
|
||||
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
bch2_trans_verify_not_unlocked(trans);
|
||||
|
||||
if (likely(six_trylock_type(&b->lock, type)) ||
|
||||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
|
||||
|
@ -386,17 +386,16 @@ struct bkey_cached {
|
||||
struct btree_bkey_cached_common c;
|
||||
|
||||
unsigned long flags;
|
||||
unsigned long btree_trans_barrier_seq;
|
||||
u16 u64s;
|
||||
struct bkey_cached_key key;
|
||||
|
||||
struct rhash_head hash;
|
||||
struct list_head list;
|
||||
|
||||
struct journal_entry_pin journal;
|
||||
u64 seq;
|
||||
|
||||
struct bkey_i *k;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
|
||||
|
@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
|
||||
: 0;
|
||||
int ret;
|
||||
|
||||
b = bch2_btree_node_mem_alloc(trans, interior_node);
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
|
||||
BUG_ON(b->ob.nr);
|
||||
|
||||
mutex_lock(&c->btree_reserve_cache_lock);
|
||||
if (c->btree_reserve_cache_nr > nr_reserve) {
|
||||
struct btree_alloc *a =
|
||||
@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
|
||||
obs = a->ob;
|
||||
bkey_copy(&tmp.k, &a->k);
|
||||
mutex_unlock(&c->btree_reserve_cache_lock);
|
||||
goto mem_alloc;
|
||||
goto out;
|
||||
}
|
||||
mutex_unlock(&c->btree_reserve_cache_lock);
|
||||
|
||||
retry:
|
||||
ret = bch2_alloc_sectors_start_trans(trans,
|
||||
c->opts.metadata_target ?:
|
||||
@ -341,7 +346,7 @@ retry:
|
||||
c->opts.metadata_replicas_required),
|
||||
watermark, 0, cl, &wp);
|
||||
if (unlikely(ret))
|
||||
return ERR_PTR(ret);
|
||||
goto err;
|
||||
|
||||
if (wp->sectors_free < btree_sectors(c)) {
|
||||
struct open_bucket *ob;
|
||||
@ -360,19 +365,16 @@ retry:
|
||||
|
||||
bch2_open_bucket_get(c, wp, &obs);
|
||||
bch2_alloc_sectors_done(c, wp);
|
||||
mem_alloc:
|
||||
b = bch2_btree_node_mem_alloc(trans, interior_node);
|
||||
out:
|
||||
bkey_copy(&b->key, &tmp.k);
|
||||
b->ob = obs;
|
||||
six_unlock_write(&b->c.lock);
|
||||
six_unlock_intent(&b->c.lock);
|
||||
|
||||
/* we hold cannibalize_lock: */
|
||||
BUG_ON(IS_ERR(b));
|
||||
BUG_ON(b->ob.nr);
|
||||
|
||||
bkey_copy(&b->key, &tmp.k);
|
||||
b->ob = obs;
|
||||
|
||||
return b;
|
||||
err:
|
||||
bch2_btree_node_to_freelist(c, b);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
|
||||
@ -729,6 +731,18 @@ static void btree_update_nodes_written(struct btree_update *as)
|
||||
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
|
||||
"%s", bch2_err_str(ret));
|
||||
err:
|
||||
/*
|
||||
* Ensure transaction is unlocked before using btree_node_lock_nopath()
|
||||
* (the use of which is always suspect, we need to work on removing this
|
||||
* in the future)
|
||||
*
|
||||
* It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
|
||||
* calls bch2_path_upgrade(), before we call path_make_mut(), so we may
|
||||
* rarely end up with a locked path besides the one we have here:
|
||||
*/
|
||||
bch2_trans_unlock(trans);
|
||||
bch2_trans_begin(trans);
|
||||
|
||||
/*
|
||||
* We have to be careful because another thread might be getting ready
|
||||
* to free as->b and calling btree_update_reparent() on us - we'll
|
||||
@ -748,18 +762,6 @@ err:
|
||||
* we're in journal error state:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Ensure transaction is unlocked before using
|
||||
* btree_node_lock_nopath() (the use of which is always suspect,
|
||||
* we need to work on removing this in the future)
|
||||
*
|
||||
* It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
|
||||
* calls bch2_path_upgrade(), before we call path_make_mut(), so
|
||||
* we may rarely end up with a locked path besides the one we
|
||||
* have here:
|
||||
*/
|
||||
bch2_trans_unlock(trans);
|
||||
bch2_trans_begin(trans);
|
||||
btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
|
||||
as->btree_id, b->c.level, b->key.k.p);
|
||||
struct btree_path *path = trans->paths + path_idx;
|
||||
@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
|
||||
}
|
||||
|
||||
new_hash = bch2_btree_node_mem_alloc(trans, false);
|
||||
ret = PTR_ERR_OR_ZERO(new_hash);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
path->intent_ref++;
|
||||
@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
|
||||
commit_flags, skip_triggers);
|
||||
--path->intent_ref;
|
||||
|
||||
if (new_hash) {
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
list_move(&new_hash->list, &c->btree_cache.freeable);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
six_unlock_write(&new_hash->c.lock);
|
||||
six_unlock_intent(&new_hash->c.lock);
|
||||
}
|
||||
if (new_hash)
|
||||
bch2_btree_node_to_freelist(c, new_hash);
|
||||
err:
|
||||
closure_sync(&cl);
|
||||
bch2_btree_cache_cannibalize_unlock(trans);
|
||||
return ret;
|
||||
@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
|
||||
b = bch2_btree_node_mem_alloc(trans, false);
|
||||
bch2_btree_cache_cannibalize_unlock(trans);
|
||||
|
||||
ret = PTR_ERR_OR_ZERO(b);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
set_btree_node_fake(b);
|
||||
set_btree_node_need_rewrite(b);
|
||||
b->c.level = level;
|
||||
@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
|
||||
|
||||
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
|
||||
{
|
||||
bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
|
||||
bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
|
||||
}
|
||||
|
||||
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
|
||||
|
@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
|
||||
unsigned level,
|
||||
unsigned flags)
|
||||
{
|
||||
bch2_trans_verify_not_unlocked(trans);
|
||||
|
||||
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
|
||||
btree_prev_sib) ?:
|
||||
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
|
||||
|
@ -699,7 +699,8 @@ err:
|
||||
static int __trigger_extent(struct btree_trans *trans,
|
||||
enum btree_id btree_id, unsigned level,
|
||||
struct bkey_s_c k,
|
||||
enum btree_iter_update_trigger_flags flags)
|
||||
enum btree_iter_update_trigger_flags flags,
|
||||
s64 *replicas_sectors)
|
||||
{
|
||||
bool gc = flags & BTREE_TRIGGER_gc;
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
|
||||
? BCH_DATA_btree
|
||||
: BCH_DATA_user;
|
||||
s64 replicas_sectors = 0;
|
||||
int ret = 0;
|
||||
|
||||
struct disk_accounting_pos acc_replicas_key = {
|
||||
@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
if (ret)
|
||||
return ret;
|
||||
} else if (!p.has_ec) {
|
||||
replicas_sectors += disk_sectors;
|
||||
*replicas_sectors += disk_sectors;
|
||||
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
|
||||
} else {
|
||||
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
|
||||
@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
}
|
||||
|
||||
if (acc_replicas_key.replicas.nr_devs) {
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
.type = BCH_DISK_ACCOUNTING_snapshot,
|
||||
.snapshot.id = k.k->p.snapshot,
|
||||
};
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc);
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
.type = BCH_DISK_ACCOUNTING_btree,
|
||||
.btree.id = btree_id,
|
||||
};
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc);
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
|
||||
if (ret)
|
||||
return ret;
|
||||
} else {
|
||||
@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
s64 v[3] = {
|
||||
insert ? 1 : -1,
|
||||
insert ? k.k->size : -((s64) k.k->size),
|
||||
replicas_sectors,
|
||||
*replicas_sectors,
|
||||
};
|
||||
ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (bch2_bkey_rebalance_opts(k)) {
|
||||
struct disk_accounting_pos acc = {
|
||||
.type = BCH_DISK_ACCOUNTING_rebalance_work,
|
||||
};
|
||||
ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
|
||||
struct bkey_s_c old, struct bkey_s new,
|
||||
enum btree_iter_update_trigger_flags flags)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
|
||||
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
|
||||
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
|
||||
@ -858,22 +850,54 @@ int bch2_trigger_extent(struct btree_trans *trans,
|
||||
new_ptrs_bytes))
|
||||
return 0;
|
||||
|
||||
if (flags & BTREE_TRIGGER_transactional) {
|
||||
struct bch_fs *c = trans->c;
|
||||
int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
|
||||
(int) bch2_bkey_needs_rebalance(c, old);
|
||||
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
|
||||
s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
|
||||
|
||||
if (mod) {
|
||||
if (old.k->type) {
|
||||
int ret = __trigger_extent(trans, btree, level, old,
|
||||
flags & ~BTREE_TRIGGER_insert,
|
||||
&old_replicas_sectors);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (new.k->type) {
|
||||
int ret = __trigger_extent(trans, btree, level, new.s_c,
|
||||
flags & ~BTREE_TRIGGER_overwrite,
|
||||
&new_replicas_sectors);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
int need_rebalance_delta = 0;
|
||||
s64 need_rebalance_sectors_delta = 0;
|
||||
|
||||
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
|
||||
need_rebalance_delta -= s != 0;
|
||||
need_rebalance_sectors_delta -= s;
|
||||
|
||||
s = bch2_bkey_sectors_need_rebalance(c, old);
|
||||
need_rebalance_delta += s != 0;
|
||||
need_rebalance_sectors_delta += s;
|
||||
|
||||
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
|
||||
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
|
||||
new.k->p, mod > 0);
|
||||
new.k->p, need_rebalance_delta > 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (need_rebalance_sectors_delta) {
|
||||
struct disk_accounting_pos acc = {
|
||||
.type = BCH_DISK_ACCOUNTING_rebalance_work,
|
||||
};
|
||||
int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
|
||||
flags & BTREE_TRIGGER_gc);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
|
||||
return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -24,7 +24,7 @@ struct bucket_array {
|
||||
u16 first_bucket;
|
||||
size_t nbuckets;
|
||||
size_t nbuckets_minus_first;
|
||||
struct bucket b[];
|
||||
struct bucket b[] __counted_by(nbuckets);
|
||||
};
|
||||
|
||||
struct bucket_gens {
|
||||
|
@ -4,12 +4,12 @@
|
||||
#include <linux/slab.h>
|
||||
#include "darray.h"
|
||||
|
||||
int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
|
||||
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
|
||||
{
|
||||
if (new_size > d->size) {
|
||||
new_size = roundup_pow_of_two(new_size);
|
||||
|
||||
void *data = kvmalloc_array(new_size, element_size, gfp);
|
||||
void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
|
||||
if (!data)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -22,29 +22,23 @@ struct { \
|
||||
typedef DARRAY(char) darray_char;
|
||||
typedef DARRAY(char *) darray_str;
|
||||
|
||||
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
|
||||
int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
|
||||
|
||||
static inline int __darray_resize(darray_char *d, size_t element_size,
|
||||
size_t new_size, gfp_t gfp)
|
||||
{
|
||||
return unlikely(new_size > d->size)
|
||||
? __bch2_darray_resize(d, element_size, new_size, gfp)
|
||||
: 0;
|
||||
}
|
||||
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
|
||||
|
||||
#define __darray_resize(_d, _element_size, _new_size, _gfp) \
|
||||
(unlikely((_new_size) > (_d)->size) \
|
||||
? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
|
||||
: 0)
|
||||
|
||||
#define darray_resize_gfp(_d, _new_size, _gfp) \
|
||||
unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
|
||||
__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
|
||||
|
||||
#define darray_resize(_d, _new_size) \
|
||||
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
|
||||
|
||||
static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
|
||||
{
|
||||
return __darray_resize(d, t_size, d->nr + more, gfp);
|
||||
}
|
||||
|
||||
#define darray_make_room_gfp(_d, _more, _gfp) \
|
||||
__darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
|
||||
darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
|
||||
|
||||
#define darray_make_room(_d, _more) \
|
||||
darray_make_room_gfp(_d, _more, GFP_KERNEL)
|
||||
|
@ -79,6 +79,8 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
|
||||
bkey_for_each_ptr(ptrs, ptr2) {
|
||||
if (ptr2 == ptr)
|
||||
break;
|
||||
|
||||
bucket = PTR_BUCKET_POS(ca, ptr2);
|
||||
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
|
||||
}
|
||||
return false;
|
||||
|
@ -145,7 +145,6 @@
|
||||
x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
|
||||
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
|
||||
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
|
||||
x(BCH_ERR_transaction_restart, transaction_restart_freeing_inode) \
|
||||
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
|
||||
x(0, no_btree_node) \
|
||||
x(BCH_ERR_no_btree_node, no_btree_node_relock) \
|
||||
|
@ -1379,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
|
||||
return r != NULL;
|
||||
}
|
||||
|
||||
static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
|
||||
unsigned target, unsigned compression)
|
||||
{
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
||||
const union bch_extent_entry *entry;
|
||||
struct extent_ptr_decoded p;
|
||||
u64 sectors = 0;
|
||||
|
||||
if (compression) {
|
||||
unsigned compression_type = bch2_compression_opt_to_type(compression);
|
||||
|
||||
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
||||
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
|
||||
p.ptr.unwritten) {
|
||||
sectors = 0;
|
||||
goto incompressible;
|
||||
}
|
||||
|
||||
if (!p.ptr.cached && p.crc.compression_type != compression_type)
|
||||
sectors += p.crc.compressed_size;
|
||||
}
|
||||
}
|
||||
incompressible:
|
||||
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
|
||||
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
||||
if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
|
||||
sectors += p.crc.compressed_size;
|
||||
}
|
||||
|
||||
return sectors;
|
||||
}
|
||||
|
||||
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
|
||||
|
||||
return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0;
|
||||
}
|
||||
|
||||
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
|
||||
struct bch_io_opts *opts)
|
||||
{
|
||||
|
@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
|
||||
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
|
||||
unsigned, unsigned);
|
||||
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
|
||||
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
|
||||
|
||||
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
|
||||
struct bch_io_opts *);
|
||||
|
@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
|
||||
|
||||
static inline struct bch_folio *__bch2_folio(struct folio *folio)
|
||||
{
|
||||
return folio_has_private(folio)
|
||||
? (struct bch_folio *) folio_get_private(folio)
|
||||
: NULL;
|
||||
return folio_get_private(folio);
|
||||
}
|
||||
|
||||
static inline struct bch_folio *bch2_folio(struct folio *folio)
|
||||
|
@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
ret = bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
mnt_drop_write_file(file);
|
||||
return ret;
|
||||
|
@ -193,13 +193,19 @@ repeat:
|
||||
inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
|
||||
if (inode) {
|
||||
spin_lock(&inode->v.i_lock);
|
||||
if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
|
||||
spin_unlock(&inode->v.i_lock);
|
||||
return NULL;
|
||||
}
|
||||
if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
|
||||
if (trans)
|
||||
if (!trans) {
|
||||
__wait_on_freeing_inode(&inode->v);
|
||||
} else {
|
||||
bch2_trans_unlock(trans);
|
||||
__wait_on_freeing_inode(&inode->v);
|
||||
if (trans) {
|
||||
trace_and_count(c, trans_restart_freeing_inode, trans, _THIS_IP_);
|
||||
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_freeing_inode));
|
||||
int ret = bch2_trans_relock(trans);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
goto repeat;
|
||||
}
|
||||
@ -212,11 +218,14 @@ repeat:
|
||||
|
||||
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
|
||||
{
|
||||
if (test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
|
||||
spin_lock(&inode->v.i_lock);
|
||||
bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
|
||||
spin_unlock(&inode->v.i_lock);
|
||||
|
||||
if (remove) {
|
||||
int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
|
||||
&inode->hash, bch2_vfs_inodes_params);
|
||||
BUG_ON(ret);
|
||||
clear_bit(EI_INODE_HASHED, &inode->ei_flags);
|
||||
inode->v.i_hash.pprev = NULL;
|
||||
}
|
||||
}
|
||||
@ -226,6 +235,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
|
||||
struct bch_inode_info *inode)
|
||||
{
|
||||
struct bch_inode_info *old = inode;
|
||||
|
||||
set_bit(EI_INODE_HASHED, &inode->ei_flags);
|
||||
retry:
|
||||
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
|
||||
&inode->hash,
|
||||
@ -233,8 +244,8 @@ retry:
|
||||
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
|
||||
if (!old)
|
||||
goto retry;
|
||||
if (IS_ERR(old))
|
||||
return old;
|
||||
|
||||
clear_bit(EI_INODE_HASHED, &inode->ei_flags);
|
||||
|
||||
/*
|
||||
* bcachefs doesn't use I_NEW; we have no use for it since we
|
||||
@ -249,9 +260,8 @@ retry:
|
||||
*/
|
||||
set_nlink(&inode->v, 1);
|
||||
discard_new_inode(&inode->v);
|
||||
inode = old;
|
||||
return old;
|
||||
} else {
|
||||
set_bit(EI_INODE_HASHED, &inode->ei_flags);
|
||||
inode_fake_hash(&inode->v);
|
||||
|
||||
inode_sb_list_add(&inode->v);
|
||||
@ -259,9 +269,8 @@ retry:
|
||||
mutex_lock(&c->vfs_inodes_lock);
|
||||
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
|
||||
mutex_unlock(&c->vfs_inodes_lock);
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
}
|
||||
|
||||
#define memalloc_flags_do(_flags, _do) \
|
||||
@ -333,14 +342,7 @@ static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *tr
|
||||
|
||||
bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
|
||||
|
||||
struct bch_inode_info *ret = bch2_inode_hash_insert(trans->c, trans, inode);
|
||||
if (IS_ERR(ret)) {
|
||||
inode->v.i_state |= I_NEW;
|
||||
set_nlink(&inode->v, 1);
|
||||
discard_new_inode(&inode->v);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return bch2_inode_hash_insert(trans->c, trans, inode);
|
||||
|
||||
}
|
||||
|
||||
@ -1656,6 +1658,10 @@ static void bch2_evict_inode(struct inode *vinode)
|
||||
struct bch_fs *c = vinode->i_sb->s_fs_info;
|
||||
struct bch_inode_info *inode = to_bch_ei(vinode);
|
||||
|
||||
/*
|
||||
* evict() has waited for outstanding writeback, we'll do no more IO
|
||||
* through this inode: it's safe to remove from VFS inode hashtable here
|
||||
*/
|
||||
bch2_inode_hash_remove(c, inode);
|
||||
|
||||
truncate_inode_pages_final(&inode->v.i_data);
|
||||
|
@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
|
||||
if (ret) {
|
||||
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
|
||||
ret = -BCH_ERR_fsck_repair_unimplemented;
|
||||
ret = 0;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -2216,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c)
|
||||
NULL, NULL,
|
||||
BCH_TRANS_COMMIT_no_enospc,
|
||||
check_xattr(trans, &iter, k, &hash_info, &inode)));
|
||||
|
||||
inode_walker_exit(&inode);
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
}
|
||||
@ -2469,8 +2470,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
|
||||
: bch2_inode_unpack(inode_k, &inode);
|
||||
if (ret) {
|
||||
/* Should have been caught in dirents pass */
|
||||
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
bch_err(c, "error looking up parent directory: %i", ret);
|
||||
bch_err_msg(c, ret, "error looking up parent directory");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
|
||||
}
|
||||
|
||||
if (!had_entries)
|
||||
j->last_empty_seq = cur_seq;
|
||||
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
|
||||
|
||||
spin_lock(&j->lock);
|
||||
|
||||
|
@ -1950,7 +1950,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
|
||||
if (error ||
|
||||
w->noflush ||
|
||||
(!w->must_flush &&
|
||||
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
|
||||
time_before(jiffies, j->last_flush_write +
|
||||
msecs_to_jiffies(c->opts.journal_flush_delay)) &&
|
||||
test_bit(JOURNAL_may_skip_flush, &j->flags))) {
|
||||
w->noflush = true;
|
||||
SET_JSET_NO_FLUSH(w->data, true);
|
||||
|
@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
|
||||
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
|
||||
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
|
||||
int ret = -BCH_ERR_invalid_sb_journal;
|
||||
u64 sum = 0;
|
||||
unsigned nr;
|
||||
unsigned i;
|
||||
struct u64_range *b;
|
||||
@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
|
||||
for (i = 0; i < nr; i++) {
|
||||
b[i].start = le64_to_cpu(journal->d[i].start);
|
||||
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
|
||||
|
||||
if (b[i].end <= b[i].start) {
|
||||
prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
|
||||
le64_to_cpu(journal->d[i].start),
|
||||
le64_to_cpu(journal->d[i].nr));
|
||||
goto err;
|
||||
}
|
||||
|
||||
sum += le64_to_cpu(journal->d[i].nr);
|
||||
}
|
||||
|
||||
sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
|
||||
@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
|
||||
}
|
||||
}
|
||||
|
||||
if (sum > UINT_MAX) {
|
||||
prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
err:
|
||||
kfree(b);
|
||||
|
@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg)
|
||||
if (min_member_capacity == U64_MAX)
|
||||
min_member_capacity = 128 * 2048;
|
||||
|
||||
bch2_trans_unlock_long(ctxt.trans);
|
||||
move_buckets_wait(&ctxt, buckets, true);
|
||||
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
}
|
||||
|
650
libbcachefs/rcu_pending.c
Normal file
650
libbcachefs/rcu_pending.c
Normal file
@ -0,0 +1,650 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
|
||||
|
||||
#include <linux/generic-radix-tree.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/srcu.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include "rcu_pending.h"
|
||||
#include "darray.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
 * Iterate @_i over every element of @_a.  @_i is declared by the macro as a
 * pointer to the element type; @_a must be a real array (not a pointer) so
 * that ARRAY_SIZE() yields its element count.
 */
#define static_array_for_each(_a, _i)					\
	for (typeof(&(_a)[0]) _i = &(_a)[0];				\
	     _i < &(_a)[ARRAY_SIZE(_a)];				\
	     _i++)
|
||||
|
||||
/*
 * Reserved values for rcu_pending.process.  Rather than pointing at a real
 * callback, these small integers are matched by the switch statements in this
 * file to select built-in handling of expired objects.
 */
enum rcu_pending_special {
	RCU_PENDING_KVFREE	= 1,
	RCU_PENDING_CALL_RCU	= 2,
};

/* The same reserved values, cast so they can be stored in / compared with ->process: */
#define RCU_PENDING_KVFREE_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
|
||||
|
||||
/*
 * Flavor-agnostic wrapper: forwards to get_state_synchronize_srcu() when @ssp
 * is non-NULL, otherwise to get_state_synchronize_rcu().
 */
static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
	if (ssp)
		return get_state_synchronize_srcu(ssp);

	return get_state_synchronize_rcu();
}
|
||||
|
||||
/*
 * Flavor-agnostic wrapper: forwards to start_poll_synchronize_srcu() when
 * @ssp is non-NULL, otherwise to start_poll_synchronize_rcu().
 */
static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
	if (ssp)
		return start_poll_synchronize_srcu(ssp);

	return start_poll_synchronize_rcu();
}
|
||||
|
||||
/*
 * Flavor-agnostic wrapper: checks @cookie with poll_state_synchronize_srcu()
 * when @ssp is non-NULL, otherwise with poll_state_synchronize_rcu().
 */
static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
{
	if (ssp)
		return poll_state_synchronize_srcu(ssp, cookie);

	return poll_state_synchronize_rcu(cookie);
}
|
||||
|
||||
/*
 * Flavor-agnostic wrapper: calls srcu_barrier() when @ssp is non-NULL,
 * otherwise rcu_barrier().
 *
 * Written as plain statements: ISO C does not allow "return <expression>;"
 * in a function returning void, even when the expression itself is void.
 */
static inline void __rcu_barrier(struct srcu_struct *ssp)
{
	if (ssp)
		srcu_barrier(ssp);
	else
		rcu_barrier();
}
|
||||
|
||||
static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
|
||||
rcu_callback_t func)
|
||||
{
|
||||
if (ssp)
|
||||
call_srcu(ssp, rhp, func);
|
||||
else
|
||||
call_rcu(rhp, func);
|
||||
}
|
||||
|
||||
struct rcu_pending_seq {
|
||||
/*
|
||||
* We're using a radix tree like a vector - we're just pushing elements
|
||||
* onto the end; we're using a radix tree instead of an actual vector to
|
||||
* avoid reallocation overhead
|
||||
*/
|
||||
GENRADIX(struct rcu_head *) objs;
|
||||
size_t nr;
|
||||
struct rcu_head **cursor;
|
||||
unsigned long seq;
|
||||
};
|
||||
|
||||
/*
 * Singly linked list of rcu_heads (chained through the rcu_head's own next
 * pointer) that all wait on the same grace period cookie.
 */
struct rcu_pending_list {
	struct rcu_head			*head;	/* first entry, NULL if empty */
	struct rcu_head			*tail;	/* last entry, for O(1) append */
	unsigned long			seq;	/* grace period cookie */
};
|
||||
|
||||
struct rcu_pending_pcpu {
|
||||
struct rcu_pending *parent;
|
||||
spinlock_t lock;
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* We can't bound the number of unprocessed gp sequence numbers, and we
|
||||
* can't efficiently merge radix trees for expired grace periods, so we
|
||||
* need darray/vector:
|
||||
*/
|
||||
DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
|
||||
|
||||
/* Third entry is for expired objects: */
|
||||
struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
|
||||
|
||||
struct rcu_head cb;
|
||||
bool cb_armed;
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
|
||||
{
|
||||
if (p->objs.nr)
|
||||
return true;
|
||||
|
||||
static_array_for_each(p->lists, i)
|
||||
if (i->head)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void rcu_pending_list_merge(struct rcu_pending_list *l1,
|
||||
struct rcu_pending_list *l2)
|
||||
{
|
||||
#ifdef __KERNEL__
|
||||
if (!l1->head)
|
||||
l1->head = l2->head;
|
||||
else
|
||||
l1->tail->next = l2->head;
|
||||
#else
|
||||
if (!l1->head)
|
||||
l1->head = l2->head;
|
||||
else
|
||||
l1->tail->next.next = (void *) l2->head;
|
||||
#endif
|
||||
|
||||
l1->tail = l2->tail;
|
||||
l2->head = l2->tail = NULL;
|
||||
}
|
||||
|
||||
static void rcu_pending_list_add(struct rcu_pending_list *l,
|
||||
struct rcu_head *n)
|
||||
{
|
||||
#ifdef __KERNEL__
|
||||
if (!l->head)
|
||||
l->head = n;
|
||||
else
|
||||
l->tail->next = n;
|
||||
l->tail = n;
|
||||
n->next = NULL;
|
||||
#else
|
||||
if (!l->head)
|
||||
l->head = n;
|
||||
else
|
||||
l->tail->next.next = (void *) n;
|
||||
l->tail = n;
|
||||
n->next.next = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void merge_expired_lists(struct rcu_pending_pcpu *p)
|
||||
{
|
||||
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
|
||||
|
||||
for (struct rcu_pending_list *i = p->lists; i < expired; i++)
|
||||
if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
|
||||
rcu_pending_list_merge(expired, i);
|
||||
}
|
||||
|
||||
#ifndef __KERNEL__
/*
 * Userspace stand-in for the kernel's kfree_bulk(): free each of the @nr
 * pointers stored in the array @p.
 *
 * The array pointer must be advanced on every iteration; otherwise the first
 * element is freed @nr times (a double free) and the rest are leaked.
 */
static inline void kfree_bulk(size_t nr, void ** p)
{
	while (nr--)
		kfree(*p++);
}

/* No interrupts to disable in userspace: just give @flags a defined value. */
#define local_irq_save(flags)		\
do {					\
	flags = 0;			\
} while (0)
#endif
|
||||
|
||||
static noinline void __process_finished_items(struct rcu_pending *pending,
|
||||
struct rcu_pending_pcpu *p,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
|
||||
struct rcu_pending_seq objs = {};
|
||||
struct rcu_head *list = NULL;
|
||||
|
||||
if (p->objs.nr &&
|
||||
__poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
|
||||
objs = p->objs.data[0];
|
||||
darray_remove_item(&p->objs, p->objs.data);
|
||||
}
|
||||
|
||||
merge_expired_lists(p);
|
||||
|
||||
list = expired->head;
|
||||
expired->head = expired->tail = NULL;
|
||||
|
||||
spin_unlock_irqrestore(&p->lock, flags);
|
||||
|
||||
switch ((ulong) pending->process) {
|
||||
case RCU_PENDING_KVFREE:
|
||||
for (size_t i = 0; i < objs.nr; ) {
|
||||
size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
|
||||
|
||||
kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
|
||||
i += nr_this_node;
|
||||
}
|
||||
genradix_free(&objs.objs);
|
||||
|
||||
while (list) {
|
||||
struct rcu_head *obj = list;
|
||||
#ifdef __KERNEL__
|
||||
list = obj->next;
|
||||
#else
|
||||
list = (void *) obj->next.next;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* low bit of pointer indicates whether rcu_head needs
|
||||
* to be freed - kvfree_rcu_mightsleep()
|
||||
*/
|
||||
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
|
||||
|
||||
void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
|
||||
kvfree(ptr);
|
||||
|
||||
bool free_head = ((unsigned long) obj->func) & 1UL;
|
||||
if (free_head)
|
||||
kfree(obj);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case RCU_PENDING_CALL_RCU:
|
||||
for (size_t i = 0; i < objs.nr; i++) {
|
||||
struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
|
||||
obj->func(obj);
|
||||
}
|
||||
genradix_free(&objs.objs);
|
||||
|
||||
while (list) {
|
||||
struct rcu_head *obj = list;
|
||||
#ifdef __KERNEL__
|
||||
list = obj->next;
|
||||
#else
|
||||
list = (void *) obj->next.next;
|
||||
#endif
|
||||
obj->func(obj);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
for (size_t i = 0; i < objs.nr; i++)
|
||||
pending->process(pending, *genradix_ptr(&objs.objs, i));
|
||||
genradix_free(&objs.objs);
|
||||
|
||||
while (list) {
|
||||
struct rcu_head *obj = list;
|
||||
#ifdef __KERNEL__
|
||||
list = obj->next;
|
||||
#else
|
||||
list = (void *) obj->next.next;
|
||||
#endif
|
||||
pending->process(pending, obj);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static bool process_finished_items(struct rcu_pending *pending,
|
||||
struct rcu_pending_pcpu *p,
|
||||
unsigned long flags)
|
||||
{
|
||||
/*
|
||||
* XXX: we should grab the gp seq once and avoid multiple function
|
||||
* calls, this is called from __rcu_pending_enqueue() fastpath in
|
||||
* may_sleep==true mode
|
||||
*/
|
||||
if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
|
||||
(p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
|
||||
(p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
|
||||
p->lists[2].head) {
|
||||
__process_finished_items(pending, p, flags);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void rcu_pending_work(struct work_struct *work)
|
||||
{
|
||||
struct rcu_pending_pcpu *p =
|
||||
container_of(work, struct rcu_pending_pcpu, work);
|
||||
struct rcu_pending *pending = p->parent;
|
||||
unsigned long flags;
|
||||
|
||||
do {
|
||||
spin_lock_irqsave(&p->lock, flags);
|
||||
} while (process_finished_items(pending, p, flags));
|
||||
|
||||
spin_unlock_irqrestore(&p->lock, flags);
|
||||
}
|
||||
|
||||
static void rcu_pending_rcu_cb(struct rcu_head *rcu)
|
||||
{
|
||||
struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
|
||||
|
||||
schedule_work_on(p->cpu, &p->work);
|
||||
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&p->lock, flags);
|
||||
if (__rcu_pending_has_pending(p)) {
|
||||
spin_unlock_irqrestore(&p->lock, flags);
|
||||
__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
|
||||
} else {
|
||||
p->cb_armed = false;
|
||||
spin_unlock_irqrestore(&p->lock, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static __always_inline struct rcu_pending_seq *
|
||||
get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
|
||||
{
|
||||
darray_for_each_reverse(p->objs, objs)
|
||||
if (objs->seq == seq)
|
||||
return objs;
|
||||
|
||||
if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
|
||||
return NULL;
|
||||
|
||||
return &darray_last(p->objs);
|
||||
}
|
||||
|
||||
static noinline bool
|
||||
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
|
||||
struct rcu_head *head, void *ptr,
|
||||
unsigned long *flags)
|
||||
{
|
||||
if (ptr) {
|
||||
if (!head) {
|
||||
/*
|
||||
* kvfree_rcu_mightsleep(): we weren't passed an
|
||||
* rcu_head, but we need one: use the low bit of the
|
||||
* ponter to free to flag that the head needs to be
|
||||
* freed as well:
|
||||
*/
|
||||
ptr = (void *)(((unsigned long) ptr)|1UL);
|
||||
head = kmalloc(sizeof(*head), __GFP_NOWARN);
|
||||
if (!head) {
|
||||
spin_unlock_irqrestore(&p->lock, *flags);
|
||||
head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
|
||||
/*
|
||||
* dropped lock, did GFP_KERNEL allocation,
|
||||
* check for gp expiration
|
||||
*/
|
||||
if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
|
||||
kvfree(--ptr);
|
||||
kfree(head);
|
||||
spin_lock_irqsave(&p->lock, *flags);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
head->func = ptr;
|
||||
}
|
||||
again:
|
||||
for (struct rcu_pending_list *i = p->lists;
|
||||
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
|
||||
if (i->seq == seq) {
|
||||
rcu_pending_list_add(i, head);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (struct rcu_pending_list *i = p->lists;
|
||||
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
|
||||
if (!i->head) {
|
||||
i->seq = seq;
|
||||
rcu_pending_list_add(i, head);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
merge_expired_lists(p);
|
||||
goto again;
|
||||
}
|
||||
|
||||
/*
|
||||
* __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
|
||||
* pending->pracess) once grace period elapses.
|
||||
*
|
||||
* Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
|
||||
* back to a linked list.
|
||||
*
|
||||
* - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
|
||||
* process callback
|
||||
*
|
||||
* - If @ptr and @head are both not NULL, we're kvfree_rcu()
|
||||
*
|
||||
* - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
|
||||
*
|
||||
* - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
|
||||
* expired items.
|
||||
*/
|
||||
static __always_inline void
|
||||
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
|
||||
void *ptr, bool may_sleep)
|
||||
{
|
||||
|
||||
struct rcu_pending_pcpu *p;
|
||||
struct rcu_pending_seq *objs;
|
||||
struct genradix_node *new_node = NULL;
|
||||
unsigned long seq, flags;
|
||||
bool start_gp = false;
|
||||
|
||||
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
|
||||
|
||||
local_irq_save(flags);
|
||||
p = this_cpu_ptr(pending->p);
|
||||
spin_lock(&p->lock);
|
||||
seq = __get_state_synchronize_rcu(pending->srcu);
|
||||
restart:
|
||||
if (may_sleep &&
|
||||
unlikely(process_finished_items(pending, p, flags)))
|
||||
goto check_expired;
|
||||
|
||||
/*
|
||||
* In kvfree_rcu() mode, the radix tree is only for slab pointers so
|
||||
* that we can do kfree_bulk() - vmalloc pointers always use the linked
|
||||
* list:
|
||||
*/
|
||||
if (ptr && unlikely(is_vmalloc_addr(ptr)))
|
||||
goto list_add;
|
||||
|
||||
objs = get_object_radix(p, seq);
|
||||
if (unlikely(!objs))
|
||||
goto list_add;
|
||||
|
||||
if (unlikely(!objs->cursor)) {
|
||||
/*
|
||||
* New radix tree nodes must be added under @p->lock because the
|
||||
* tree root is in a darray that can be resized (typically,
|
||||
* genradix supports concurrent unlocked allocation of new
|
||||
* nodes) - hence preallocation and the retry loop:
|
||||
*/
|
||||
objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
|
||||
objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
|
||||
if (unlikely(!objs->cursor)) {
|
||||
if (may_sleep) {
|
||||
spin_unlock_irqrestore(&p->lock, flags);
|
||||
|
||||
gfp_t gfp = GFP_KERNEL;
|
||||
if (!head)
|
||||
gfp |= __GFP_NOFAIL;
|
||||
|
||||
new_node = genradix_alloc_node(gfp);
|
||||
if (!new_node)
|
||||
may_sleep = false;
|
||||
goto check_expired;
|
||||
}
|
||||
list_add:
|
||||
start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
|
||||
goto start_gp;
|
||||
}
|
||||
}
|
||||
|
||||
*objs->cursor++ = ptr ?: head;
|
||||
/* zero cursor if we hit the end of a radix tree node: */
|
||||
if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
|
||||
objs->cursor = NULL;
|
||||
start_gp = !objs->nr;
|
||||
objs->nr++;
|
||||
start_gp:
|
||||
if (unlikely(start_gp)) {
|
||||
/*
|
||||
* We only have one callback (ideally, we would have one for
|
||||
* every outstanding graceperiod) - so if our callback is
|
||||
* already in flight, we may still have to start a grace period
|
||||
* (since we used get_state() above, not start_poll())
|
||||
*/
|
||||
if (!p->cb_armed) {
|
||||
p->cb_armed = true;
|
||||
__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
|
||||
} else {
|
||||
__start_poll_synchronize_rcu(pending->srcu);
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&p->lock, flags);
|
||||
free_node:
|
||||
if (new_node)
|
||||
genradix_free_node(new_node);
|
||||
return;
|
||||
check_expired:
|
||||
if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
|
||||
switch ((ulong) pending->process) {
|
||||
case RCU_PENDING_KVFREE:
|
||||
kvfree(ptr);
|
||||
break;
|
||||
case RCU_PENDING_CALL_RCU:
|
||||
head->func(head);
|
||||
break;
|
||||
default:
|
||||
pending->process(pending, head);
|
||||
break;
|
||||
}
|
||||
goto free_node;
|
||||
}
|
||||
|
||||
local_irq_save(flags);
|
||||
p = this_cpu_ptr(pending->p);
|
||||
spin_lock(&p->lock);
|
||||
goto restart;
|
||||
}
|
||||
|
||||
/*
 * Enqueue @obj on @pending, to be handed to pending->process once a grace
 * period has elapsed.  Uses the may-sleep variant of the enqueue path, with
 * no separate pointer to free.
 */
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
	const bool may_sleep = true;

	__rcu_pending_enqueue(pending, obj, NULL, may_sleep);
}
|
||||
|
||||
static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
|
||||
{
|
||||
struct rcu_head *ret = NULL;
|
||||
|
||||
spin_lock_irq(&p->lock);
|
||||
darray_for_each(p->objs, objs)
|
||||
if (objs->nr) {
|
||||
ret = *genradix_ptr(&objs->objs, --objs->nr);
|
||||
objs->cursor = NULL;
|
||||
if (!objs->nr)
|
||||
genradix_free(&objs->objs);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static_array_for_each(p->lists, i)
|
||||
if (i->head) {
|
||||
ret = i->head;
|
||||
#ifdef __KERNEL__
|
||||
i->head = ret->next;
|
||||
#else
|
||||
i->head = (void *) ret->next.next;
|
||||
#endif
|
||||
if (!i->head)
|
||||
i->tail = NULL;
|
||||
goto out;
|
||||
}
|
||||
out:
|
||||
spin_unlock_irq(&p->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
|
||||
{
|
||||
return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
|
||||
}
|
||||
|
||||
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
|
||||
{
|
||||
struct rcu_head *ret = rcu_pending_dequeue(pending);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
int cpu;
|
||||
for_each_possible_cpu(cpu) {
|
||||
ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
|
||||
{
|
||||
int cpu;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
||||
spin_lock_irq(&p->lock);
|
||||
if (__rcu_pending_has_pending(p) || p->cb_armed) {
|
||||
spin_unlock_irq(&p->lock);
|
||||
return true;
|
||||
}
|
||||
spin_unlock_irq(&p->lock);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void rcu_pending_exit(struct rcu_pending *pending)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
if (!pending->p)
|
||||
return;
|
||||
|
||||
while (rcu_pending_has_pending_or_armed(pending)) {
|
||||
__rcu_barrier(pending->srcu);
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
||||
flush_work(&p->work);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
||||
flush_work(&p->work);
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
||||
|
||||
static_array_for_each(p->lists, i)
|
||||
WARN_ON(i->head);
|
||||
WARN_ON(p->objs.nr);
|
||||
darray_exit(&p->objs);
|
||||
}
|
||||
free_percpu(pending->p);
|
||||
}
|
||||
|
||||
/**
|
||||
* rcu_pending_init: - initialize a rcu_pending
|
||||
*
|
||||
* @pending: Object to init
|
||||
* @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
|
||||
* RCU flavor
|
||||
* @process: Callback function invoked on objects once their RCU barriers
|
||||
* have completed; if NULL, kvfree() is used.
|
||||
*/
|
||||
int rcu_pending_init(struct rcu_pending *pending,
|
||||
struct srcu_struct *srcu,
|
||||
rcu_pending_process_fn process)
|
||||
{
|
||||
pending->p = alloc_percpu(struct rcu_pending_pcpu);
|
||||
if (!pending->p)
|
||||
return -ENOMEM;
|
||||
|
||||
int cpu;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
||||
p->parent = pending;
|
||||
p->cpu = cpu;
|
||||
spin_lock_init(&p->lock);
|
||||
darray_init(&p->objs);
|
||||
INIT_WORK(&p->work, rcu_pending_work);
|
||||
}
|
||||
|
||||
pending->srcu = srcu;
|
||||
pending->process = process;
|
||||
|
||||
return 0;
|
||||
}
|
27
libbcachefs/rcu_pending.h
Normal file
27
libbcachefs/rcu_pending.h
Normal file
@ -0,0 +1,27 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_RCU_PENDING_H
|
||||
#define _LINUX_RCU_PENDING_H
|
||||
|
||||
#include <linux/rcupdate.h>
|
||||
|
||||
struct rcu_pending;
|
||||
typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
|
||||
|
||||
struct rcu_pending_pcpu;
|
||||
|
||||
struct rcu_pending {
|
||||
struct rcu_pending_pcpu __percpu *p;
|
||||
struct srcu_struct *srcu;
|
||||
rcu_pending_process_fn process;
|
||||
};
|
||||
|
||||
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
|
||||
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
|
||||
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
|
||||
|
||||
void rcu_pending_exit(struct rcu_pending *pending);
|
||||
int rcu_pending_init(struct rcu_pending *pending,
|
||||
struct srcu_struct *srcu,
|
||||
rcu_pending_process_fn process);
|
||||
|
||||
#endif /* _LINUX_RCU_PENDING_H */
|
@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
|
||||
const struct journal_key *l = *((const struct journal_key **)_l);
|
||||
const struct journal_key *r = *((const struct journal_key **)_r);
|
||||
|
||||
return cmp_int(l->journal_seq, r->journal_seq);
|
||||
/*
|
||||
	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
|
||||
*
|
||||
* journal_seq == 0 means that the key comes from early repair, and
|
||||
* should be inserted last so as to avoid overflowing the journal
|
||||
*/
|
||||
return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
|
||||
}
|
||||
|
||||
int bch2_journal_replay(struct bch_fs *c)
|
||||
@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c)
|
||||
}
|
||||
}
|
||||
|
||||
bch2_trans_unlock_long(trans);
|
||||
/*
|
||||
* Now, replay any remaining keys in the order in which they appear in
|
||||
* the journal, unpinning those journal entries as we go:
|
||||
|
@ -81,8 +81,7 @@
|
||||
x(trans_restart_write_buffer_flush, 75) \
|
||||
x(trans_restart_split_race, 76) \
|
||||
x(write_buffer_flush_slowpath, 77) \
|
||||
x(write_buffer_flush_sync, 78) \
|
||||
x(trans_restart_freeing_inode, 79)
|
||||
x(write_buffer_flush_sync, 78)
|
||||
|
||||
enum bch_persistent_counters {
|
||||
#define x(t, n, ...) BCH_COUNTER_##t,
|
||||
|
@ -74,6 +74,9 @@
|
||||
BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
|
||||
BCH_FSCK_ERR_accounting_key_junk_at_end) \
|
||||
x(disk_accounting_inum, \
|
||||
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
||||
BCH_FSCK_ERR_accounting_mismatch) \
|
||||
x(rebalance_work_acct_fix, \
|
||||
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
||||
BCH_FSCK_ERR_accounting_mismatch)
|
||||
|
||||
@ -108,7 +111,10 @@
|
||||
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
|
||||
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
|
||||
BCH_FSCK_ERR_accounting_replicas_not_marked, \
|
||||
BCH_FSCK_ERR_bkey_version_in_future)
|
||||
BCH_FSCK_ERR_bkey_version_in_future) \
|
||||
x(rebalance_work_acct_fix, \
|
||||
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
||||
BCH_FSCK_ERR_accounting_mismatch)
|
||||
|
||||
struct upgrade_downgrade_entry {
|
||||
u64 recovery_passes;
|
||||
|
@ -23,7 +23,7 @@ enum bch_fsck_flags {
|
||||
x(jset_past_bucket_end, 9, 0) \
|
||||
x(jset_seq_blacklisted, 10, 0) \
|
||||
x(journal_entries_missing, 11, 0) \
|
||||
x(journal_entry_replicas_not_marked, 12, 0) \
|
||||
x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
|
||||
x(journal_entry_past_jset_end, 13, 0) \
|
||||
x(journal_entry_replicas_data_mismatch, 14, 0) \
|
||||
x(journal_entry_bkey_u64s_0, 15, 0) \
|
||||
|
@ -233,7 +233,7 @@ write_attribute(perf_test);
|
||||
|
||||
#define x(_name) \
|
||||
static struct attribute sysfs_time_stat_##_name = \
|
||||
{ .name = #_name, .mode = 0444 };
|
||||
{ .name = #_name, .mode = 0644 };
|
||||
BCH_TIME_STATS()
|
||||
#undef x
|
||||
|
||||
@ -722,6 +722,13 @@ SHOW(bch2_fs_time_stats)
|
||||
|
||||
STORE(bch2_fs_time_stats)
|
||||
{
|
||||
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
|
||||
|
||||
#define x(name) \
|
||||
if (attr == &sysfs_time_stat_##name) \
|
||||
bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
|
||||
BCH_TIME_STATS()
|
||||
#undef x
|
||||
return size;
|
||||
}
|
||||
SYSFS_OPS(bch2_fs_time_stats);
|
||||
|
@ -387,7 +387,7 @@ again:
|
||||
seen = buf->buf.nr;
|
||||
char *n = memchr(buf->buf.data, '\n', seen);
|
||||
|
||||
if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
|
||||
if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
|
||||
spin_unlock(&buf->lock);
|
||||
return -ETIME;
|
||||
}
|
||||
|
@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_time_stats_reset(struct bch2_time_stats *stats)
|
||||
{
|
||||
spin_lock_irq(&stats->lock);
|
||||
unsigned offset = offsetof(struct bch2_time_stats, min_duration);
|
||||
memset((void *) stats + offset, 0, sizeof(*stats) - offset);
|
||||
|
||||
if (stats->buffer) {
|
||||
int cpu;
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu_ptr(stats->buffer, cpu)->nr = 0;
|
||||
}
|
||||
spin_unlock_irq(&stats->lock);
|
||||
}
|
||||
|
||||
void bch2_time_stats_exit(struct bch2_time_stats *stats)
|
||||
{
|
||||
free_percpu(stats->buffer);
|
||||
|
@ -70,6 +70,7 @@ struct time_stat_buffer {
|
||||
struct bch2_time_stats {
|
||||
spinlock_t lock;
|
||||
bool have_quantiles;
|
||||
struct time_stat_buffer __percpu *buffer;
|
||||
/* all fields are in nanoseconds */
|
||||
u64 min_duration;
|
||||
u64 max_duration;
|
||||
@ -87,7 +88,6 @@ struct bch2_time_stats {
|
||||
|
||||
struct mean_and_variance_weighted duration_stats_weighted;
|
||||
struct mean_and_variance_weighted freq_stats_weighted;
|
||||
struct time_stat_buffer __percpu *buffer;
|
||||
};
|
||||
|
||||
struct bch2_time_stats_quantiles {
|
||||
@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
|
||||
return false;
|
||||
}
|
||||
|
||||
void bch2_time_stats_reset(struct bch2_time_stats *);
|
||||
void bch2_time_stats_exit(struct bch2_time_stats *);
|
||||
void bch2_time_stats_init(struct bch2_time_stats *);
|
||||
|
||||
|
@ -1316,12 +1316,6 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
|
||||
__entry->new_u64s)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(transaction_event, trans_restart_freeing_inode,
|
||||
TP_PROTO(struct btree_trans *trans,
|
||||
unsigned long caller_ip),
|
||||
TP_ARGS(trans, caller_ip)
|
||||
);
|
||||
|
||||
TRACE_EVENT(path_downgrade,
|
||||
TP_PROTO(struct btree_trans *trans,
|
||||
unsigned long caller_ip,
|
||||
|
@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
|
||||
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
|
||||
|
||||
prt_printf(out, "\tsince mount\r\trecent\r\n");
|
||||
prt_printf(out, "recent");
|
||||
|
||||
printbuf_tabstops_reset(out);
|
||||
printbuf_tabstop_push(out, out->indent + 20);
|
||||
|
@ -5,99 +5,31 @@
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/kmemleak.h>
|
||||
|
||||
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
|
||||
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
|
||||
|
||||
struct genradix_node {
|
||||
union {
|
||||
/* Interior node: */
|
||||
struct genradix_node *children[GENRADIX_ARY];
|
||||
|
||||
/* Leaf: */
|
||||
u8 data[GENRADIX_NODE_SIZE];
|
||||
};
|
||||
};
|
||||
|
||||
static inline int genradix_depth_shift(unsigned depth)
|
||||
{
|
||||
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns size (of data, in bytes) that a tree of a given depth holds:
|
||||
*/
|
||||
static inline size_t genradix_depth_size(unsigned depth)
|
||||
{
|
||||
return 1UL << genradix_depth_shift(depth);
|
||||
}
|
||||
|
||||
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
|
||||
#define GENRADIX_MAX_DEPTH \
|
||||
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
|
||||
|
||||
#define GENRADIX_DEPTH_MASK \
|
||||
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
|
||||
|
||||
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
|
||||
{
|
||||
return (unsigned long) r & GENRADIX_DEPTH_MASK;
|
||||
}
|
||||
|
||||
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
|
||||
{
|
||||
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns pointer to the specified byte @offset within @radix, or NULL if not
|
||||
* allocated
|
||||
*/
|
||||
void *__genradix_ptr(struct __genradix *radix, size_t offset)
|
||||
{
|
||||
struct genradix_root *r = READ_ONCE(radix->root);
|
||||
struct genradix_node *n = genradix_root_to_node(r);
|
||||
unsigned level = genradix_root_to_depth(r);
|
||||
|
||||
if (ilog2(offset) >= genradix_depth_shift(level))
|
||||
return NULL;
|
||||
|
||||
while (1) {
|
||||
if (!n)
|
||||
return NULL;
|
||||
if (!level)
|
||||
break;
|
||||
|
||||
level--;
|
||||
|
||||
n = n->children[offset >> genradix_depth_shift(level)];
|
||||
offset &= genradix_depth_size(level) - 1;
|
||||
}
|
||||
|
||||
return &n->data[offset];
|
||||
return __genradix_ptr_inlined(radix, offset);
|
||||
}
|
||||
EXPORT_SYMBOL(__genradix_ptr);
|
||||
|
||||
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
|
||||
{
|
||||
return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
|
||||
}
|
||||
|
||||
static inline void genradix_free_node(struct genradix_node *node)
|
||||
{
|
||||
kfree(node);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns pointer to the specified byte @offset within @radix, allocating it if
|
||||
* necessary - newly allocated slots are always zeroed out:
|
||||
*/
|
||||
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
|
||||
struct genradix_node **preallocated,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
struct genradix_root *v = READ_ONCE(radix->root);
|
||||
struct genradix_node *n, *new_node = NULL;
|
||||
unsigned level;
|
||||
|
||||
if (preallocated)
|
||||
swap(new_node, *preallocated);
|
||||
|
||||
/* Increase tree depth if necessary: */
|
||||
while (1) {
|
||||
struct genradix_root *r = v, *new_root;
|
||||
@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
|
||||
size_t offset;
|
||||
|
||||
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
|
||||
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
|
||||
if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
|
Loading…
Reference in New Issue
Block a user