Update bcachefs sources to 176718966e bcachefs: Re-enable hash_redo_key()

Kent Overstreet 2022-09-04 14:21:58 -04:00
parent d2c2c5954c
commit d01f633041
37 changed files with 1695 additions and 1360 deletions

View File

@ -1 +1 @@
a7694865a3008d6752370caee2ed3c64c1b0f973
176718966e14c5f832ead8cea2e0e45aba51f5ef

View File

@ -10,6 +10,7 @@
#define list_add(n, h) cds_list_add(n, h)
#define list_add_tail(n, h) cds_list_add_tail(n, h)
#define __list_del_entry(l) cds_list_del(l)
#define __list_del(p, n) __cds_list_del(p, n)
#define list_del(l) cds_list_del(l)
#define list_del_init(l) cds_list_del_init(l)
#define list_replace(o, n) cds_list_replace(o, n)

View File

@ -59,7 +59,6 @@
*/
#include <linux/lockdep.h>
#include <linux/osq_lock.h>
#include <linux/sched.h>
#include <linux/types.h>
@ -105,18 +104,23 @@ enum six_lock_type {
struct six_lock {
union six_lock_state state;
unsigned intent_lock_recurse;
struct task_struct *owner;
struct optimistic_spin_queue osq;
unsigned __percpu *readers;
unsigned intent_lock_recurse;
raw_spinlock_t wait_lock;
struct list_head wait_list[2];
struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
struct six_lock_waiter {
struct list_head list;
struct task_struct *task;
enum six_lock_type lock_want;
bool lock_acquired;
};
typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
static __always_inline void __six_lock_init(struct six_lock *lock,
@ -125,8 +129,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock,
{
atomic64_set(&lock->state.counter, 0);
raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *) lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
@ -146,6 +149,8 @@ do { \
bool six_trylock_##type(struct six_lock *); \
bool six_relock_##type(struct six_lock *, u32); \
int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \
six_lock_should_sleep_fn, void *); \
void six_unlock_##type(struct six_lock *);
__SIX_LOCK(read)
@ -182,6 +187,13 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
}
static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
struct six_lock_waiter *wait,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
}
static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
SIX_LOCK_DISPATCH(type, six_unlock, lock);
@ -196,13 +208,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
void six_lock_pcpu_free_rcu(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
struct six_lock_count {
unsigned read;
unsigned intent;
unsigned n[3];
};
struct six_lock_count six_lock_counts(struct six_lock *);
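With this change, blocking acquisition can go through an on-stack struct six_lock_waiter, and held-lock totals come back as an n[] array indexed by enum six_lock_type rather than separate read/intent fields. A minimal sketch of a caller, assuming a NULL should_sleep_fn means "just block"; the function name is illustrative:

static int example_take_intent(struct six_lock *lock)
{
	struct six_lock_waiter wait;	/* queued on lock->wait_list while blocked */
	struct six_lock_count cnt;
	int ret;

	/* same semantics as six_lock_intent(), but with an explicit waiter entry */
	ret = six_lock_waiter_intent(lock, &wait, NULL, NULL);
	if (ret)
		return ret;

	cnt = six_lock_counts(lock);
	/* cnt.n[SIX_LOCK_read], cnt.n[SIX_LOCK_intent], cnt.n[SIX_LOCK_write] */

	six_unlock_intent(lock);
	return 0;
}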

View File

@ -18,7 +18,7 @@
__entry->dst##_snapshot = (src).snapshot
DECLARE_EVENT_CLASS(bpos,
TP_PROTO(struct bpos *p),
TP_PROTO(const struct bpos *p),
TP_ARGS(p),
TP_STRUCT__entry(
@ -52,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey,
__entry->offset, __entry->size)
);
DECLARE_EVENT_CLASS(btree_node,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u8, level )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->level = b->c.level;
__entry->btree_id = b->c.btree_id;
TRACE_BPOS_assign(pos, b->key.k.p);
),
TP_printk("%d,%d %u %s %llu:%llu:%u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->level,
bch2_btree_ids[__entry->btree_id],
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@ -112,7 +137,7 @@ TRACE_EVENT(write_super,
/* io.c: */
DEFINE_EVENT(bio, read_split,
DEFINE_EVENT(bio, read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@ -122,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce,
TP_ARGS(bio)
);
DEFINE_EVENT(bio, read_split,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
DEFINE_EVENT(bio, read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
DEFINE_EVENT(bio, promote,
DEFINE_EVENT(bio, read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@ -220,48 +250,68 @@ TRACE_EVENT(journal_reclaim_finish,
__entry->nr_flushed)
);
/* allocator: */
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
TP_PROTO(struct bpos *p),
TP_PROTO(const struct bpos *p),
TP_ARGS(p)
);
/* Btree */
/* Btree cache: */
DECLARE_EVENT_CLASS(btree_node,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b),
TRACE_EVENT(btree_cache_scan,
TP_PROTO(long nr_to_scan, long can_free, long ret),
TP_ARGS(nr_to_scan, can_free, ret),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u8, level )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
__field(long, nr_to_scan )
__field(long, can_free )
__field(long, ret )
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->level = b->c.level;
__entry->btree_id = b->c.btree_id;
TRACE_BPOS_assign(pos, b->key.k.p);
__entry->nr_to_scan = nr_to_scan;
__entry->can_free = can_free;
__entry->ret = ret;
),
TP_printk("%d,%d %u %s %llu:%llu:%u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->level,
bch2_btree_ids[__entry->btree_id],
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
TP_printk("scanned for %li nodes, can free %li, ret %li",
__entry->nr_to_scan, __entry->can_free, __entry->ret)
);
DEFINE_EVENT(btree_node, btree_read,
DEFINE_EVENT(btree_node, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
TRACE_EVENT(btree_write,
DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
/* Btree */
DEFINE_EVENT(btree_node, btree_node_read,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
TRACE_EVENT(btree_node_write,
TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
TP_ARGS(b, bytes, sectors),
@ -291,31 +341,6 @@ DEFINE_EVENT(btree_node, btree_node_free,
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_node, btree_node_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, btree_node_cannibalize,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
TRACE_EVENT(btree_reserve_get_fail,
TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
@ -323,7 +348,7 @@ TRACE_EVENT(btree_reserve_get_fail,
TP_ARGS(trans_fn, caller_ip, required),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(size_t, required )
),
@ -340,52 +365,32 @@ TRACE_EVENT(btree_reserve_get_fail,
__entry->required)
);
DEFINE_EVENT(btree_node, btree_split,
DEFINE_EVENT(btree_node, btree_node_compact,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_node, btree_compact,
DEFINE_EVENT(btree_node, btree_node_merge,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_node, btree_merge,
DEFINE_EVENT(btree_node, btree_node_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_node, btree_rewrite,
DEFINE_EVENT(btree_node, btree_node_rewrite,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_node, btree_set_root,
DEFINE_EVENT(btree_node, btree_node_set_root,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
TRACE_EVENT(btree_cache_scan,
TP_PROTO(long nr_to_scan, long can_free, long ret),
TP_ARGS(nr_to_scan, can_free, ret),
TP_STRUCT__entry(
__field(long, nr_to_scan )
__field(long, can_free )
__field(long, ret )
),
TP_fast_assign(
__entry->nr_to_scan = nr_to_scan;
__entry->can_free = can_free;
__entry->ret = ret;
),
TP_printk("scanned for %li nodes, can free %li, ret %li",
__entry->nr_to_scan, __entry->can_free, __entry->ret)
);
TRACE_EVENT(btree_node_relock_fail,
TRACE_EVENT(btree_path_relock_fail,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
@ -393,26 +398,31 @@ TRACE_EVENT(btree_node_relock_fail,
TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
__field(unsigned long, node )
__array(char, node, 24 )
__field(u32, iter_lock_seq )
__field(u32, node_lock_seq )
),
TP_fast_assign(
struct btree *b = btree_path_node(path, level);
strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(pos, path->pos);
__entry->node = (unsigned long) btree_path_node(path, level);
if (IS_ERR(b))
strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
else
scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %s iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_ids[__entry->btree_id],
@ -424,7 +434,7 @@ TRACE_EVENT(btree_node_relock_fail,
__entry->node_lock_seq)
);
TRACE_EVENT(btree_node_upgrade_fail,
TRACE_EVENT(btree_path_upgrade_fail,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
@ -432,7 +442,7 @@ TRACE_EVENT(btree_node_upgrade_fail,
TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
@ -452,12 +462,12 @@ TRACE_EVENT(btree_node_upgrade_fail,
TRACE_BPOS_assign(pos, path->pos);
__entry->locked = btree_node_locked(path, level);
c = bch2_btree_node_lock_counts(trans, NULL, path->l[level].b, level),
__entry->self_read_count = c.read;
__entry->self_intent_count = c.intent;
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
c = six_lock_counts(&path->l[level].b->c.lock);
__entry->read_count = c.read;
__entry->intent_count = c.intent;
__entry->read_count = c.n[SIX_LOCK_read];
__entry->intent_count = c.n[SIX_LOCK_read];
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u",
@ -599,7 +609,7 @@ TRACE_EVENT(discard_buckets,
__entry->err)
);
TRACE_EVENT(invalidate_bucket,
TRACE_EVENT(bucket_invalidate,
TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
TP_ARGS(c, dev, bucket, sectors),
@ -625,17 +635,27 @@ TRACE_EVENT(invalidate_bucket,
/* Moving IO */
DEFINE_EVENT(bkey, move_extent,
DEFINE_EVENT(bkey, move_extent_read,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(bkey, move_alloc_mem_fail,
DEFINE_EVENT(bkey, move_extent_write,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(bkey, move_race,
DEFINE_EVENT(bkey, move_extent_finish,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(bkey, move_extent_race,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
@ -714,13 +734,15 @@ TRACE_EVENT(copygc_wait,
__entry->wait_amount, __entry->until)
);
/* btree transactions: */
DECLARE_EVENT_CLASS(transaction_event,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
@ -738,7 +760,7 @@ DEFINE_EVENT(transaction_event, transaction_commit,
TP_ARGS(trans, caller_ip)
);
DEFINE_EVENT(transaction_event, transaction_restart_injected,
DEFINE_EVENT(transaction_event, trans_restart_injected,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
@ -756,10 +778,28 @@ DEFINE_EVENT(transaction_event, trans_restart_journal_res_get,
TP_ARGS(trans, caller_ip)
);
DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get,
TRACE_EVENT(trans_restart_journal_preres_get,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
unsigned long caller_ip,
unsigned flags),
TP_ARGS(trans, caller_ip, flags),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(unsigned, flags )
),
TP_fast_assign(
strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->flags = flags;
),
TP_printk("%s %pS %x", __entry->trans_fn,
(void *) __entry->caller_ip,
__entry->flags)
);
DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim,
@ -805,7 +845,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
TP_ARGS(trans, caller_ip, path),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
@ -883,7 +923,7 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_event, transaction_restart_key_cache_upgrade,
DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
@ -935,7 +975,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
have, want, want_pos),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, in_traverse_all )
__field(u8, reason )
@ -982,7 +1022,7 @@ TRACE_EVENT(trans_restart_would_deadlock_write,
TP_ARGS(trans),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
),
TP_fast_assign(
@ -999,7 +1039,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
TP_ARGS(trans, caller_ip, bytes),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(unsigned long, bytes )
),
@ -1025,7 +1065,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
TP_STRUCT__entry(
__array(char, trans_fn, 24 )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(enum btree_id, btree_id )
TRACE_BPOS_entries(pos)
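Many events above are renamed so the tracepoint shares a token with its persistent counter (btree_read becomes btree_node_read, invalidate_bucket becomes bucket_invalidate, and so on), because the trace_and_count() wrapper added in bcachefs.h derives both names from a single argument. As a hedged illustration only, a new event hooked to the shared btree_node class would be declared like this (the event name is hypothetical):

DEFINE_EVENT(btree_node, btree_node_example,
	TP_PROTO(struct bch_fs *c, struct btree *b),
	TP_ARGS(c, b)
);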

View File

@ -1217,8 +1217,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors);
this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]);
trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
--*nr_to_invalidate;
out:
bch2_trans_iter_exit(trans, &alloc_iter);

View File

@ -268,7 +268,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
spin_unlock(&c->freelist_lock);
trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve]);
return ob;
}
@ -575,20 +575,19 @@ err:
if (!ob)
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
if (IS_ERR(ob)) {
trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve],
usage.d[BCH_DATA_free].buckets,
avail,
bch2_copygc_wait_amount(c),
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
buckets_seen,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl == NULL,
bch2_err_str(PTR_ERR(ob)));
atomic_long_inc(&c->bucket_alloc_fail);
}
if (IS_ERR(ob))
trace_and_count(c, bucket_alloc_fail,
ca, bch2_alloc_reserves[reserve],
usage.d[BCH_DATA_free].buckets,
avail,
bch2_copygc_wait_amount(c),
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
buckets_seen,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl == NULL,
bch2_err_str(PTR_ERR(ob)));
return ob;
}

View File

@ -212,6 +212,12 @@
#define dynamic_fault(...) 0
#define race_fault(...) 0
#define trace_and_count(_c, _name, ...) \
do { \
this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
trace_##_name(__VA_ARGS__); \
} while (0)
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
@ -329,9 +335,6 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
@ -535,6 +538,7 @@ struct btree_transaction_stats {
struct mutex lock;
struct time_stats lock_hold_times;
unsigned nr_max_paths;
unsigned max_mem;
char *max_paths_text;
};
@ -917,12 +921,6 @@ struct bch_fs {
u64 last_bucket_seq_cleanup;
/* TODO rewrite as counters - The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
atomic_long_t bucket_alloc_fail;
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;
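The trace_and_count() macro above keeps the per-filesystem counter and the tracepoint in lockstep. Illustrative only, a converted call site reduces to the following (using the btree_node_read event as the example):

static void example_count_and_trace(struct bch_fs *c, struct btree *b)
{
	trace_and_count(c, btree_node_read, c, b);

	/*
	 * ...which, per the macro definition above, expands to roughly:
	 *
	 *	this_cpu_inc(c->counters[BCH_COUNTER_btree_node_read]);
	 *	trace_btree_node_read(c, b);
	 */
}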

View File

@ -1337,12 +1337,81 @@ struct bch_sb_field_disk_groups {
/* BCH_SB_FIELD_counters */
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0) \
x(io_write, 1) \
x(io_move, 2) \
x(bucket_invalidate, 3) \
x(bucket_discard, 4)
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0) \
x(io_write, 1) \
x(io_move, 2) \
x(bucket_invalidate, 3) \
x(bucket_discard, 4) \
x(bucket_alloc, 5) \
x(bucket_alloc_fail, 6) \
x(btree_cache_scan, 7) \
x(btree_cache_reap, 8) \
x(btree_cache_cannibalize, 9) \
x(btree_cache_cannibalize_lock, 10) \
x(btree_cache_cannibalize_lock_fail, 11) \
x(btree_cache_cannibalize_unlock, 12) \
x(btree_node_write, 13) \
x(btree_node_read, 14) \
x(btree_node_compact, 15) \
x(btree_node_merge, 16) \
x(btree_node_split, 17) \
x(btree_node_rewrite, 18) \
x(btree_node_alloc, 19) \
x(btree_node_free, 20) \
x(btree_node_set_root, 21) \
x(btree_path_relock_fail, 22) \
x(btree_path_upgrade_fail, 23) \
x(btree_reserve_get_fail, 24) \
x(journal_entry_full, 25) \
x(journal_full, 26) \
x(journal_reclaim_finish, 27) \
x(journal_reclaim_start, 28) \
x(journal_write, 29) \
x(read_promote, 30) \
x(read_bounce, 31) \
x(read_split, 33) \
x(read_retry, 32) \
x(read_reuse_race, 34) \
x(move_extent_read, 35) \
x(move_extent_write, 36) \
x(move_extent_finish, 37) \
x(move_extent_race, 38) \
x(move_extent_alloc_mem_fail, 39) \
x(copygc, 40) \
x(copygc_wait, 41) \
x(gc_gens_end, 42) \
x(gc_gens_start, 43) \
x(trans_blocked_journal_reclaim, 44) \
x(trans_restart_btree_node_reused, 45) \
x(trans_restart_btree_node_split, 46) \
x(trans_restart_fault_inject, 47) \
x(trans_restart_iter_upgrade, 48) \
x(trans_restart_journal_preres_get, 49) \
x(trans_restart_journal_reclaim, 50) \
x(trans_restart_journal_res_get, 51) \
x(trans_restart_key_cache_key_realloced, 52) \
x(trans_restart_key_cache_raced, 53) \
x(trans_restart_mark_replicas, 54) \
x(trans_restart_mem_realloced, 55) \
x(trans_restart_memory_allocation_failure, 56) \
x(trans_restart_relock, 57) \
x(trans_restart_relock_after_fill, 58) \
x(trans_restart_relock_key_cache_fill, 59) \
x(trans_restart_relock_next_node, 60) \
x(trans_restart_relock_parent_for_fill, 61) \
x(trans_restart_relock_path, 62) \
x(trans_restart_relock_path_intent, 63) \
x(trans_restart_too_many_iters, 64) \
x(trans_restart_traverse, 65) \
x(trans_restart_upgrade, 66) \
x(trans_restart_would_deadlock, 67) \
x(trans_restart_would_deadlock_write, 68) \
x(trans_restart_injected, 69) \
x(trans_restart_key_cache_upgrade, 70) \
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
x(write_super, 73)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
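Because the counter table is an x-macro list, parallel definitions can be generated from it, which is how the enum above stays in sync with the superblock counters. A hedged sketch of deriving a name table the same way (the array name is illustrative, not necessarily what the tree uses):

static const char * const example_counter_names[] = {
#define x(t, n, ...)	#t,
	BCH_PERSISTENT_COUNTERS()
#undef x
	NULL
};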

View File

@ -14,8 +14,6 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
struct lock_class_key bch2_btree_node_lock_key;
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
BTREE_FLAGS()
@ -254,7 +252,7 @@ wait_on_io:
}
out:
if (b->hash_val && !ret)
trace_btree_node_reap(c, b);
trace_and_count(c, btree_cache_reap, c, b);
return ret;
out_unlock:
six_unlock_write(&b->c.lock);
@ -378,7 +376,7 @@ out:
ret = freed;
memalloc_nofs_restore(flags);
out_norestore:
trace_btree_cache_scan(sc->nr_to_scan, can_free, ret);
trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
return ret;
}
@ -514,7 +512,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
trace_btree_node_cannibalize_unlock(c);
trace_and_count(c, btree_cache_cannibalize_unlock, c);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
@ -530,7 +528,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
trace_btree_node_cannibalize_lock_fail(c);
trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
return -ENOMEM;
}
@ -544,11 +542,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
trace_btree_node_cannibalize_lock_fail(c);
trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
return -EAGAIN;
success:
trace_btree_node_cannibalize_lock(c);
trace_and_count(c, btree_cache_cannibalize_lock, c);
return 0;
}
@ -672,7 +670,7 @@ err_locked:
mutex_unlock(&bc->lock);
trace_btree_node_cannibalize(c);
trace_and_count(c, btree_cache_cannibalize, c);
goto out;
}
@ -701,7 +699,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
* been freed:
*/
if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
trace_trans_restart_relock_parent_for_fill(trans, _THIS_IP_, path);
trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
}
@ -709,7 +707,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
if (trans && b == ERR_PTR(-ENOMEM)) {
trans->memory_allocation_failure = true;
trace_trans_restart_memory_allocation_failure(trans, _THIS_IP_, path);
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
}
@ -758,7 +756,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
if (trans)
trace_trans_restart_relock_after_fill(trans, _THIS_IP_, path);
trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
@ -896,7 +894,7 @@ lock_node:
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(trans, path, level + 1);
ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type,
ret = btree_node_lock(trans, path, &b->c, k->k.p, level, lock_type,
lock_node_check_fn, (void *) k, trace_ip);
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused))
@ -913,7 +911,7 @@ lock_node:
if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
trace_trans_restart_btree_node_reused(trans, trace_ip, path);
trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
}
}
@ -969,12 +967,13 @@ lock_node:
return b;
}
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
bool nofill)
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
@ -1008,9 +1007,14 @@ retry:
goto out;
} else {
lock_node:
ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
if (ret)
goto retry;
ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused))
goto retry;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ERR_PTR(ret);
BUG();
}
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->c.btree_id != btree_id ||
@ -1072,8 +1076,9 @@ int bch2_btree_node_prefetch(struct bch_fs *c,
return PTR_ERR_OR_ZERO(b);
}
void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
@ -1089,8 +1094,8 @@ wait_on_io:
__bch2_btree_node_wait_on_read(b);
__bch2_btree_node_wait_on_write(b);
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, 0);
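A pattern worth noting in this file: raw six_lock_intent()/six_lock_write() calls on node locks become btree_node_lock_nopath_nofail() calls, so even cache code with no btree_path in hand takes node locks through the btree_trans. A sketch of that pattern, assuming the nopath helpers otherwise behave like their six_lock counterparts (the function name below is illustrative):

static void example_write_lock_node(struct btree_trans *trans, struct btree *b)
{
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	/* ... modify the node under the write lock ... */

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}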

View File

@ -5,8 +5,6 @@
#include "bcachefs.h"
#include "btree_types.h"
extern struct lock_class_key bch2_btree_node_lock_key;
extern const char * const bch2_btree_node_flags[];
struct btree_iter;
@ -28,13 +26,13 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
enum btree_id, unsigned, bool);
int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
const struct bkey_i *, enum btree_id, unsigned);
void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);

View File

@ -165,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
static void bch2_btree_node_update_key_early(struct bch_fs *c,
static void bch2_btree_node_update_key_early(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_i *new)
{
struct bch_fs *c = trans->c;
struct btree *b;
struct bkey_buf tmp;
int ret;
@ -176,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c,
bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_reassemble(&tmp, c, old);
b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
if (!IS_ERR_OR_NULL(b)) {
mutex_lock(&c->btree_cache.lock);
@ -352,8 +353,9 @@ fsck_err:
return ret;
}
static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
@ -378,7 +380,7 @@ again:
bch2_btree_and_journal_iter_advance(&iter);
bch2_bkey_buf_reassemble(&cur_k, c, k);
cur = bch2_btree_node_get_noiter(c, cur_k.k,
cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
@ -392,7 +394,7 @@ again:
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(c, cur_k.k);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
@ -411,7 +413,7 @@ again:
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
bch2_btree_node_evict(c, cur_k.k);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
@ -425,7 +427,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
bch2_btree_node_evict(c, prev_k.k);
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
if (ret)
@ -465,7 +467,7 @@ again:
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
cur = bch2_btree_node_get_noiter(c, cur_k.k,
cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
@ -476,12 +478,12 @@ again:
goto err;
}
ret = bch2_btree_repair_topology_recurse(c, cur);
ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
cur = NULL;
if (ret == DROP_THIS_NODE) {
bch2_btree_node_evict(c, cur_k.k);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
dropped_children = true;
@ -522,18 +524,21 @@ fsck_err:
static int bch2_repair_topology(struct bch_fs *c)
{
struct btree_trans trans;
struct btree *b;
unsigned i;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < BTREE_ID_NR && !ret; i++) {
b = c->btree_roots[i].b;
if (btree_node_fake(b))
continue;
six_lock_read(&b->c.lock, NULL, NULL);
ret = bch2_btree_repair_topology_recurse(c, b);
six_unlock_read(&b->c.lock);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
ret = bch2_btree_repair_topology_recurse(&trans, b);
if (ret == DROP_THIS_NODE) {
bch_err(c, "empty btree root - repair unimplemented");
@ -541,13 +546,16 @@ static int bch2_repair_topology(struct bch_fs *c)
}
}
bch2_trans_exit(&trans);
return ret;
}
static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = { 0 };
@ -747,7 +755,7 @@ found:
}
if (level)
bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
if (c->opts.verbose) {
printbuf_reset(&buf);
@ -788,7 +796,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > atomic64_read(&c->journal.seq));
ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
if (ret)
goto err;
@ -941,7 +949,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
bch2_bkey_buf_reassemble(&cur, c, k);
bch2_btree_and_journal_iter_advance(&iter);
child = bch2_btree_node_get_noiter(c, cur.k,
child = bch2_btree_node_get_noiter(trans, cur.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
@ -1934,7 +1942,7 @@ int bch2_gc_gens(struct bch_fs *c)
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
trace_gc_gens_start(c);
trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
bch2_trans_init(&trans, c, 0, 0);
@ -1995,7 +2003,7 @@ int bch2_gc_gens(struct bch_fs *c)
c->gc_count++;
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_gc_gens_end(c);
trace_and_count(c, gc_gens_end, c);
err:
for_each_member_device(ca, c, i) {
kvfree(ca->oldest_gen);

View File

@ -1490,7 +1490,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct bio *bio;
int ret;
trace_btree_read(c, b);
trace_and_count(c, btree_node_read, c, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@ -1657,9 +1657,15 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
six_lock_read(&b->c.lock, NULL, NULL);
struct btree_trans trans;
bch2_trans_init(&trans, c, 0, 0);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
__btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
bch2_trans_exit(&trans);
}
static void btree_node_write_work(struct work_struct *work)
@ -1979,7 +1985,7 @@ do_write:
c->opts.nochanges)
goto err;
trace_btree_write(b, bytes_to_write, sectors_to_write);
trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),

View File

@ -130,443 +130,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
!btree_path_pos_after_node(path, b);
}
/* Btree node locking: */
void bch2_btree_node_unlock_write(struct btree_trans *trans,
struct btree_path *path, struct btree *b)
{
bch2_btree_node_unlock_write_inlined(trans, path, b);
}
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
struct btree_path *skip,
struct btree *b,
unsigned level)
{
struct btree_path *path;
struct six_lock_count ret = { 0, 0 };
if (IS_ERR_OR_NULL(b))
return ret;
trans_for_each_path(trans, path)
if (path != skip && path->l[level].b == b) {
ret.read += btree_node_read_locked(path, level);
ret.intent += btree_node_intent_locked(path, level);
}
return ret;
}
static inline void six_lock_readers_add(struct six_lock *lock, int nr)
{
if (!lock->readers)
atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
else
this_cpu_add(*lock->readers, nr);
}
void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
{
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read;
/*
* Must drop our read locks before calling six_lock_write() -
* six_unlock() won't do wakeups until the reader count
* goes to 0, and it's safe because we have the node intent
* locked:
*/
six_lock_readers_add(&b->c.lock, -readers);
six_lock_write(&b->c.lock, NULL, NULL);
six_lock_readers_add(&b->c.lock, readers);
}
bool __bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree *b = btree_path_node(path, level);
int want = __btree_lock_want(path, level);
if (!is_btree_node(path, level))
goto fail;
if (race_fault())
goto fail;
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) {
mark_btree_node_locked(trans, path, level, want);
return true;
}
fail:
if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) &&
b != ERR_PTR(-BCH_ERR_no_btree_node_init))
trace_btree_node_relock_fail(trans, _RET_IP_, path, level);
return false;
}
bool bch2_btree_node_upgrade(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree *b = path->l[level].b;
if (!is_btree_node(path, level))
return false;
switch (btree_lock_want(path, level)) {
case BTREE_NODE_UNLOCKED:
BUG_ON(btree_node_locked(path, level));
return true;
case BTREE_NODE_READ_LOCKED:
BUG_ON(btree_node_intent_locked(path, level));
return bch2_btree_node_relock(trans, path, level);
case BTREE_NODE_INTENT_LOCKED:
break;
}
if (btree_node_intent_locked(path, level))
return true;
if (race_fault())
return false;
if (btree_node_locked(path, level)
? six_lock_tryupgrade(&b->c.lock)
: six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
goto success;
if (btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_unlock(trans, path, level);
goto success;
}
trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level);
return false;
success:
mark_btree_node_intent_locked(trans, path, level);
return true;
}
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
bool upgrade)
{
unsigned l = path->level;
int fail_idx = -1;
do {
if (!btree_path_node(path, l))
break;
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
: bch2_btree_node_relock(trans, path, l)))
fail_idx = l;
l++;
} while (l < path->locks_want);
/*
* When we fail to get a lock, we have to ensure that any child nodes
* can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
if (fail_idx >= 0) {
__bch2_btree_path_unlock(trans, path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
do {
path->l[fail_idx].b = upgrade
? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
: ERR_PTR(-BCH_ERR_no_btree_node_relock);
--fail_idx;
} while (fail_idx >= 0);
}
if (path->uptodate == BTREE_ITER_NEED_RELOCK)
path->uptodate = BTREE_ITER_UPTODATE;
bch2_trans_verify_locks(trans);
return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
bool cached)
{
return !cached
? container_of(_b, struct btree, c)->key.k.p
: container_of(_b, struct bkey_cached, c)->key.pos;
}
/* Slowpath: */
int __bch2_btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
struct btree *b,
struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
struct btree_path *linked;
unsigned reason;
/* Check if it's safe to block: */
trans_for_each_path(trans, linked) {
if (!linked->nodes_locked)
continue;
/*
* Can't block taking an intent lock if we have _any_ nodes read
* locked:
*
* - Our read lock blocks another thread with an intent lock on
* the same node from getting a write lock, and thus from
* dropping its intent lock
*
* - And the other thread may have multiple nodes intent locked:
* both the node we want to intent lock, and the node we
* already have read locked - deadlock:
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
reason = 1;
goto deadlock;
}
if (linked->btree_id != path->btree_id) {
if (linked->btree_id < path->btree_id)
continue;
reason = 3;
goto deadlock;
}
/*
* Within the same btree, non-cached paths come before cached
* paths:
*/
if (linked->cached != path->cached) {
if (!linked->cached)
continue;
reason = 4;
goto deadlock;
}
/*
* Interior nodes must be locked before their descendants: if
* another path has possible descendants locked of the node
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
reason = 5;
goto deadlock;
}
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
linked->cached)) <= 0) {
reason = 7;
goto deadlock;
}
}
return btree_node_lock_type(trans, path, b, pos, level,
type, should_sleep_fn, p);
deadlock:
trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
}
/* Btree iterator locking: */
#ifdef CONFIG_BCACHEFS_DEBUG
static void bch2_btree_path_verify_locks(struct btree_path *path)
{
unsigned l;
if (!path->nodes_locked) {
BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
btree_path_node(path, path->level));
return;
}
for (l = 0; btree_path_node(path, l); l++)
BUG_ON(btree_lock_want(path, l) !=
btree_node_locked_type(path, l));
}
void bch2_trans_verify_locks(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
bch2_btree_path_verify_locks(path);
}
#else
static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
#endif
/* Btree path locking: */
/*
* Only for btree_cache.c - only relocks intent locks
*/
int bch2_btree_path_relock_intent(struct btree_trans *trans,
struct btree_path *path)
{
unsigned l;
for (l = path->level;
l < path->locks_want && btree_path_node(path, l);
l++) {
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(trans, path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
trace_trans_restart_relock_path_intent(trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
}
}
return 0;
}
__flatten
static bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
return btree_path_get_locks(trans, path, false);
}
static int bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
trace_trans_restart_relock_path(trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
return 0;
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
struct btree_path *linked;
EBUG_ON(path->locks_want >= new_locks_want);
path->locks_want = new_locks_want;
if (btree_path_get_locks(trans, path, true))
return true;
/*
* XXX: this is ugly - we'd prefer to not be mucking with other
* iterators in the btree_trans here.
*
* On failure to upgrade the iterator, setting iter->locks_want and
* calling get_locks() is sufficient to make bch2_btree_path_traverse()
* get the locks we want on transaction restart.
*
* But if this iterator was a clone, on transaction restart what we did
* to this iterator isn't going to be preserved.
*
* Possibly we could add an iterator field for the parent iterator when
* an iterator is a copy - for now, we'll just upgrade any other
* iterators with the same btree id.
*
* The code below used to be needed to ensure ancestor nodes get locked
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
if (!path->cached && !trans->in_traverse_all)
trans_for_each_path(trans, linked)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true);
}
return false;
}
void __bch2_btree_path_downgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
unsigned l;
EBUG_ON(path->locks_want < new_locks_want);
path->locks_want = new_locks_want;
while (path->nodes_locked &&
(l = __fls(path->nodes_locked)) >= path->locks_want) {
if (l > path->level) {
btree_node_unlock(trans, path, l);
} else {
if (btree_node_intent_locked(path, l)) {
six_lock_downgrade(&path->l[l].b->c.lock);
path->nodes_intent_locked ^= 1 << l;
}
break;
}
}
bch2_btree_path_verify_locks(path);
}
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}
/* Btree transaction locking: */
int bch2_trans_relock(struct btree_trans *trans)
{
struct btree_path *path;
if (unlikely(trans->restarted))
return -BCH_ERR_transaction_restart_relock;
trans_for_each_path(trans, path)
if (path->should_be_locked &&
bch2_btree_path_relock(trans, path, _RET_IP_)) {
trace_trans_restart_relock(trans, _RET_IP_, path);
BUG_ON(!trans->restarted);
return -BCH_ERR_transaction_restart_relock;
}
return 0;
}
void bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
__bch2_btree_path_unlock(trans, path);
/*
* bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
* btree nodes, it implements its own walking:
*/
BUG_ON(!trans->is_initial_gc &&
lock_class_is_held(&bch2_btree_node_lock_key));
}
/* Btree iterator: */
#ifdef CONFIG_BCACHEFS_DEBUG
@ -795,7 +358,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
if (cmp < 0)
continue;
if (!(path->nodes_locked & 1) ||
if (!btree_node_locked(path, 0) ||
!path->should_be_locked)
continue;
@ -1161,13 +724,13 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
struct btree_path *path;
trans_for_each_path(trans, path)
if (!path->cached &&
if (path->uptodate == BTREE_ITER_UPTODATE &&
!path->cached &&
btree_path_pos_in_node(path, b)) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
if (path->nodes_locked &&
t != BTREE_NODE_UNLOCKED) {
if (t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(trans, path, b->c.level);
six_lock_increment(&b->c.lock, t);
mark_btree_node_locked(trans, path, b->c.level, t);
@ -1232,7 +795,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
}
lock_type = __btree_lock_want(path, path->level);
ret = btree_node_lock(trans, path, b, SPOS_MAX,
ret = btree_node_lock(trans, path, &b->c, SPOS_MAX,
path->level, lock_type,
lock_root_check_fn, rootp,
trace_ip);
@ -1517,7 +1080,7 @@ err:
trans->in_traverse_all = false;
trace_trans_traverse_all(trans, trace_ip);
trace_and_count(c, trans_traverse_all, trans, trace_ip);
return ret;
}
@ -1654,7 +1217,7 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
u64 mask = ~(~0ULL << restart_probability_bits);
if ((prandom_u32() & mask) == mask) {
trace_transaction_restart_injected(trans, _RET_IP_);
trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
}
}
@ -1956,7 +1519,6 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
path->nodes_intent_locked = 0;
btree_path_list_add(trans, pos, path);
return path;
@ -2006,7 +1568,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
path->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(path->l); i++)
path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef CONFIG_BCACHEFS_DEBUG
@ -2030,10 +1591,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
*/
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want) {
path->locks_want = locks_want;
btree_path_get_locks(trans, path, true);
}
if (locks_want > path->locks_want)
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
return path;
}
@ -2166,7 +1725,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
trace_trans_restart_relock_next_node(trans, _THIS_IP_, path);
trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
goto err;
}
@ -3185,9 +2744,11 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
size_t new_top = trans->mem_top + size;
unsigned new_top = trans->mem_top + size;
void *p;
trans->mem_max = max(trans->mem_max, new_top);
if (new_top > trans->mem_bytes) {
size_t old_bytes = trans->mem_bytes;
size_t new_bytes = roundup_pow_of_two(new_top);
@ -3209,7 +2770,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
trace_trans_restart_mem_realloced(trans, _RET_IP_, new_bytes);
trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
}
@ -3325,12 +2886,10 @@ static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct b
return i;
}
void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
unsigned expected_nr_iters,
size_t expected_mem_bytes,
const char *fn)
void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn)
__acquires(&c->btree_trans_barrier)
{
struct btree_transaction_stats *s;
struct btree_trans *pos;
BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
@ -3344,7 +2903,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
bch2_trans_alloc_paths(trans, c);
if (expected_mem_bytes) {
s = btree_trans_stats(trans);
if (s) {
unsigned expected_mem_bytes = s->max_mem;
trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
@ -3395,9 +2957,13 @@ void bch2_trans_exit(struct btree_trans *trans)
{
struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
if (s)
s->max_mem = max(s->max_mem, trans->mem_max);
trans_for_each_update(trans, i)
__btree_path_put(i->path, true);
trans->nr_updates = 0;
@ -3444,12 +3010,23 @@ void bch2_trans_exit(struct btree_trans *trans)
static void __maybe_unused
bch2_btree_path_node_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b,
bool cached)
struct btree_bkey_cached_common *b)
{
struct six_lock_count c = six_lock_counts(&b->lock);
struct task_struct *owner;
pid_t pid;
rcu_read_lock();
owner = READ_ONCE(b->lock.owner);
pid = owner ? owner->pid : 0;
rcu_read_unlock();
prt_printf(out, " l=%u %s:",
b->level, bch2_btree_ids[b->btree_id]);
bch2_bpos_to_text(out, btree_node_pos(b, cached));
bch2_bpos_to_text(out, btree_node_pos(b));
prt_printf(out, " locks %u:%u:%u held by pid %u",
c.n[0], c.n[1], c.n[2], pid);
}
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
@ -3476,9 +3053,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
if (btree_node_locked(path, l) &&
!IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
prt_printf(out, " %s l=%u ",
btree_node_intent_locked(path, l) ? "i" : "r", l);
bch2_btree_path_node_to_text(out, b, path->cached);
prt_printf(out, " %c l=%u ",
lock_types[btree_node_locked_type(path, l)], l);
bch2_btree_path_node_to_text(out, b);
prt_printf(out, "\n");
}
}
@ -3496,7 +3073,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
bch2_bpos_to_text(out, trans->locking_pos);
prt_printf(out, " node ");
bch2_btree_path_node_to_text(out, b, path->cached);
bch2_btree_path_node_to_text(out, b);
prt_printf(out, "\n");
}
}
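Transaction memory is now sized from the per-call-site btree_transaction_stats high-water mark (max_mem) rather than a caller-supplied hint, so bch2_trans_kmalloc() callers look the same as before. A minimal sketch, with an illustrative helper name:

static struct bkey_i *example_trans_alloc_key(struct btree_trans *trans, unsigned u64s)
{
	/* may return an ERR_PTR carrying a transaction restart */
	struct bkey_i *k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));

	if (!IS_ERR(k))
		bkey_init(&k->k);

	/*
	 * The running high-water mark lands in trans->mem_max and is folded
	 * into btree_transaction_stats::max_mem at bch2_trans_exit() time.
	 */
	return k;
}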

View File

@ -145,12 +145,10 @@ struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
void bch2_trans_verify_locks(struct btree_trans *);
void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
struct bpos, bool);
#else
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache) {}
#endif
@ -195,20 +193,6 @@ static inline int btree_trans_restart(struct btree_trans *trans, int err)
bool bch2_btree_node_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
bool __bch2_btree_path_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
return path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want)
: path->uptodate == BTREE_ITER_UPTODATE;
}
void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
@ -367,8 +351,8 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) {
trace_trans_restart_too_many_iters(trans, _THIS_IP_);
if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}
@ -544,11 +528,10 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
unsigned, size_t, const char *);
void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *);
void bch2_trans_exit(struct btree_trans *);
#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__)
#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__)
void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
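With __bch2_trans_init() no longer taking size hints, the bch2_trans_init() wrapper keeps its four-argument form but silently drops the last two. A sketch of an unchanged call site (the function is illustrative):

static int example_use_trans(struct bch_fs *c)
{
	struct btree_trans trans;

	/* the two zeros (_nr_iters, _mem) are now ignored by the macro */
	bch2_trans_init(&trans, c, 0, 0);

	/* ... btree iteration / updates ... */

	bch2_trans_exit(&trans);
	return 0;
}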

View File

@ -13,6 +13,11 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
return id == BTREE_ID_subvolumes;
}
static struct kmem_cache *bch2_key_cache;
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@ -84,7 +89,10 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
list_move_tail(&ck->list, &bc->freed);
if (ck->c.lock.readers)
list_move_tail(&ck->list, &bc->freed_pcpu);
else
list_move_tail(&ck->list, &bc->freed_nonpcpu);
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@ -95,15 +103,51 @@ static void bkey_cached_free(struct btree_key_cache *bc,
six_unlock_intent(&ck->c.lock);
}
static void bkey_cached_free_fast(struct btree_key_cache *bc,
struct bkey_cached *ck)
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct btree_key_cache_freelist *f;
bool freed = false;
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
if (!ck->c.lock.readers) {
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr < ARRAY_SIZE(f->objs)) {
f->objs[f->nr++] = ck;
freed = true;
}
preempt_enable();
if (!freed) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (f->nr > ARRAY_SIZE(f->objs) / 2) {
struct bkey_cached *ck2 = f->objs[--f->nr];
list_move_tail(&ck2->list, &bc->freed_nonpcpu);
}
preempt_enable();
list_move_tail(&ck->list, &bc->freed_nonpcpu);
mutex_unlock(&bc->lock);
}
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
mutex_unlock(&bc->lock);
}
}
static void bkey_cached_free_fast(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
@ -114,74 +158,84 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc,
ck->k = NULL;
ck->u64s = 0;
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr < ARRAY_SIZE(f->objs)) {
f->objs[f->nr++] = ck;
freed = true;
}
preempt_enable();
if (!freed) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (f->nr > ARRAY_SIZE(f->objs) / 2) {
struct bkey_cached *ck2 = f->objs[--f->nr];
list_move_tail(&ck2->list, &bc->freed);
}
preempt_enable();
list_move_tail(&ck->list, &bc->freed);
mutex_unlock(&bc->lock);
}
bkey_cached_move_to_freelist(bc, ck);
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
static struct bkey_cached *
bkey_cached_alloc(struct btree_key_cache *c)
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = NULL;
struct btree_key_cache_freelist *f;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
preempt_disable();
f = this_cpu_ptr(c->pcpu_freed);
if (f->nr)
ck = f->objs[--f->nr];
preempt_enable();
if (!ck) {
mutex_lock(&c->lock);
if (!pcpu_readers) {
preempt_disable();
f = this_cpu_ptr(c->pcpu_freed);
while (!list_empty(&c->freed) &&
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&c->freed, struct bkey_cached, list);
list_del_init(&ck->list);
f->objs[f->nr++] = ck;
}
ck = f->nr ? f->objs[--f->nr] : NULL;
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr)
ck = f->objs[--f->nr];
preempt_enable();
mutex_unlock(&c->lock);
if (!ck) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (!list_empty(&bc->freed_nonpcpu) &&
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
f->objs[f->nr++] = ck;
}
ck = f->nr ? f->objs[--f->nr] : NULL;
preempt_enable();
mutex_unlock(&bc->lock);
}
} else {
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
}
mutex_unlock(&bc->lock);
}
if (ck) {
six_lock_intent(&ck->c.lock, NULL, NULL);
six_lock_write(&ck->c.lock, NULL, NULL);
int ret;
ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent);
if (unlikely(ret)) {
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
path->l[0].b = (void *) ck;
path->l[0].lock_seq = ck->c.lock.state.seq;
mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
if (unlikely(ret)) {
btree_node_unlock(trans, path, 0);
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
return ck;
}
ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
if (likely(ck)) {
INIT_LIST_HEAD(&ck->list);
six_lock_init(&ck->c.lock);
__six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
if (pcpu_readers)
six_lock_pcpu_alloc(&ck->c.lock);
ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
return ck;
@ -215,36 +269,36 @@ bkey_cached_reuse(struct btree_key_cache *c)
}
static struct bkey_cached *
btree_key_cache_create(struct bch_fs *c,
enum btree_id btree_id,
struct bpos pos)
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck;
bool was_new = true;
ck = bkey_cached_alloc(bc);
ck = bkey_cached_alloc(trans, path);
if (unlikely(IS_ERR(ck)))
return ck;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_ids[btree_id]);
bch2_btree_ids[path->btree_id]);
return ERR_PTR(-ENOMEM);
}
mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
was_new = false;
} else {
if (btree_id == BTREE_ID_subvolumes)
if (path->btree_id == BTREE_ID_subvolumes)
six_lock_pcpu_alloc(&ck->c.lock);
else
six_lock_pcpu_free(&ck->c.lock);
}
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
ck->key.pos = pos;
ck->c.btree_id = path->btree_id;
ck->key.btree_id = path->btree_id;
ck->key.pos = path->pos;
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
@ -256,6 +310,7 @@ btree_key_cache_create(struct bch_fs *c,
if (likely(was_new)) {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
kfree(ck);
} else {
bkey_cached_free_fast(bc, ck);
@ -291,7 +346,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
k = bch2_btree_path_peek_slot(path, &u);
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
trace_trans_restart_relock_key_cache_fill(trans, _THIS_IP_, ck_path);
trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
goto err;
}
@ -320,11 +375,12 @@ static int btree_key_cache_fill(struct btree_trans *trans,
}
}
/*
* XXX: not allowed to be holding read locks when we take a write lock,
* currently
*/
bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
if (ret) {
kfree(new_k);
goto err;
}
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
@ -372,7 +428,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck) {
ck = btree_key_cache_create(c, path->btree_id, path->pos);
ck = btree_key_cache_create(trans, path);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
@ -414,7 +470,7 @@ fill:
*/
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
trace_transaction_restart_key_cache_upgrade(trans, _THIS_IP_);
trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
}
@ -518,21 +574,21 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
} else {
struct btree_path *path2;
evict:
BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
trans_for_each_path(trans, path2)
if (path2 != c_iter.path)
__bch2_btree_path_unlock(trans, path2);
mark_btree_node_unlocked(c_iter.path, 0);
c_iter.path->l[0].b = NULL;
six_lock_write(&ck->c.lock, NULL, NULL);
bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
}
out:
@ -548,11 +604,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
struct btree_trans trans;
int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
int ret = 0;
int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
bch2_trans_init(&trans, c, 0, 0);
six_lock_read(&ck->c.lock, NULL, NULL);
btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
key = ck->key;
if (ck->journal.seq != seq ||
@ -562,12 +620,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
}
six_unlock_read(&ck->c.lock);
ret = bch2_trans_do(c, NULL, NULL, 0,
ret = commit_do(&trans, NULL, NULL, 0,
btree_key_cache_flush_pos(&trans, key, seq,
BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
bch2_trans_exit(&trans);
return ret;
}
@ -674,12 +733,29 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
list_for_each_entry_safe(ck, t, &bc->freed, list) {
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
}
if (scanned >= nr)
goto out;
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
@ -767,7 +843,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
bkey_cached_evict(bc, ck);
list_add(&ck->list, &bc->freed);
list_add(&ck->list, &bc->freed_nonpcpu);
}
rcu_read_unlock();
@ -777,11 +853,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < f->nr; i++) {
ck = f->objs[i];
list_add(&ck->list, &bc->freed);
list_add(&ck->list, &bc->freed_nonpcpu);
}
}
list_for_each_entry_safe(ck, n, &bc->freed, list) {
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) {
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
@ -789,6 +867,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
list_del(&ck->list);
kfree(ck->k);
six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
@ -808,7 +887,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
INIT_LIST_HEAD(&c->freed_pcpu);
INIT_LIST_HEAD(&c->freed_nonpcpu);
}
static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)

libbcachefs/btree_locking.c (new file, 466 lines)
View File

@ -0,0 +1,466 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_locking.h"
#include "btree_types.h"
struct lock_class_key bch2_btree_node_lock_key;
/* Btree node locking: */
static inline void six_lock_readers_add(struct six_lock *lock, int nr)
{
if (lock->readers)
this_cpu_add(*lock->readers, nr);
else if (nr > 0)
atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
else
atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
}
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
struct btree_path *skip,
struct btree_bkey_cached_common *b,
unsigned level)
{
struct btree_path *path;
struct six_lock_count ret;
memset(&ret, 0, sizeof(ret));
if (IS_ERR_OR_NULL(b))
return ret;
trans_for_each_path(trans, path)
if (path != skip && &path->l[level].b->c == b) {
int t = btree_node_locked_type(path, level);
if (t != BTREE_NODE_UNLOCKED)
ret.n[t]++;
}
return ret;
}
/* unlock */
void bch2_btree_node_unlock_write(struct btree_trans *trans,
struct btree_path *path, struct btree *b)
{
bch2_btree_node_unlock_write_inlined(trans, path, b);
}
/* lock */
void __bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_bkey_cached_common *b)
{
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
/*
* Must drop our read locks before calling six_lock_write() -
* six_unlock() won't do wakeups until the reader count
* goes to 0, and it's safe because we have the node intent
* locked:
*/
six_lock_readers_add(&b->lock, -readers);
btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
six_lock_readers_add(&b->lock, readers);
}
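/*
 * Illustrative sketch, not bcachefs code: why the caller's own read
 * count is subtracted before blocking on the write lock above. In this
 * toy model a writer may only proceed once the shared reader count
 * reaches zero, so waiting with our own reads still counted would be a
 * self-deadlock.
 */
#include <stdio.h>

static int total_readers;	/* toy shared state: readers across all threads */

static int write_ready(void) { return total_readers == 0; }

int main(void)
{
	int my_readers = 2;

	total_readers = my_readers;		/* only we hold read locks */

	printf("naive wait: ready=%d\n", write_ready());	/* 0: would wait on ourselves */

	total_readers -= my_readers;		/* drop our own count first */
	printf("after drop: ready=%d\n", write_ready());	/* 1: can take the write lock */

	total_readers += my_readers;		/* restored once the write lock is held */
	return 0;
}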
static inline bool path_has_read_locks(struct btree_path *path)
{
unsigned l;
for (l = 0; l < BTREE_MAX_DEPTH; l++)
if (btree_node_read_locked(path, l))
return true;
return false;
}
/* Slowpath: */
int __bch2_btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b,
struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
struct btree_path *linked;
unsigned reason;
/* Check if it's safe to block: */
trans_for_each_path(trans, linked) {
if (!linked->nodes_locked)
continue;
/*
* Can't block taking an intent lock if we have _any_ nodes read
* locked:
*
* - Our read lock blocks another thread with an intent lock on
* the same node from getting a write lock, and thus from
* dropping its intent lock
*
* - And the other thread may have multiple nodes intent locked:
* both the node we want to intent lock, and the node we
* already have read locked - deadlock:
*/
if (type == SIX_LOCK_intent &&
path_has_read_locks(linked)) {
reason = 1;
goto deadlock;
}
if (linked->btree_id != path->btree_id) {
if (linked->btree_id < path->btree_id)
continue;
reason = 3;
goto deadlock;
}
/*
* Within the same btree, non-cached paths come before cached
* paths:
*/
if (linked->cached != path->cached) {
if (!linked->cached)
continue;
reason = 4;
goto deadlock;
}
/*
* Interior nodes must be locked before their descendants: if
* another path has possible descendants locked of the node
* we're about to lock, it must have the ancestors locked too:
*/
if (level > btree_path_highest_level_locked(linked)) {
reason = 5;
goto deadlock;
}
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c)) <= 0) {
reason = 7;
goto deadlock;
}
}
return btree_node_lock_type(trans, path, b, pos, level,
type, should_sleep_fn, p);
deadlock:
trace_and_count(trans->c, trans_restart_would_deadlock, trans, ip, reason, linked, path, &pos);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
}
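/*
 * Illustrative sketch, not bcachefs code: the ordering rules checked
 * above, reduced to a single predicate over plain integers. The
 * read-vs-intent rule is omitted; struct and function names are made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct lock_pos {
	unsigned		btree_id;
	bool			cached;
	unsigned		level;	/* for @held: highest level locked */
	unsigned long long	pos;	/* stand-in for struct bpos */
};

/* May we block taking @want while already holding @held? */
static bool lock_order_ok(const struct lock_pos *held, const struct lock_pos *want)
{
	if (held->btree_id != want->btree_id)
		return held->btree_id < want->btree_id;	/* lower btree ids first */
	if (held->cached != want->cached)
		return !held->cached;			/* non-cached before cached */
	if (want->level > held->level)
		return false;				/* ancestors (higher levels) first */
	return want->pos > held->pos;			/* ascending key order */
}

int main(void)
{
	struct lock_pos held = { .btree_id = 1, .cached = false, .level = 1, .pos = 100 };
	struct lock_pos want = { .btree_id = 1, .cached = false, .level = 0, .pos = 150 };

	printf("ok to block: %d\n", lock_order_ok(&held, &want));	/* 1 */
	return 0;
}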
/* relock */
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
bool upgrade)
{
unsigned l = path->level;
int fail_idx = -1;
do {
if (!btree_path_node(path, l))
break;
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
: bch2_btree_node_relock(trans, path, l)))
fail_idx = l;
l++;
} while (l < path->locks_want);
/*
* When we fail to get a lock, we have to ensure that any child nodes
* can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
if (fail_idx >= 0) {
__bch2_btree_path_unlock(trans, path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
do {
path->l[fail_idx].b = upgrade
? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
: ERR_PTR(-BCH_ERR_no_btree_node_relock);
--fail_idx;
} while (fail_idx >= 0);
}
if (path->uptodate == BTREE_ITER_NEED_RELOCK)
path->uptodate = BTREE_ITER_UPTODATE;
bch2_trans_verify_locks(trans);
return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
bool __bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree *b = btree_path_node(path, level);
int want = __btree_lock_want(path, level);
if (race_fault())
goto fail;
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, &b->c, level, want))) {
mark_btree_node_locked(trans, path, level, want);
return true;
}
fail:
trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
return false;
}
/* upgrade */
bool bch2_btree_node_upgrade(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree *b = path->l[level].b;
if (!is_btree_node(path, level))
return false;
switch (btree_lock_want(path, level)) {
case BTREE_NODE_UNLOCKED:
BUG_ON(btree_node_locked(path, level));
return true;
case BTREE_NODE_READ_LOCKED:
BUG_ON(btree_node_intent_locked(path, level));
return bch2_btree_node_relock(trans, path, level);
case BTREE_NODE_INTENT_LOCKED:
break;
case BTREE_NODE_WRITE_LOCKED:
BUG();
}
if (btree_node_intent_locked(path, level))
return true;
if (race_fault())
return false;
if (btree_node_locked(path, level)
? six_lock_tryupgrade(&b->c.lock)
: six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
goto success;
if (btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_unlock(trans, path, level);
goto success;
}
trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
return false;
success:
mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
return true;
}
/* Btree path locking: */
/*
* Only for btree_cache.c - only relocks intent locks
*/
int bch2_btree_path_relock_intent(struct btree_trans *trans,
struct btree_path *path)
{
unsigned l;
for (l = path->level;
l < path->locks_want && btree_path_node(path, l);
l++) {
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(trans, path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
}
}
return 0;
}
__flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
return btree_path_get_locks(trans, path, false);
}
__flatten
bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
return btree_path_get_locks(trans, path, true);
}
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
EBUG_ON(path->locks_want >= new_locks_want);
path->locks_want = new_locks_want;
return btree_path_get_locks(trans, path, true);
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
struct btree_path *linked;
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
return true;
/*
* XXX: this is ugly - we'd prefer to not be mucking with other
* iterators in the btree_trans here.
*
* On failure to upgrade the iterator, setting iter->locks_want and
* calling get_locks() is sufficient to make bch2_btree_path_traverse()
* get the locks we want on transaction restart.
*
* But if this iterator was a clone, on transaction restart what we did
* to this iterator isn't going to be preserved.
*
* Possibly we could add an iterator field for the parent iterator when
* an iterator is a copy - for now, we'll just upgrade any other
* iterators with the same btree id.
*
* The code below used to be needed to ensure ancestor nodes get locked
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
if (!path->cached && !trans->in_traverse_all)
trans_for_each_path(trans, linked)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true);
}
return false;
}
void __bch2_btree_path_downgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
unsigned l;
EBUG_ON(path->locks_want < new_locks_want);
path->locks_want = new_locks_want;
while (path->nodes_locked &&
(l = btree_path_highest_level_locked(path)) >= path->locks_want) {
if (l > path->level) {
btree_node_unlock(trans, path, l);
} else {
if (btree_node_intent_locked(path, l)) {
six_lock_downgrade(&path->l[l].b->c.lock);
mark_btree_node_locked_noreset(path, l, SIX_LOCK_read);
}
break;
}
}
bch2_btree_path_verify_locks(path);
}
/* Btree transaction locking: */
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
{
struct btree_path *path;
if (unlikely(trans->restarted))
return - ((int) trans->restarted);
trans_for_each_path(trans, path)
if (path->should_be_locked &&
!bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
}
void bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
__bch2_btree_path_unlock(trans, path);
/*
* bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
* btree nodes, it implements its own walking:
*/
BUG_ON(!trans->is_initial_gc &&
lock_class_is_held(&bch2_btree_node_lock_key));
}
/* Debug */
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_path_verify_locks(struct btree_path *path)
{
unsigned l;
if (!path->nodes_locked) {
BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
btree_path_node(path, path->level));
return;
}
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
int have = btree_node_locked_type(path, l);
BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
BUG_ON(is_btree_node(path, l) &&
(want == BTREE_NODE_UNLOCKED ||
have != BTREE_NODE_WRITE_LOCKED) &&
want != have);
}
}
void bch2_trans_verify_locks(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
bch2_btree_path_verify_locks(path);
}
#endif

View File

@ -14,68 +14,71 @@
#include "btree_iter.h"
extern struct lock_class_key bch2_btree_node_lock_key;
static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
}
static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
{
return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
? &trans->c->btree_transaction_stats[trans->fn_idx]
: NULL;
}
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = -1,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
};
static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
/*
* We're relying on the fact that if nodes_intent_locked is set
* nodes_locked must be set as well, so that we can compute without
* branches:
*/
return BTREE_NODE_UNLOCKED +
((path->nodes_locked >> level) & 1) +
((path->nodes_intent_locked >> level) & 1);
return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
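/*
 * Illustrative sketch, not bcachefs code: the reworked path->nodes_locked
 * read above packs one of four states (unlocked/read/intent/write) per
 * level into two bits, replacing the old one-bit-per-level nodes_locked
 * and nodes_intent_locked masks. Helper names below are made up.
 */
#include <stdio.h>

enum locked_type { T_UNLOCKED = -1, T_READ = 0, T_INTENT = 1, T_WRITE = 2 };

static unsigned set_locked(unsigned word, unsigned level, enum locked_type t)
{
	word &= ~(3U << (level << 1));			/* clear the level's two bits */
	word |= (unsigned) (t + 1) << (level << 1);	/* store type + 1, 0 = unlocked */
	return word;
}

static int get_locked(unsigned word, unsigned level)
{
	return T_UNLOCKED + ((word >> (level << 1)) & 3);
}

int main(void)
{
	unsigned nodes_locked = 0;

	nodes_locked = set_locked(nodes_locked, 0, T_INTENT);
	nodes_locked = set_locked(nodes_locked, 2, T_READ);

	printf("%d %d %d\n",
	       get_locked(nodes_locked, 0),	/*  1: intent   */
	       get_locked(nodes_locked, 1),	/* -1: unlocked */
	       get_locked(nodes_locked, 2));	/*  0: read     */
	return 0;
}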
static inline bool btree_node_intent_locked(struct btree_path *path,
unsigned level)
static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
}
static inline bool btree_node_read_locked(struct btree_path *path,
unsigned level)
static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
{
return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
}
static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
{
return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
}
static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
return path->nodes_locked & (1 << level);
return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
}
static inline void mark_btree_node_unlocked(struct btree_path *path,
unsigned level)
{
path->nodes_locked &= ~(1 << level);
path->nodes_intent_locked &= ~(1 << level);
}
static inline void mark_btree_node_locked_noreset(struct btree_trans *trans,
struct btree_path *path,
unsigned level,
enum six_lock_type type)
static inline void mark_btree_node_locked_noreset(struct btree_path *path,
unsigned level,
enum btree_node_locked_type type)
{
/* relying on this to avoid a branch */
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx);
path->nodes_locked &= ~(3U << (level << 1));
path->nodes_locked |= (type + 1) << (level << 1);
}
path->nodes_locked |= 1 << level;
path->nodes_intent_locked |= type << level;
static inline void mark_btree_node_unlocked(struct btree_path *path,
unsigned level)
{
EBUG_ON(btree_node_write_locked(path, level));
mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}
static inline void mark_btree_node_locked(struct btree_trans *trans,
@ -83,19 +86,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans,
unsigned level,
enum six_lock_type type)
{
mark_btree_node_locked_noreset(trans, path, level, type);
mark_btree_node_locked_noreset(path, level, type);
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
path->l[level].lock_taken_time = ktime_get_ns();
#endif
}
static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level)
{
mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent);
}
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
{
return level < path->locks_want
@ -115,13 +111,6 @@ btree_lock_want(struct btree_path *path, int level)
return BTREE_NODE_UNLOCKED;
}
static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
{
return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
? &trans->c->btree_transaction_stats[trans->fn_idx]
: NULL;
}
static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
@ -135,6 +124,8 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
#endif
}
/* unlock: */
static inline void btree_node_unlock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
@ -149,122 +140,23 @@ static inline void btree_node_unlock(struct btree_trans *trans,
mark_btree_node_unlocked(path, level);
}
static inline int btree_path_lowest_level_locked(struct btree_path *path)
{
return __ffs(path->nodes_locked) >> 1;
}
static inline int btree_path_highest_level_locked(struct btree_path *path)
{
return __fls(path->nodes_locked) >> 1;
}
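/*
 * Illustrative sketch, not bcachefs code: with two bits per level,
 * dividing the index of the lowest/highest set bit by two recovers the
 * lowest/highest locked level, which is what the __ffs()/__fls() >> 1
 * helpers above rely on. The GCC/Clang builtins stand in for the kernel
 * bit helpers here.
 */
#include <stdio.h>

static unsigned lowest_bit(unsigned x)  { return __builtin_ctz(x); }
static unsigned highest_bit(unsigned x) { return 31 - __builtin_clz(x); }

int main(void)
{
	/* two bits per level: level 1 read (01), level 3 intent (10) */
	unsigned nodes_locked = (1U << (1 << 1)) | (2U << (3 << 1));

	printf("lowest locked level:  %u\n", lowest_bit(nodes_locked) >> 1);	/* 1 */
	printf("highest locked level: %u\n", highest_bit(nodes_locked) >> 1);	/* 3 */
	return 0;
}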
static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
struct btree_path *path)
{
btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
while (path->nodes_locked)
btree_node_unlock(trans, path, __ffs(path->nodes_locked));
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
{
switch (type) {
case SIX_LOCK_read:
return BCH_TIME_btree_lock_contended_read;
case SIX_LOCK_intent:
return BCH_TIME_btree_lock_contended_intent;
case SIX_LOCK_write:
return BCH_TIME_btree_lock_contended_write;
default:
BUG();
}
}
static inline int btree_node_lock_type(struct btree_trans *trans,
struct btree_path *path,
struct btree *b,
struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
struct bch_fs *c = trans->c;
u64 start_time;
int ret;
if (six_trylock_type(&b->c.lock, type))
return 0;
start_time = local_clock();
trans->locking_path_idx = path->idx;
trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
trans->locking_lock_type = type;
trans->locking = &b->c;
ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p);
trans->locking = NULL;
if (ret)
return ret;
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
return 0;
}
/*
* Lock a btree node if we already have it locked on one of our linked
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_trans *trans,
struct btree *b, unsigned level,
enum btree_node_locked_type want)
{
struct btree_path *path;
trans_for_each_path(trans, path)
if (path->l[level].b == b &&
btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->c.lock, want);
return true;
}
return false;
}
int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
struct btree *, struct bpos, unsigned,
enum six_lock_type,
six_lock_should_sleep_fn, void *,
unsigned long);
static inline int btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
struct btree *b, struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
if (likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
!(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type,
should_sleep_fn, p, ip))) {
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
path->l[b->c.level].lock_taken_time = ktime_get_ns();
#endif
}
return ret;
}
bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
static inline bool bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
btree_node_locked_type(path, level) !=
__btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
__bch2_btree_node_relock(trans, path, level);
btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
}
/*
@ -279,6 +171,9 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
trans_for_each_path_with_node(trans, b, linked)
linked->l[b->c.level].lock_seq += 2;
@ -289,20 +184,181 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
void bch2_btree_node_unlock_write(struct btree_trans *,
struct btree_path *, struct btree *);
void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
/* lock: */
static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
static inline int __must_check
btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
EBUG_ON(!btree_node_intent_locked(path, b->c.level));
six_lock_type(&b->lock, type, NULL, NULL);
return 0;
}
if (unlikely(!six_trylock_write(&b->c.lock)))
static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
int ret = btree_node_lock_nopath(trans, b, type);
BUG_ON(ret);
}
static inline int btree_node_lock_type(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b,
struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
int ret;
if (six_trylock_type(&b->lock, type))
return 0;
trans->locking_path_idx = path->idx;
trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
trans->locking_lock_type = type;
trans->locking = b;
ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
trans->locking = NULL;
return ret;
}
/*
* Lock a btree node if we already have it locked on one of our linked
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
unsigned level,
enum btree_node_locked_type want)
{
struct btree_path *path;
trans_for_each_path(trans, path)
if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->lock, want);
return true;
}
return false;
}
int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
struct btree_bkey_cached_common *,
struct bpos, unsigned,
enum six_lock_type,
six_lock_should_sleep_fn, void *,
unsigned long);
static inline int btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b,
struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
!(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type,
should_sleep_fn, p, ip))) {
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
path->l[b->level].lock_taken_time = ktime_get_ns();
#endif
}
return ret;
}
void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
EBUG_ON(&path->l[b->level].b->c != b);
EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
EBUG_ON(!btree_node_intent_locked(path, b->level));
/*
* six locks are unfair, and read locks block while a thread wants a
* write lock: thus, we need to tell the cycle detector we have a write
* lock _before_ taking the lock:
*/
mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
if (unlikely(!six_trylock_write(&b->lock)))
__bch2_btree_node_lock_write(trans, b);
}
static inline int __must_check
bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
bch2_btree_node_lock_write_nofail(trans, path, b);
return 0;
}
/* relock: */
bool bch2_btree_path_relock_norestart(struct btree_trans *,
struct btree_path *, unsigned long);
bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
static inline bool bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
!btree_node_write_locked(path, level) &&
btree_node_locked_type(path, level) != __btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
(!IS_ERR_OR_NULL(path->l[level].b) &&
__bch2_btree_node_relock(trans, path, level));
}
static inline int bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
return 0;
}
/* upgrade */
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned);
bool __bch2_btree_path_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
return path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want)
: path->uptodate == BTREE_ITER_UPTODATE;
}
/* misc: */
static inline void btree_path_set_should_be_locked(struct btree_path *path)
{
EBUG_ON(!btree_node_locked(path, path->level));
@ -326,7 +382,20 @@ static inline void btree_path_set_level_up(struct btree_trans *trans,
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
/* debug */
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
struct btree_path *, struct btree *, unsigned);
struct btree_path *,
struct btree_bkey_cached_common *b,
unsigned);
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_path_verify_locks(struct btree_path *);
void bch2_trans_verify_locks(struct btree_trans *);
#else
static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
#endif
#endif /* _BCACHEFS_BTREE_LOCKING_H */

View File

@ -63,6 +63,7 @@ struct btree_bkey_cached_common {
struct six_lock lock;
u8 level;
u8 btree_id;
bool cached;
};
struct btree {
@ -232,9 +233,8 @@ struct btree_path {
*/
bool should_be_locked:1;
unsigned level:3,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
locks_want:4;
u8 nodes_locked;
struct btree_path_level {
struct btree *b;
@ -302,7 +302,8 @@ struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
struct shrinker shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
@ -338,6 +339,13 @@ struct bkey_cached {
struct bkey_i *k;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
{
return !b->cached
? container_of(b, struct btree, c)->key.k.p
: container_of(b, struct bkey_cached, c)->key.pos;
}
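/*
 * Illustrative sketch, not bcachefs code: the new "cached" flag lets
 * btree_node_pos() above recover the containing object with
 * container_of() and pick the right key field. Struct names below are
 * made up.
 */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct common { int cached; };

struct node_obj   { long key; struct common c; };
struct cached_obj { long pos; struct common c; };

static long obj_pos(struct common *c)
{
	return !c->cached
		? container_of(c, struct node_obj, c)->key
		: container_of(c, struct cached_obj, c)->pos;
}

int main(void)
{
	struct node_obj   n = { .key = 42, .c = { .cached = 0 } };
	struct cached_obj k = { .pos = 7,  .c = { .cached = 1 } };

	printf("%ld %ld\n", obj_pos(&n.c), obj_pos(&k.c));	/* 42 7 */
	return 0;
}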
struct btree_insert_entry {
unsigned flags;
u8 bkey_type;
@ -413,6 +421,7 @@ struct btree_trans {
u64 paths_allocated;
unsigned mem_top;
unsigned mem_max;
unsigned mem_bytes;
void *mem;

View File

@ -143,7 +143,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
trace_btree_node_free(c, b);
trace_and_count(c, btree_node_free, c, b);
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
@ -160,22 +160,23 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
trans_for_each_path(trans, path)
BUG_ON(path->l[b->c.level].b == b &&
path->l[b->c.level].lock_seq == b->c.lock.state.seq);
six_lock_write(&b->c.lock, NULL, NULL);
unsigned level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_hash_remove(&c->btree_cache, b);
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
trans_for_each_path(trans, path)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
}
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
@ -258,7 +259,9 @@ mem_alloc:
return b;
}
static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
struct btree_trans *trans,
unsigned level)
{
struct bch_fs *c = as->c;
struct btree *b;
@ -270,8 +273,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b = p->b[--p->nr];
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
set_btree_node_accessed(b);
set_btree_node_dirty_acct(c, b);
@ -304,7 +307,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
trace_btree_node_alloc(c, b);
trace_and_count(c, btree_node_alloc, c, b);
return b;
}
@ -322,12 +325,13 @@ static void btree_set_max(struct btree *b, struct bpos pos)
}
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
struct btree_trans *trans,
struct btree *b,
struct bkey_format format)
{
struct btree *n;
n = bch2_btree_node_alloc(as, b->c.level);
n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
@ -346,6 +350,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
struct btree_trans *trans,
struct btree *b)
{
struct bkey_format new_f = bch2_btree_calc_format(b);
@ -357,12 +362,13 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
if (!bch2_btree_node_format_fits(as->c, b, &new_f))
new_f = b->format;
return __bch2_btree_node_alloc_replacement(as, b, new_f);
return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
}
static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
static struct btree *__btree_root_alloc(struct btree_update *as,
struct btree_trans *trans, unsigned level)
{
struct btree *b = bch2_btree_node_alloc(as, level);
struct btree *b = bch2_btree_node_alloc(as, trans, level);
btree_set_min(b, POS_MIN);
btree_set_max(b, SPOS_MAX);
@ -377,7 +383,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
return b;
}
static void bch2_btree_reserve_put(struct btree_update *as)
static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p;
@ -404,8 +410,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
mutex_unlock(&c->btree_reserve_cache_lock);
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@ -459,7 +465,7 @@ err:
/* Asynchronous interior node update machinery */
static void bch2_btree_update_free(struct btree_update *as)
static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
@ -472,7 +478,7 @@ static void bch2_btree_update_free(struct btree_update *as)
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
bch2_btree_reserve_put(as);
bch2_btree_reserve_put(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);
@ -550,12 +556,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
struct btree *b = as->b;
struct btree *b;
struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
bch2_trans_init(&trans, c, 0, 512);
/*
* If we're already in an error state, it might be because a btree node
* was never written, and we might be trying to free that same btree
@ -572,15 +579,16 @@ static void btree_update_nodes_written(struct btree_update *as)
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
struct btree *old = as->old_nodes[i];
__le64 seq;
six_lock_read(&old->c.lock, NULL, NULL);
seq = old->data ? old->data->keys.seq : 0;
six_unlock_read(&old->c.lock);
b = as->old_nodes[i];
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
seq = b->data ? b->data->keys.seq : 0;
six_unlock_read(&b->c.lock);
if (seq == as->old_nodes_seq[i])
wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@ -597,19 +605,19 @@ static void btree_update_nodes_written(struct btree_update *as)
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
bch2_trans_init(&trans, c, 0, 512);
ret = commit_do(&trans, &as->disk_res, &journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
JOURNAL_WATERMARK_reserved,
btree_update_nodes_written_trans(&trans, as));
bch2_trans_exit(&trans);
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
JOURNAL_WATERMARK_reserved,
btree_update_nodes_written_trans(&trans, as));
bch2_trans_unlock(&trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"error %i in btree_update_nodes_written()", ret);
err:
if (b) {
if (as->b) {
b = as->b;
/*
* @b is the node we did the final insert into:
*
@ -622,8 +630,8 @@ err:
* we're in journal error state:
*/
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_write);
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->write_blocked_list);
@ -680,7 +688,7 @@ err:
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
six_lock_read(&b->c.lock, NULL, NULL);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@ -688,7 +696,8 @@ err:
for (i = 0; i < as->nr_open_buckets; i++)
bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
bch2_btree_update_free(as);
bch2_btree_update_free(as, &trans);
bch2_trans_exit(&trans);
}
static void btree_interior_update_work(struct work_struct *work)
@ -935,7 +944,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
as->nr_old_nodes++;
}
static void bch2_btree_update_done(struct btree_update *as)
static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
@ -946,7 +955,7 @@ static void bch2_btree_update_done(struct btree_update *as)
up_read(&as->c->gc_lock);
as->took_gc_lock = false;
bch2_btree_reserve_put(as);
bch2_btree_reserve_put(as, trans);
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
@ -994,7 +1003,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
nr_nodes[1] += 1;
if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
trace_trans_restart_iter_upgrade(trans, _RET_IP_, path);
trace_and_count(c, trans_restart_iter_upgrade, trans, _RET_IP_, path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
return ERR_PTR(ret);
}
@ -1048,11 +1057,16 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret) {
bch2_trans_unlock(trans);
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags);
if (ret) {
trace_trans_restart_journal_preres_get(trans, _RET_IP_);
trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
goto err;
}
@ -1085,8 +1099,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
}
if (ret) {
trace_btree_reserve_get_fail(trans->fn, _RET_IP_,
nr_nodes[0] + nr_nodes[1]);
trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]);
goto err;
}
@ -1097,7 +1110,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
bch2_trans_verify_not_restarted(trans, restart_count);
return as;
err:
bch2_btree_update_free(as);
bch2_btree_update_free(as, trans);
return ERR_PTR(ret);
}
@ -1141,7 +1154,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
trace_btree_set_root(c, b);
trace_and_count(c, btree_node_set_root, c, b);
BUG_ON(!b->written);
old = btree_node_root(c, b);
@ -1150,7 +1163,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* Ensure no one is using the old root while we switch to the
* new root:
*/
bch2_btree_node_lock_write(trans, path, old);
bch2_btree_node_lock_write_nofail(trans, path, &old->c);
bch2_btree_set_root_inmem(c, b);
@ -1249,6 +1262,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
* node)
*/
static struct btree *__btree_split_node(struct btree_update *as,
struct btree_trans *trans,
struct btree *n1)
{
struct bkey_format_state s;
@ -1258,7 +1272,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
struct bpos n1_pos;
n2 = bch2_btree_node_alloc(as, n1->c.level);
n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
@ -1422,15 +1436,15 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_interior_update_will_free_node(as, b);
n1 = bch2_btree_node_alloc_replacement(as, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
if (keys)
btree_split_insert_keys(as, trans, path, n1, keys);
if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_split(c, b);
trace_and_count(c, btree_node_split, c, b);
n2 = __btree_split_node(as, n1);
n2 = __btree_split_node(as, trans, n1);
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
@ -1452,7 +1466,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (!parent) {
/* Depth increases, make a new root */
n3 = __btree_root_alloc(as, b->c.level + 1);
n3 = __btree_root_alloc(as, trans, b->c.level + 1);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@ -1462,7 +1476,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
}
} else {
trace_btree_compact(c, b);
trace_and_count(c, btree_node_compact, c, b);
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->c.lock);
@ -1493,22 +1507,19 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (n3)
bch2_btree_update_get_open_buckets(as, n3);
/* Successful split, update the path to point to the new nodes: */
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
if (n3)
bch2_trans_node_add(trans, n3);
if (n2)
bch2_trans_node_add(trans, n2);
bch2_trans_node_add(trans, n1);
/*
* The old node must be freed (in memory) _before_ unlocking the new
* nodes - else another thread could re-acquire a read lock on the old
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
bch2_btree_node_free_inmem(trans, b);
bch2_btree_node_free_inmem(trans, path, b);
if (n3)
bch2_trans_node_add(trans, n3);
if (n2)
bch2_trans_node_add(trans, n2);
bch2_trans_node_add(trans, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
@ -1617,7 +1628,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return PTR_ERR(as);
btree_split(as, trans, path, b, NULL, flags);
bch2_btree_update_done(as);
bch2_btree_update_done(as, trans);
for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
@ -1731,12 +1742,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
trace_btree_merge(c, b);
trace_and_count(c, btree_node_merge, c, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
n = bch2_btree_node_alloc(as, b->c.level);
n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data,
max(BTREE_NODE_SEQ(b->data),
@ -1771,19 +1782,16 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
six_lock_increment(&m->c.lock, SIX_LOCK_intent);
bch2_btree_node_free_inmem(trans, path, b);
bch2_btree_node_free_inmem(trans, sib_path, m);
bch2_trans_node_add(trans, n);
bch2_trans_verify_paths(trans);
bch2_btree_node_free_inmem(trans, b);
bch2_btree_node_free_inmem(trans, m);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
bch2_btree_update_done(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
@ -1817,13 +1825,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_interior_update_will_free_node(as, b);
n = bch2_btree_node_alloc_replacement(as, b);
n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_update_add_new_node(as, n);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->c.lock);
trace_btree_rewrite(c, b);
trace_and_count(c, btree_node_rewrite, c, b);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
@ -1837,12 +1845,12 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
bch2_btree_node_free_inmem(trans, iter->path, b);
bch2_trans_node_add(trans, n);
bch2_btree_node_free_inmem(trans, b);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
bch2_btree_update_done(as, trans);
out:
bch2_btree_path_downgrade(trans, iter->path);
return ret;
@ -1989,7 +1997,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (ret)
goto err;
bch2_btree_node_lock_write(trans, iter->path, b);
bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);

View File

@ -117,6 +117,7 @@ struct btree_update {
};
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree_trans *,
struct btree *,
struct bkey_format);

View File

@ -81,7 +81,7 @@ void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
{
bch2_btree_node_lock_write(trans, path, b);
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_prep_for_write(trans, path, b);
}
@ -169,10 +169,13 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
struct btree_trans trans;
unsigned long old, new, v;
unsigned idx = w - b->writes;
six_lock_read(&b->c.lock, NULL, NULL);
bch2_trans_init(&trans, c, 0, 0);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
v = READ_ONCE(b->flags);
do {
@ -188,6 +191,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
bch2_trans_exit(&trans);
return 0;
}
@ -285,7 +290,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
ret = bch2_trans_relock(trans);
if (ret) {
trace_trans_restart_journal_preres_get(trans, trace_ip);
trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
return ret;
}
@ -375,7 +380,7 @@ btree_key_can_insert_cached(struct btree_trans *trans,
* Keys returned by peek() are no longer valid pointers, so we need a
* transaction restart:
*/
trace_trans_restart_key_cache_key_realloced(trans, _RET_IP_, path, old_u64s, new_u64s);
trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s);
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced);
}
@ -567,7 +572,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
int ret;
if (race_fault()) {
trace_trans_restart_fault_inject(trans, trace_ip);
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
}
@ -741,11 +746,12 @@ static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_
static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
{
struct btree *b = path_l(path)->b;
unsigned l;
do {
if (path->nodes_locked &&
path->nodes_locked != path->nodes_intent_locked)
path_upgrade_readers(trans, path);
for (l = 0; l < BTREE_MAX_DEPTH; l++)
if (btree_node_read_locked(path, l))
path_upgrade_readers(trans, path);
} while ((path = prev_btree_path(trans, path)) &&
path_l(path)->b == b);
}
@ -764,11 +770,13 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans)
? trans->paths + trans->sorted[i + 1]
: NULL;
if (path->nodes_locked) {
if (path->nodes_intent_locked)
nr_intent++;
else
nr_read++;
switch (btree_node_locked_type(path, path->level)) {
case BTREE_NODE_READ_LOCKED:
nr_read++;
break;
case BTREE_NODE_INTENT_LOCKED:
nr_intent++;
break;
}
if (!next || path_l(path)->b != path_l(next)->b) {
@ -791,7 +799,7 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct
//if (path == pos)
// break;
if (path->nodes_locked != path->nodes_intent_locked &&
if (btree_node_read_locked(path, path->level) &&
!bch2_btree_path_upgrade(trans, path, path->level + 1))
return true;
}
@ -808,12 +816,19 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (same_leaf_as_prev(trans, i))
continue;
/*
* six locks are unfair, and read locks block while a thread
* wants a write lock: thus, we need to tell the cycle detector
* we have a write lock _before_ taking the lock:
*/
mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write);
if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
if (have_conflicting_read_lock(trans, i->path))
goto fail;
ret = btree_node_lock_type(trans, i->path,
insert_l(i)->b,
&insert_l(i)->b->c,
i->path->pos, i->level,
SIX_LOCK_write, NULL, NULL);
BUG_ON(ret);
@ -824,6 +839,8 @@ static inline int trans_lock_write(struct btree_trans *trans)
return 0;
fail:
mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_intent);
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
@ -831,7 +848,7 @@ fail:
bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
}
trace_trans_restart_would_deadlock_write(trans);
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
@ -964,7 +981,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_BTREE_NODE_FULL:
ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
trace_trans_restart_btree_node_split(trans, trace_ip, i->path);
trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
break;
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
@ -975,7 +992,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
ret = bch2_trans_relock(trans);
if (ret)
trace_trans_restart_mark_replicas(trans, trace_ip);
trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
break;
case BTREE_INSERT_NEED_JOURNAL_RES:
bch2_trans_unlock(trans);
@ -992,12 +1009,12 @@ int bch2_trans_commit_error(struct btree_trans *trans,
ret = bch2_trans_relock(trans);
if (ret)
trace_trans_restart_journal_res_get(trans, trace_ip);
trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
break;
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
trace_trans_blocked_journal_reclaim(trans, trace_ip);
trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
@ -1006,7 +1023,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
ret = bch2_trans_relock(trans);
if (ret)
trace_trans_restart_journal_reclaim(trans, trace_ip);
trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
break;
default:
BUG_ON(ret >= 0);
@ -1107,7 +1124,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
BUG_ON(!i->path->should_be_locked);
if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
trace_trans_restart_upgrade(trans, _RET_IP_, i->path);
trace_and_count(c, trans_restart_upgrade, trans, _RET_IP_, i->path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
goto out;
}
@ -1148,7 +1165,7 @@ retry:
if (ret)
goto err;
trace_transaction_commit(trans, _RET_IP_);
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
@ -1617,7 +1634,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
ck = (void *) iter->key_cache_path->l[0].b;
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
trace_trans_restart_key_cache_raced(trans, _RET_IP_);
trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}

View File

@ -231,9 +231,12 @@ static int bch2_data_update_index_update(struct bch_write_op *op)
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
atomic_long_inc(&c->extent_migrate_done);
if (ec_ob)
bch2_ob_add_backpointer(c, ec_ob, &insert->k);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
trace_move_extent_finish(&new->k);
}
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -248,22 +251,16 @@ next:
}
continue;
nomatch:
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, old);
bch_info(c, "no match for %s", buf.buf);
printbuf_exit(&buf);
}
if (m->ctxt) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
}
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size);
trace_move_extent_race(&new->k);
bch2_btree_iter_advance(&iter);
goto next;
}


@ -666,6 +666,9 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
mutex_lock(&s->lock);
prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
prt_newline(&i->buf);
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
prt_printf(&i->buf, "Lock hold times:");
prt_newline(&i->buf);


@ -384,32 +384,34 @@ inval:
prt_printf(out, "invalid label %u", v);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int v = -1;
int ret = 0;
mutex_lock(&c->sb_lock);
int ret, v = -1;
if (!strlen(name) || !strcmp(name, "none"))
goto write_sb;
return 0;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
if (v < 0) {
mutex_unlock(&c->sb_lock);
if (v < 0)
return v;
}
ret = bch2_sb_disk_groups_to_cpu(c);
if (ret)
goto unlock;
write_sb:
return ret;
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
return 0;
}
bch2_write_super(c);
unlock:
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
int ret;
mutex_lock(&c->sb_lock);
ret = __bch2_dev_group_set(c, ca, name) ?:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return ret;
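/*
 * Editor's sketch, not part of this diff: the split above lets a caller
 * that already holds c->sb_lock (such as device add, later in this commit)
 * set the member's group and write the superblock itself. The helper name
 * below is illustrative only:
 */
static int set_label_locked(struct bch_fs *c, struct bch_dev *ca,
			    const char *label)
{
	lockdep_assert_held(&c->sb_lock);

	return __bch2_dev_group_set(c, ca, label) ?:
		bch2_write_super(c);
}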


@ -82,6 +82,7 @@ void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,


@ -772,9 +772,6 @@ static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
bch_err(trans->c, "hash_redo_key() not implemented yet");
return -EINVAL;
#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
@ -792,8 +789,14 @@ static int hash_redo_key(struct btree_trans *trans,
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
#endif
bch2_hash_set_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
}
static int hash_check_key(struct btree_trans *trans,


@ -1387,7 +1387,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
trace_promote(&rbio->bio);
trace_and_count(op->write.op.c, read_promote, &rbio->bio);
/* we now own pages: */
BUG_ON(!rbio->bounce);
@ -1653,7 +1653,7 @@ static void bch2_rbio_retry(struct work_struct *work)
};
struct bch_io_failures failed = { .nr = 0 };
trace_read_retry(&rbio->bio);
trace_and_count(c, read_retry, &rbio->bio);
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
@ -1909,7 +1909,7 @@ static void bch2_read_endio(struct bio *bio)
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
ptr_stale(ca, &rbio->pick.ptr)) {
atomic_long_inc(&c->read_realloc_races);
trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
@ -2197,7 +2197,7 @@ get_bio:
rbio->bio.bi_end_io = bch2_read_endio;
if (rbio->bounce)
trace_read_bounce(&rbio->bio);
trace_and_count(c, read_bounce, &rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
@ -2212,7 +2212,7 @@ get_bio:
if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
trace_and_count(c, read_split, &orig->bio);
}
if (!rbio->pick.idx) {


@ -391,12 +391,12 @@ retry:
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight)
trace_journal_entry_full(c);
trace_and_count(c, journal_entry_full, c);
unlock:
if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
trace_journal_full(c);
trace_and_count(c, journal_full, c);
}
can_discard = j->can_discard;


@ -1551,7 +1551,7 @@ static void do_journal_write(struct closure *cl)
bch2_bio_map(bio, w->data, sectors << 9);
trace_journal_write(bio);
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] =


@ -641,7 +641,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
trace_journal_reclaim_start(c, direct, kicked,
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
@ -657,7 +658,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
j->nr_direct_reclaim += nr_flushed;
else
j->nr_background_reclaim += nr_flushed;
trace_journal_reclaim_finish(c, nr_flushed);
trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
if (nr_flushed)
wake_up(&j->reclaim_wait);


@ -252,8 +252,8 @@ static int bch2_move_extent(struct btree_trans *trans,
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
trace_move_extent(k.k);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
trace_move_extent_read(k.k);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
@ -275,7 +275,7 @@ err_free:
kfree(io);
err:
percpu_ref_put(&c->writes);
trace_move_alloc_mem_fail(k.k);
trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}


@ -165,7 +165,7 @@ static int bch2_copygc(struct bch_fs *c)
if (ret < 0)
bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
return ret;
}
@ -221,7 +221,7 @@ static int bch2_copygc_thread(void *arg)
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
trace_copygc_wait(c, wait, last + wait);
trace_and_count(c, copygc_wait, c, wait, last + wait);
c->copygc_wait = last + wait;
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);


@ -239,29 +239,26 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
struct bkey_i *insert, int flags)
int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
int flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
SPOS(inum.inum,
SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(inum.inum, U64_MAX),
POS(insert->k.p.inode, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!inum.subvol || is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@ -303,6 +300,26 @@ not_found:
goto out;
}
static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
struct bkey_i *insert, int flags)
{
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
snapshot, insert, flags, 0);
}
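/*
 * Editor's sketch, not part of this diff: callers that already know the
 * snapshot ID (fsck, for example hash_redo_key() earlier in this commit)
 * can skip the subvolume lookup and call bch2_hash_set_snapshot()
 * directly. The wrapper name below is illustrative only:
 */
static inline int hash_set_in_snapshot(struct btree_trans *trans,
				       const struct bch_hash_desc desc,
				       const struct bch_hash_info *info,
				       u32 snapshot, struct bkey_i *insert)
{
	return bch2_hash_set_snapshot(trans, desc, info,
				      (subvol_inum) { 0, insert->k.p.inode },
				      snapshot, insert,
				      BCH_HASH_SET_MUST_CREATE, 0);
}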
static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,


@ -796,7 +796,7 @@ int bch2_write_super(struct bch_fs *c)
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
int ret = 0;
trace_write_super(c, _RET_IP_);
trace_and_count(c, write_super, c, _RET_IP_);
if (c->opts.very_degraded)
degraded_flags |= BCH_FORCE_IF_LOST;


@ -1530,6 +1530,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
ret = bch2_read_super(path, &opts, &sb);
@ -1540,6 +1541,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
if (label.allocation_failure) {
ret = -ENOMEM;
goto err;
}
}
err = bch2_dev_may_add(sb.sb, c);
if (err) {
bch_err(c, "device add error: %s", err);
@ -1620,6 +1629,14 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
if (ret) {
bch_err(c, "device add error: error setting label");
goto err_unlock;
}
}
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -1652,6 +1669,7 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
printbuf_exit(&label);
printbuf_exit(&errbuf);
return ret;
err_late:


@ -190,11 +190,6 @@ read_attribute(internal_uuid);
read_attribute(has_data);
read_attribute(alloc_debug);
read_attribute(read_realloc_races);
read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
read_attribute(bucket_alloc_fail);
#define x(t, n, ...) read_attribute(t);
BCH_PERSISTENT_COUNTERS()
#undef x
@ -378,15 +373,6 @@ SHOW(bch2_fs)
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
sysfs_print(extent_migrate_done,
atomic_long_read(&c->extent_migrate_done));
sysfs_print(extent_migrate_raced,
atomic_long_read(&c->extent_migrate_raced));
sysfs_print(bucket_alloc_fail,
atomic_long_read(&c->bucket_alloc_fail));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
if (attr == &sysfs_gc_gens_pos)
@ -625,11 +611,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_invalidates,
&sysfs_prune_cache,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,
&sysfs_bucket_alloc_fail,
&sysfs_gc_gens_pos,
&sysfs_copy_gc_enabled,


@ -19,6 +19,8 @@
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
#define six_release(l) lock_release(l, _RET_IP_)
static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
u64 lock_val;
@ -65,14 +67,15 @@ struct six_lock_vals {
}
static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
union six_lock_state old)
union six_lock_state old,
struct task_struct *owner)
{
if (type != SIX_LOCK_intent)
return;
if (!old.intent_lock) {
EBUG_ON(lock->owner);
lock->owner = current;
lock->owner = owner;
} else {
EBUG_ON(lock->owner != current);
}
@ -88,64 +91,21 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
return read_count;
}
struct six_lock_waiter {
struct list_head list;
struct task_struct *task;
};
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
static inline void six_lock_wakeup(struct six_lock *lock,
union six_lock_state state,
unsigned waitlist_id)
{
if (waitlist_id == SIX_LOCK_write) {
if (state.write_locking && !state.read_lock) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
}
} else {
struct list_head *wait_list = &lock->wait_list[waitlist_id];
struct six_lock_waiter *w, *next;
if (!(state.waiters & (1 << waitlist_id)))
return;
clear_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
raw_spin_lock(&lock->wait_lock);
list_for_each_entry_safe(w, next, wait_list, list) {
list_del_init(&w->list);
if (wake_up_process(w->task) &&
waitlist_id != SIX_LOCK_read) {
if (!list_empty(wait_list))
set_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
break;
}
}
raw_spin_unlock(&lock->wait_lock);
}
}
static __always_inline bool do_six_trylock_type(struct six_lock *lock,
enum six_lock_type type,
bool try)
static int __do_six_trylock_type(struct six_lock *lock,
enum six_lock_type type,
struct task_struct *task,
bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
bool ret;
int ret;
u64 v;
EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
/*
@ -176,18 +136,6 @@ retry:
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
/*
* If we failed because a writer was trying to take the
* lock, issue a wakeup because we might have caused a
* spurious trylock failure:
*/
if (old.write_locking) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
}
/*
* If we failed from the lock path and the waiting bit wasn't
* set, set it:
@ -208,6 +156,14 @@ retry:
} while ((v = atomic64_cmpxchg(&lock->state.counter,
old.v, new.v)) != old.v);
}
/*
* If we failed because a writer was trying to take the
* lock, issue a wakeup because we might have caused a
* spurious trylock failure:
*/
if (old.write_locking)
ret = -1 - SIX_LOCK_write;
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic64_add(__SIX_VAL(write_locking, 1),
@ -227,9 +183,13 @@ retry:
if (ret || try)
v -= __SIX_VAL(write_locking, 1);
if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write)))
v += __SIX_VAL(waiters, 1 << SIX_LOCK_write);
if (try && !ret) {
old.v = atomic64_add_return(v, &lock->state.counter);
six_lock_wakeup(lock, old, SIX_LOCK_read);
if (old.waiters & (1 << SIX_LOCK_read))
ret = -1 - SIX_LOCK_read;
} else {
atomic64_add(v, &lock->state.counter);
}
@ -243,8 +203,7 @@ retry:
if (type == SIX_LOCK_write)
new.write_locking = 0;
} else if (!try && type != SIX_LOCK_write &&
!(new.waiters & (1 << type)))
} else if (!try && !(new.waiters & (1 << type)))
new.waiters |= 1 << type;
else
break; /* waiting bit already set */
@ -256,14 +215,84 @@ retry:
EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
}
if (ret)
six_set_owner(lock, type, old);
if (ret > 0)
six_set_owner(lock, type, old, task);
EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
return ret;
}
static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
{
struct six_lock_waiter *w, *next;
struct task_struct *task;
bool saw_one;
int ret;
again:
ret = 0;
saw_one = false;
raw_spin_lock(&lock->wait_lock);
list_for_each_entry_safe(w, next, &lock->wait_list, list) {
if (w->lock_want != lock_type)
continue;
if (saw_one && lock_type != SIX_LOCK_read)
goto unlock;
saw_one = true;
ret = __do_six_trylock_type(lock, lock_type, w->task, false);
if (ret <= 0)
goto unlock;
__list_del(w->list.prev, w->list.next);
task = w->task;
/*
* Do no writes to @w besides setting lock_acquired - otherwise
* we would need a memory barrier:
*/
barrier();
w->lock_acquired = true;
wake_up_process(task);
}
clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
unlock:
raw_spin_unlock(&lock->wait_lock);
if (ret < 0) {
lock_type = -ret - 1;
goto again;
}
}
static inline void six_lock_wakeup(struct six_lock *lock,
union six_lock_state state,
enum six_lock_type lock_type)
{
if (lock_type == SIX_LOCK_write && state.read_lock)
return;
if (!(state.waiters & (1 << lock_type)))
return;
__six_lock_wakeup(lock, lock_type);
}
static bool do_six_trylock_type(struct six_lock *lock,
enum six_lock_type type,
bool try)
{
int ret;
ret = __do_six_trylock_type(lock, type, current, try);
if (ret < 0)
__six_lock_wakeup(lock, -ret - 1);
return ret > 0;
}
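/*
 * Editor's note, not part of this diff: the trylock helpers above use a
 * three-way return convention - ret > 0 means the lock was taken, ret == 0
 * means plain failure, and ret < 0 means failure plus "wait list (-ret - 1)
 * may need a wakeup" because the attempt could have caused a spurious
 * trylock failure there. A compact restatement of the decode step:
 */
static inline enum six_lock_type six_trylock_failed_wakeup(int ret)
{
	/* only meaningful when ret < 0, i.e. failed-but-wake-someone */
	return -ret - 1;
}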
__always_inline __flatten
static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
@ -304,12 +333,8 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
* Similar to the lock path, we may have caused a spurious write
* lock fail and need to issue a wakeup:
*/
if (old.write_locking) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
}
if (old.write_locking)
six_lock_wakeup(lock, old, SIX_LOCK_write);
if (ret)
six_acquire(&lock->dep_map, 1);
@ -327,7 +352,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
old.v,
old.v + l[type].lock_val)) != old.v);
six_set_owner(lock, type, old);
six_set_owner(lock, type, old, current);
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 1);
return true;
@ -335,33 +360,26 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
static inline int six_can_spin_on_owner(struct six_lock *lock)
static inline bool six_optimistic_spin(struct six_lock *lock,
struct six_lock_waiter *wait)
{
struct task_struct *owner;
int retval = 1;
struct task_struct *owner, *task = current;
if (need_resched())
return 0;
switch (wait->lock_want) {
case SIX_LOCK_read:
break;
case SIX_LOCK_intent:
if (lock->wait_list.next != &wait->list)
return false;
break;
case SIX_LOCK_write:
return false;
}
rcu_read_lock();
owner = READ_ONCE(lock->owner);
if (owner)
retval = owner->on_cpu;
rcu_read_unlock();
/*
* if lock->owner is not set, the mutex owner may have just acquired
* it and not set the owner yet or the mutex has been released.
*/
return retval;
}
static inline bool six_spin_on_owner(struct six_lock *lock,
struct task_struct *owner)
{
bool ret = true;
rcu_read_lock();
while (lock->owner == owner) {
while (owner && lock->owner == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking lock->owner still matches owner. If that fails,
@ -370,85 +388,27 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
*/
barrier();
if (!owner->on_cpu || need_resched()) {
ret = false;
/*
* If we're an RT task that will live-lock because we won't let
* the owner complete.
*/
if (wait->lock_acquired ||
!owner->on_cpu ||
rt_task(task) ||
need_resched())
break;
}
cpu_relax();
}
rcu_read_unlock();
return ret;
}
static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
{
struct task_struct *task = current;
if (type == SIX_LOCK_write)
return false;
preempt_disable();
if (!six_can_spin_on_owner(lock))
goto fail;
if (!osq_lock(&lock->osq))
goto fail;
while (1) {
struct task_struct *owner;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
owner = READ_ONCE(lock->owner);
if (owner && !six_spin_on_owner(lock, owner))
break;
if (do_six_trylock_type(lock, type, false)) {
osq_unlock(&lock->osq);
preempt_enable();
return true;
}
/*
* When there's no owner, we might have preempted between the
* owner acquiring the lock and setting the owner field. If
* we're an RT task that will live-lock because we won't let
* the owner complete.
*/
if (!owner && (need_resched() || rt_task(task)))
break;
/*
* The cpu_relax() call is a compiler barrier which forces
* everything in this loop to be re-loaded. We don't need
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
cpu_relax();
}
osq_unlock(&lock->osq);
fail:
preempt_enable();
/*
* If we fell out of the spin path because of need_resched(),
* reschedule now, before we try-lock again. This avoids getting
* scheduled out right after we obtained the lock.
*/
if (need_resched())
schedule();
return false;
return wait->lock_acquired;
}
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
static inline bool six_optimistic_spin(struct six_lock *lock,
struct six_lock_waiter *wait)
{
return false;
}
@ -457,10 +417,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
noinline
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
struct six_lock_waiter *wait,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
union six_lock_state old;
struct six_lock_waiter wait;
int ret = 0;
if (type == SIX_LOCK_write) {
@ -469,46 +429,58 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
smp_mb__after_atomic();
}
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
goto out_before_sleep;
if (six_optimistic_spin(lock, type))
goto out_before_sleep;
lock_contended(&lock->dep_map, _RET_IP_);
INIT_LIST_HEAD(&wait.list);
wait.task = current;
wait->task = current;
wait->lock_want = type;
wait->lock_acquired = false;
raw_spin_lock(&lock->wait_lock);
/*
* Retry taking the lock after taking waitlist lock, have raced with an
* unlock:
*/
ret = __do_six_trylock_type(lock, type, current, false);
if (ret <= 0)
list_add_tail(&wait->list, &lock->wait_list);
raw_spin_unlock(&lock->wait_lock);
if (unlikely(ret > 0)) {
ret = 0;
goto out;
}
if (unlikely(ret < 0)) {
__six_lock_wakeup(lock, -ret - 1);
ret = 0;
}
if (six_optimistic_spin(lock, wait))
goto out;
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (type == SIX_LOCK_write)
EBUG_ON(lock->owner != current);
else if (list_empty_careful(&wait.list)) {
raw_spin_lock(&lock->wait_lock);
list_add_tail(&wait.list, &lock->wait_list[type]);
raw_spin_unlock(&lock->wait_lock);
}
if (do_six_trylock_type(lock, type, false))
if (wait->lock_acquired)
break;
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
if (unlikely(ret)) {
raw_spin_lock(&lock->wait_lock);
if (!wait->lock_acquired)
list_del(&wait->list);
raw_spin_unlock(&lock->wait_lock);
if (wait->lock_acquired)
do_six_unlock_type(lock, type);
break;
}
schedule();
}
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.list)) {
raw_spin_lock(&lock->wait_lock);
list_del_init(&wait.list);
raw_spin_unlock(&lock->wait_lock);
}
out_before_sleep:
out:
if (ret && type == SIX_LOCK_write) {
old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
&lock->state.counter);
@ -518,9 +490,10 @@ out_before_sleep:
return ret;
}
__always_inline
static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
__always_inline __flatten
static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
struct six_lock_waiter *wait,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
int ret;
@ -528,7 +501,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
six_acquire(&lock->dep_map, 0);
ret = do_six_trylock_type(lock, type, true) ? 0
: __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
: __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p);
if (ret && type != SIX_LOCK_write)
six_release(&lock->dep_map);
@ -538,28 +511,23 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
return ret;
}
__always_inline
static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
struct six_lock_waiter wait;
return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p);
}
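/*
 * Editor's sketch, not part of this diff: with the new *_waiter variants
 * the six_lock_waiter lives in the caller's frame, so code walking the
 * wait list can see the blocked task, the lock type it wants and whether
 * it has been granted. Illustrative caller, assuming the
 * six_lock_waiter_intent() wrapper generated by __SIX_LOCK(intent):
 */
static int example_lock_intent(struct six_lock *lock)
{
	struct six_lock_waiter wait;

	return six_lock_waiter_intent(lock, &wait, NULL, NULL);
}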
__always_inline __flatten
static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
EBUG_ON(type == SIX_LOCK_write &&
!(lock->state.v & __SIX_LOCK_HELD_intent));
if (type != SIX_LOCK_write)
six_release(&lock->dep_map);
if (type == SIX_LOCK_intent) {
EBUG_ON(lock->owner != current);
if (lock->intent_lock_recurse) {
--lock->intent_lock_recurse;
return;
}
if (type == SIX_LOCK_intent)
lock->owner = NULL;
}
if (type == SIX_LOCK_read &&
lock->readers) {
@ -576,6 +544,27 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
__always_inline __flatten
static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
EBUG_ON(type == SIX_LOCK_write &&
!(lock->state.v & __SIX_LOCK_HELD_intent));
EBUG_ON((type == SIX_LOCK_write ||
type == SIX_LOCK_intent) &&
lock->owner != current);
if (type != SIX_LOCK_write)
six_release(&lock->dep_map);
if (type == SIX_LOCK_intent &&
lock->intent_lock_recurse) {
--lock->intent_lock_recurse;
return;
}
do_six_unlock_type(lock, type);
}
#define __SIX_LOCK(type) \
bool six_trylock_##type(struct six_lock *lock) \
{ \
@ -596,6 +585,14 @@ int six_lock_##type(struct six_lock *lock, \
} \
EXPORT_SYMBOL_GPL(six_lock_##type); \
\
int six_lock_waiter_##type(struct six_lock *lock, \
struct six_lock_waiter *wait, \
six_lock_should_sleep_fn should_sleep_fn, void *p)\
{ \
return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\
} \
EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \
\
void six_unlock_##type(struct six_lock *lock) \
{ \
__six_unlock_type(lock, SIX_LOCK_##type); \
@ -639,7 +636,7 @@ bool six_lock_tryupgrade(struct six_lock *lock)
if (lock->readers)
this_cpu_dec(*lock->readers);
six_set_owner(lock, SIX_LOCK_intent, old);
six_set_owner(lock, SIX_LOCK_intent, old, current);
return true;
}
@ -701,44 +698,12 @@ void six_lock_wakeup_all(struct six_lock *lock)
struct six_lock_waiter *w;
raw_spin_lock(&lock->wait_lock);
list_for_each_entry(w, &lock->wait_list[0], list)
list_for_each_entry(w, &lock->wait_list, list)
wake_up_process(w->task);
list_for_each_entry(w, &lock->wait_list[1], list)
wake_up_process(w->task);
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
struct free_pcpu_rcu {
struct rcu_head rcu;
void __percpu *p;
};
static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
{
struct free_pcpu_rcu *rcu =
container_of(_rcu, struct free_pcpu_rcu, rcu);
free_percpu(rcu->p);
kfree(rcu);
}
void six_lock_pcpu_free_rcu(struct six_lock *lock)
{
struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
if (!rcu)
return;
rcu->p = lock->readers;
lock->readers = NULL;
call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
}
EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
@ -763,15 +728,19 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
*/
struct six_lock_count six_lock_counts(struct six_lock *lock)
{
struct six_lock_count ret = { 0, lock->state.intent_lock };
struct six_lock_count ret;
ret.n[SIX_LOCK_read] = 0;
ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse;
ret.n[SIX_LOCK_write] = lock->state.seq & 1;
if (!lock->readers)
ret.read += lock->state.read_lock;
ret.n[SIX_LOCK_read] += lock->state.read_lock;
else {
int cpu;
for_each_possible_cpu(cpu)
ret.read += *per_cpu_ptr(lock->readers, cpu);
ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu);
}
return ret;