Update bcachefs sources to 4837f82ee1 bcachefs: Use cached iterators for alloc btree

Author: Kent Overstreet 2020-06-13 19:31:45 -04:00
parent c32bba1325
commit 05408b6f8f
47 changed files with 1997 additions and 887 deletions
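"Cached iterators" here refers to the btree key cache introduced by this update: the alloc btree can now be read and updated through in-memory cached keys (BTREE_ITER_CACHED, bch2_btree_iter_peek_cached) instead of always walking btree nodes. As rough orientation before the per-file hunks, a lookup through a cached iterator looks roughly like the sketch below. This is not code from this commit; the helper name is invented, it assumes the usual bcachefs internal headers, and it only combines calls and flags that appear in the hunks that follow.

/*
 * Orientation sketch only; not code from this commit. The helper name is
 * invented, the calls and flags are the ones used elsewhere in this diff.
 */
static int peek_alloc_key_cached(struct bch_fs *c, struct bpos pos)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	/* BTREE_ITER_CACHED points the iterator at the btree key cache: */
	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, pos,
				   BTREE_ITER_CACHED|BTREE_ITER_INTENT);

	k = bch2_btree_iter_peek_cached(iter);
	ret = bkey_err(k);
	if (!ret) {
		/* k is the in-memory cached copy of the alloc key at pos */
	}

	bch2_trans_exit(&trans);
	return ret;
}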

View File

@ -1 +1 @@
c9b4a210f946889f56654dda24dd8ced3b1aac24
4837f82ee1a0eaa2fe7c161c940f672c9e52afb6

include/linux/semaphore.h (new file, 43 lines)
View File

@ -0,0 +1,43 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
* Please see kernel/locking/semaphore.c for documentation of these functions
*/
#ifndef __LINUX_SEMAPHORE_H
#define __LINUX_SEMAPHORE_H
#include <linux/list.h>
#include <linux/spinlock.h>
/* Please don't access any members of this structure directly */
struct semaphore {
raw_spinlock_t lock;
unsigned int count;
struct list_head wait_list;
};
#define __SEMAPHORE_INITIALIZER(name, n) \
{ \
.lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \
.count = n, \
.wait_list = LIST_HEAD_INIT((name).wait_list), \
}
#define DEFINE_SEMAPHORE(name) \
struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
static inline void sema_init(struct semaphore *sem, int val)
{
*sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
}
extern void down(struct semaphore *sem);
extern int __must_check down_interruptible(struct semaphore *sem);
extern int __must_check down_killable(struct semaphore *sem);
extern int __must_check down_trylock(struct semaphore *sem);
extern int __must_check down_timeout(struct semaphore *sem, long);
extern void up(struct semaphore *sem);
#endif /* __LINUX_SEMAPHORE_H */
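This header is added because struct bch_fs later in this diff gains a `struct semaphore io_in_flight` member; the IO-path code that uses it is not shown in this section. The following is only a generic usage sketch of the API declared above, with an arbitrary function name and limit:

/* Generic usage of the semaphore API above; the name and the limit of 16 are arbitrary. */
static struct semaphore example_sem;

static void example_semaphore_usage(void)
{
	sema_init(&example_sem, 16);	/* allow up to 16 concurrent holders */

	down(&example_sem);		/* take a slot, sleeping if none are free */
	/* ... throttled work ... */
	up(&example_sem);		/* release the slot */

	if (!down_trylock(&example_sem)) {	/* returns 0 on success, nonzero if it would block */
		/* got a slot without sleeping */
		up(&example_sem);
	}
}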

View File

@ -115,6 +115,8 @@ struct six_lock {
#endif
};
typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
static __always_inline void __six_lock_init(struct six_lock *lock,
const char *name,
struct lock_class_key *key)
@ -141,7 +143,7 @@ do { \
#define __SIX_LOCK(type) \
bool six_trylock_##type(struct six_lock *); \
bool six_relock_##type(struct six_lock *, u32); \
void six_lock_##type(struct six_lock *); \
int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
void six_unlock_##type(struct six_lock *);
__SIX_LOCK(read)
@ -167,14 +169,15 @@ static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type ty
}
static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
unsigned seq)
unsigned seq)
{
SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
}
static inline void six_lock_type(struct six_lock *lock, enum six_lock_type type)
static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
SIX_LOCK_DISPATCH(type, six_lock, lock);
SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
}
static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
@ -189,4 +192,6 @@ bool six_trylock_convert(struct six_lock *, enum six_lock_type,
void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
#endif /* _LINUX_SIX_H */
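The six-lock API change above is what the rest of this commit builds on: the blocking lock functions now take a six_lock_should_sleep_fn and return an int, so a caller can bail out instead of sleeping when the object it wanted to lock has changed underneath it. The sketch below shows the new calling convention, modelled on lock_node_check_fn() and lock_root_check_fn() further down in this diff; the two function names here are invented.

/*
 * Sketch of the new six_lock_*() convention; not code from this commit.
 * The callback runs when the caller would otherwise sleep on the lock;
 * returning nonzero aborts the attempt and that value is returned.
 */
static int example_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);

	/* give up if this is no longer the node the caller expected */
	return b == (struct btree *) p ? 0 : -1;
}

static int example_lock_node_read(struct btree *b)
{
	/* passing NULL, NULL keeps the old always-block behaviour */
	return six_lock_read(&b->c.lock, example_check_fn, b);
}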

View File

@ -27,6 +27,7 @@ typedef unsigned gfp_t;
#define GFP_NOFS 0
#define GFP_NOIO 0
#define GFP_NOWAIT 0
#define __GFP_FS 0
#define __GFP_IO 0
#define __GFP_NOWARN 0
#define __GFP_NORETRY 0

View File

@ -144,8 +144,8 @@ DECLARE_EVENT_CLASS(btree_node,
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->level = b->level;
__entry->id = b->btree_id;
__entry->level = b->c.level;
__entry->id = b->c.btree_id;
__entry->inode = b->key.k.p.inode;
__entry->offset = b->key.k.p.offset;
),
@ -262,7 +262,7 @@ TRACE_EVENT(btree_insert_key,
),
TP_fast_assign(
__entry->id = b->btree_id;
__entry->id = b->c.btree_id;
__entry->inode = k->k.p.inode;
__entry->offset = k->k.p.offset;
__entry->size = k->k.size;

View File

@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
@ -276,6 +277,13 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bkey_i_alloc *a;
int ret;
retry:
bch2_trans_begin(trans);
ret = bch2_btree_key_cache_flush(trans,
BTREE_ID_ALLOC, iter->pos);
if (ret)
goto err;
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@ -330,7 +338,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@ -364,25 +372,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
return ret < 0 ? ret : 0;
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter;
int ret;
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_alloc_write_key(&trans, iter,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
bch2_trans_exit(&trans);
return ret < 0 ? ret : 0;
}
/* Bucket IO clocks: */
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
@ -840,7 +829,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
struct bkey_s_c k;
bool invalidating_cached_data;
size_t b;
int ret = 0;
@ -860,12 +848,22 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
g = bucket(ca, b);
m = READ_ONCE(g->mark);
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
invalidating_cached_data = m.cached_sectors != 0;
/*
* If we're not invalidating cached data, we only increment the bucket
* gen in memory here, the incremented gen will be updated in the btree
* by bch2_trans_mark_pointer():
*/
if (!invalidating_cached_data)
bch2_invalidate_bucket(c, ca, b, &m);
else
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
invalidating_cached_data = m.cached_sectors != 0;
if (!invalidating_cached_data)
goto out;
@ -882,23 +880,18 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
/*
* The allocator has to start before journal replay is finished - thus,
* we have to trust the in memory bucket @m, not the version in the
* btree:
*/
percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);
invalidating_cached_data = m.cached_sectors != 0;
invalidating_cached_data = u.cached_sectors != 0;
u.gen++;
u.data_type = 0;
@ -968,31 +961,6 @@ out:
return ret < 0 ? ret : 0;
}
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket, u64 *flush_seq)
{
struct bucket_mark m;
percpu_down_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
bch2_invalidate_bucket(c, ca, bucket, &m);
verify_not_on_freelist(c, ca, bucket);
BUG_ON(!fifo_push(&ca->free_inc, bucket));
spin_unlock(&c->freelist_lock);
bucket_io_clock_reset(c, ca, bucket, READ);
bucket_io_clock_reset(c, ca, bucket, WRITE);
percpu_up_read(&c->mark_lock);
*flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
return m.cached_sectors != 0;
}
/*
* Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
*/
@ -1007,7 +975,9 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
@ -1448,216 +1418,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
}
static bool flush_held_btree_writes(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool nodes_unwritten;
size_t i;
again:
cond_resched();
nodes_unwritten = false;
if (bch2_journal_error(&c->journal))
return true;
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_need_write(b)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
nodes_unwritten = true;
}
}
rcu_read_unlock();
return !nodes_unwritten &&
!bch2_btree_interior_updates_nr_pending(c);
}
static void allocator_start_issue_discards(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned dev_iter;
size_t bu;
for_each_rw_member(ca, c, dev_iter)
while (fifo_pop(&ca->free_inc, bu))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca, bu),
ca->mi.bucket_size, GFP_NOIO, 0);
}
static int resize_free_inc(struct bch_dev *ca)
{
alloc_fifo free_inc;
if (!fifo_full(&ca->free_inc))
return 0;
if (!init_fifo(&free_inc,
ca->free_inc.size * 2,
GFP_KERNEL))
return -ENOMEM;
fifo_move(&free_inc, &ca->free_inc);
swap(free_inc, ca->free_inc);
free_fifo(&free_inc);
return 0;
}
static bool bch2_fs_allocator_start_fast(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned dev_iter;
bool ret = true;
if (test_alloc_startup(c))
return false;
down_read(&c->gc_lock);
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
struct bucket_array *buckets;
struct bucket_mark m;
long bu;
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for (bu = buckets->first_bucket;
bu < buckets->nbuckets; bu++) {
m = READ_ONCE(buckets->b[bu].mark);
if (!buckets->b[bu].gen_valid ||
!is_available_bucket(m) ||
m.cached_sectors ||
(ca->buckets_nouse &&
test_bit(bu, ca->buckets_nouse)))
continue;
percpu_down_read(&c->mark_lock);
bch2_mark_alloc_bucket(c, ca, bu, true,
gc_pos_alloc(c, NULL), 0);
percpu_up_read(&c->mark_lock);
fifo_push(&ca->free_inc, bu);
discard_invalidated_buckets(c, ca);
if (fifo_full(&ca->free[RESERVE_BTREE]))
break;
}
up_read(&ca->bucket_lock);
}
up_read(&c->gc_lock);
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
if (!fifo_full(&ca->free[RESERVE_BTREE]))
ret = false;
return ret;
}
int bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned dev_iter;
u64 journal_seq = 0;
bool wrote;
long bu;
int ret = 0;
if (!test_alloc_startup(c) &&
bch2_fs_allocator_start_fast(c))
return 0;
pr_debug("not enough empty buckets; scanning for reclaimable buckets");
/*
* We're moving buckets to freelists _before_ they've been marked as
* invalidated on disk - we have to so that we can allocate new btree
* nodes to mark them as invalidated on disk.
*
* However, we can't _write_ to any of these buckets yet - they might
* have cached data in them, which is live until they're marked as
* invalidated on disk:
*/
set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
down_read(&c->gc_lock);
do {
wrote = false;
for_each_rw_member(ca, c, dev_iter) {
find_reclaimable_buckets(c, ca);
while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
(bu = next_alloc_bucket(ca)) >= 0) {
ret = resize_free_inc(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
up_read(&c->gc_lock);
goto err;
}
bch2_invalidate_one_bucket(c, ca, bu,
&journal_seq);
fifo_push(&ca->free[RESERVE_BTREE], bu);
}
}
pr_debug("done scanning for reclaimable buckets");
/*
* XXX: it's possible for this to deadlock waiting on journal reclaim,
* since we're holding btree writes. What then?
*/
ret = bch2_alloc_write(c,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_NOWAIT, &wrote);
/*
* If bch2_alloc_write() did anything, it may have used some
* buckets, and we need the RESERVE_BTREE freelist full - so we
* need to loop and scan again.
* And if it errored, it may have been because there weren't
* enough buckets, so just scan and loop again as long as it
* made some progress:
*/
} while (wrote);
up_read(&c->gc_lock);
if (ret)
goto err;
pr_debug("flushing journal");
ret = bch2_journal_flush(&c->journal);
if (ret)
goto err;
pr_debug("issuing discards");
allocator_start_issue_discards(c);
err:
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
closure_wait_event(&c->btree_interior_update_wait,
flush_held_btree_writes(c));
return ret;
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
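Taken together, the alloc_background.c changes above move the allocator onto the key cache: bucket invalidation now uses a BTREE_ITER_CACHED iterator, bch2_alloc_write_key() flushes the cached key back to the btree before rewriting it, and the old allocator-startup machinery (bch2_fs_allocator_start() and its helpers) is deleted. A simplified view of the new flush-then-write pattern follows; apart from the bch2_* calls, the names are invented and this is not the real function.

/*
 * Simplified sketch of the pattern bch2_alloc_write_key() now follows;
 * only the bch2_* calls are taken from this diff.
 */
static int write_one_alloc_key(struct btree_trans *trans, struct btree_iter *iter)
{
	struct bkey_s_c k;
	int ret;

	/* push any dirty cached copy of this key into the btree first: */
	ret = bch2_btree_key_cache_flush(trans, BTREE_ID_ALLOC, iter->pos);
	if (ret)
		return ret;

	/* now the slot iterator sees the current value and can rewrite it: */
	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	/*
	 * ... repack the alloc key from the in-memory bucket and update it,
	 * as bch2_alloc_write_key() does above ...
	 */
	return 0;
}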

View File

@ -54,7 +54,6 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
struct journal_keys;
int bch2_alloc_read(struct bch_fs *, struct journal_keys *);
int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
@ -70,8 +69,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
if (expensive_debug_checks(c) &&
test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
if (expensive_debug_checks(c)) {
size_t iter;
long i;
unsigned j;
@ -94,7 +92,6 @@ void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
int bch2_alloc_write(struct bch_fs *, unsigned, bool *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */

View File

@ -212,9 +212,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
case RESERVE_ALLOC:
return 0;
case RESERVE_BTREE:
return BTREE_NODE_OPEN_BUCKET_RESERVE;
return OPEN_BUCKETS_COUNT / 4;
default:
return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
return OPEN_BUCKETS_COUNT / 2;
}
}

View File

@ -46,16 +46,22 @@ enum alloc_reserve {
typedef FIFO(long) alloc_fifo;
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
#define WRITE_POINT_MAX 32
typedef u16 open_bucket_idx_t;
struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
open_bucket_idx_t freelist;
/*
* When an open bucket has an ec_stripe attached, this is the index of
* the block in the stripe this open_bucket corresponds to:
*/
u8 ec_idx;
u8 type;
unsigned valid:1;
@ -68,8 +74,8 @@ struct open_bucket {
#define OPEN_BUCKET_LIST_MAX 15
struct open_buckets {
u8 nr;
u8 v[OPEN_BUCKET_LIST_MAX];
open_bucket_idx_t nr;
open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
};
struct dev_stripe_state {

View File

@ -190,6 +190,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
#include <linux/types.h>
@ -426,8 +427,8 @@ struct bch_dev {
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
size_t fifo_last_bucket;
@ -478,10 +479,10 @@ enum {
/* startup: */
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOC_CLEAN,
BCH_FS_ALLOCATOR_STARTED,
BCH_FS_ALLOCATOR_RUNNING,
BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
BCH_FS_RW,
@ -550,8 +551,8 @@ struct bch_fs {
struct super_block *vfs_sb;
char name[40];
/* ro/rw, add/remove devices: */
struct mutex state_lock;
/* ro/rw, add/remove/resize devices: */
struct rw_semaphore state_lock;
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
@ -631,6 +632,8 @@ struct bch_fs {
struct list_head btree_trans_list;
mempool_t btree_iters_pool;
struct btree_key_cache btree_key_cache;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
@ -687,8 +690,8 @@ struct bch_fs {
struct closure_waitlist freelist_wait;
u64 blocked_allocate;
u64 blocked_allocate_open_bucket;
u8 open_buckets_freelist;
u8 open_buckets_nr_free;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
@ -724,6 +727,7 @@ struct bch_fs {
struct rw_semaphore gc_lock;
/* IO PATH */
struct semaphore io_in_flight;
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;

View File

@ -28,7 +28,7 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
if (c->btree_roots[i].b)
reserve += min_t(unsigned, 1,
c->btree_roots[i].b->level) * 8;
c->btree_roots[i].b->c.level) * 8;
c->btree_cache.reserve = reserve;
}
@ -72,24 +72,33 @@ static const struct rhashtable_params bch_btree_cache_params = {
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
b->data = kvpmalloc(btree_bytes(c), gfp);
if (!b->data)
return -ENOMEM;
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
return -ENOMEM;
}
return 0;
}
static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
struct btree_cache *bc = &c->btree_cache;
b->data = kvpmalloc(btree_bytes(c), gfp);
if (!b->data)
goto err;
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
goto err;
bc->used++;
list_move(&b->list, &bc->freeable);
return;
err:
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
list_move(&b->list, &bc->freed);
if (!__btree_node_data_alloc(c, b, gfp)) {
bc->used++;
list_move(&b->list, &bc->freeable);
} else {
list_move(&b->list, &bc->freed);
}
}
static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
@ -99,7 +108,7 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
return NULL;
bkey_btree_ptr_init(&b->key);
six_lock_init(&b->lock);
six_lock_init(&b->c.lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
@ -131,8 +140,8 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
{
int ret;
b->level = level;
b->btree_id = id;
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
@ -163,10 +172,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
lockdep_assert_held(&bc->lock);
if (!six_trylock_intent(&b->lock))
if (!six_trylock_intent(&b->c.lock))
return -ENOMEM;
if (!six_trylock_write(&b->lock))
if (!six_trylock_write(&b->c.lock))
goto out_unlock_intent;
if (btree_node_noevict(b))
@ -207,9 +216,9 @@ out:
trace_btree_node_reap(c, b);
return ret;
out_unlock:
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
out_unlock_intent:
six_unlock_intent(&b->lock);
six_unlock_intent(&b->c.lock);
ret = -ENOMEM;
goto out;
}
@ -241,7 +250,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
if (sc->gfp_mask & __GFP_IO)
if (sc->gfp_mask & __GFP_FS)
mutex_lock(&bc->lock);
else if (!mutex_trylock(&bc->lock))
return -1;
@ -267,8 +276,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (++i > 3 &&
!btree_node_reclaim(c, b)) {
btree_node_data_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
}
}
@ -294,8 +303,8 @@ restart:
mutex_unlock(&bc->lock);
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
if (freed >= nr)
goto out;
@ -524,35 +533,47 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
*/
list_for_each_entry(b, &bc->freeable, list)
if (!btree_node_reclaim(c, b))
goto out_unlock;
goto got_node;
/*
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, &bc->freed, list)
if (!btree_node_reclaim(c, b)) {
btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
if (b->data)
goto out_unlock;
if (!btree_node_reclaim(c, b))
goto got_node;
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
b = NULL;
got_node:
if (b)
list_del_init(&b->list);
mutex_unlock(&bc->lock);
if (!b) {
b = kzalloc(sizeof(struct btree), GFP_KERNEL);
if (!b)
goto err;
}
b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
if (!b)
goto err;
bkey_btree_ptr_init(&b->key);
six_lock_init(&b->c.lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
}
if (!b->data) {
if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
goto err;
mutex_lock(&bc->lock);
bc->used++;
mutex_unlock(&bc->lock);
}
BUG_ON(!six_trylock_intent(&b->lock));
BUG_ON(!six_trylock_write(&b->lock));
out_unlock:
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_write_in_flight(b));
list_del_init(&b->list);
mutex_unlock(&bc->lock);
out:
b->flags = 0;
b->written = 0;
@ -568,6 +589,14 @@ out:
memalloc_nofs_restore(flags);
return b;
err:
mutex_lock(&bc->lock);
if (b) {
list_add(&b->list, &bc->freed);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
b = btree_node_cannibalize(c);
@ -620,8 +649,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
list_add(&b->list, &bc->freeable);
mutex_unlock(&bc->lock);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
return NULL;
}
@ -635,19 +664,27 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
bch2_btree_node_read(c, b, sync);
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
if (!sync) {
six_unlock_intent(&b->lock);
six_unlock_intent(&b->c.lock);
return NULL;
}
if (lock_type == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
six_lock_downgrade(&b->c.lock);
return b;
}
static int lock_node_check_fn(struct six_lock *lock, void *p)
{
struct btree *b = container_of(lock, struct btree, c.lock);
const struct bkey_i *k = p;
return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
}
/**
* bch_btree_node_get - find a btree node in the cache and lock it, reading it
* in from disk if necessary.
@ -720,13 +757,17 @@ lock_node:
if (btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
lock_node_check_fn, (void *) k)) {
if (b->hash_val != btree_ptr_hash_val(k))
goto retry;
return ERR_PTR(-EINTR);
}
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->level != level ||
b->c.level != level ||
race_fault())) {
six_unlock_type(&b->lock, lock_type);
six_unlock_type(&b->c.lock, lock_type);
if (bch2_btree_node_relock(iter, level + 1))
goto retry;
@ -754,11 +795,11 @@ lock_node:
set_btree_node_accessed(b);
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->lock, lock_type);
six_unlock_type(&b->c.lock, lock_type);
return ERR_PTR(-EIO);
}
EBUG_ON(b->btree_id != iter->btree_id ||
EBUG_ON(b->c.btree_id != iter->btree_id ||
BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
@ -773,6 +814,7 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@ -793,12 +835,14 @@ retry:
return b;
} else {
lock_node:
six_lock_read(&b->lock);
ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
if (ret)
goto retry;
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->btree_id != btree_id ||
b->level != level)) {
six_unlock_read(&b->lock);
b->c.btree_id != btree_id ||
b->c.level != level)) {
six_unlock_read(&b->c.lock);
goto retry;
}
}
@ -822,11 +866,11 @@ lock_node:
set_btree_node_accessed(b);
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->lock);
six_unlock_read(&b->c.lock);
return ERR_PTR(-EIO);
}
EBUG_ON(b->btree_id != btree_id ||
EBUG_ON(b->c.btree_id != btree_id ||
BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
@ -844,7 +888,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
struct btree *ret = NULL;
unsigned level = b->level;
unsigned level = b->c.level;
parent = btree_iter_node(iter, level + 1);
if (!parent)
@ -867,7 +911,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
goto out;
}
node_iter = iter->l[parent->level].iter;
node_iter = iter->l[parent->c.level].iter;
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
@ -914,7 +958,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
if (!IS_ERR(ret)) {
six_unlock_intent(&ret->lock);
six_unlock_intent(&ret->c.lock);
ret = ERR_PTR(-EINTR);
}
}
@ -975,7 +1019,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
pr_buf(out,
"l %u %llu:%llu - %llu:%llu:\n"
" ptrs: ",
b->level,
b->c.level,
b->data->min_key.inode,
b->data->min_key.offset,
b->data->max_key.inode,

View File

@ -101,7 +101,7 @@ static inline unsigned btree_blocks(struct bch_fs *c)
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b)
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
struct btree *);

View File

@ -186,7 +186,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
bch2_btree_node_iter_advance(&iter, b);
if (b->level) {
if (b->c.level) {
ret = bch2_gc_check_topology(c, k,
&next_node_start,
b->data->max_key,
@ -252,7 +252,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
if (!btree_node_fake(b))
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
&max_stale, initial);
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
return ret;
@ -280,7 +280,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
if (ret)
break;
if (b->level) {
if (b->c.level) {
struct btree *child;
BKEY_PADDED(k) tmp;
@ -296,16 +296,16 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
if (ret)
break;
if (b->level > target_depth) {
if (b->c.level > target_depth) {
child = bch2_btree_node_get_noiter(c, &tmp.k,
b->btree_id, b->level - 1);
b->c.btree_id, b->c.level - 1);
ret = PTR_ERR_OR_ZERO(child);
if (ret)
break;
ret = bch2_gc_btree_init_recurse(c, child,
journal_keys, target_depth);
six_unlock_read(&child->lock);
six_unlock_read(&child->c.lock);
if (ret)
break;
@ -336,7 +336,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
if (btree_node_fake(b))
return 0;
six_lock_read(&b->lock);
six_lock_read(&b->c.lock, NULL, NULL);
if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
"btree root with incorrect min_key: %llu:%llu",
b->data->min_key.inode,
@ -351,7 +351,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
BUG();
}
if (b->level >= target_depth)
if (b->c.level >= target_depth)
ret = bch2_gc_btree_init_recurse(c, b,
journal_keys, target_depth);
@ -359,7 +359,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
&max_stale, true);
fsck_err:
six_unlock_read(&b->lock);
six_unlock_read(&b->c.lock);
return ret;
}
@ -798,6 +798,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
unsigned i, iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
trace_gc_start(c);
down_write(&c->gc_lock);
@ -884,6 +885,76 @@ out:
return ret;
}
/*
* For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
* node pointers currently never have cached pointers that can become stale:
*/
static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
int ret;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr, false);
if (gen_after(g->gc_gen, ptr->gen))
g->gc_gen = ptr->gen;
if (gen_after(g->mark.gen, ptr->gen) > 32) {
/* rewrite btree node */
}
}
}
bch2_trans_exit(&trans);
return ret;
}
int bch2_gc_gens(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
int ret;
down_read(&c->state_lock);
for_each_member_device(ca, c, i) {
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
for_each_bucket(g, buckets)
g->gc_gen = g->mark.gen;
}
for (i = 0; i < BTREE_ID_NR; i++)
if (btree_node_type_needs_gc(i)) {
ret = bch2_gc_btree_gens(c, i);
if (ret)
goto err;
}
for_each_member_device(ca, c, i) {
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
for_each_bucket(g, buckets)
g->oldest_gen = g->gc_gen;
}
err:
up_read(&c->state_lock);
return ret;
}
/* Btree coalescing */
static void recalc_packed_keys(struct btree *b)
@ -1007,9 +1078,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
set_btree_bset_end(n1, n1->set);
six_unlock_write(&n2->lock);
six_unlock_write(&n2->c.lock);
bch2_btree_node_free_never_inserted(c, n2);
six_unlock_intent(&n2->lock);
six_unlock_intent(&n2->c.lock);
memmove(new_nodes + i - 1,
new_nodes + i,
@ -1045,7 +1116,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_build_aux_trees(n);
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->lock);
six_unlock_write(&n->c.lock);
bch2_btree_node_write(c, n, SIX_LOCK_intent);
}
@ -1088,7 +1159,7 @@ next:
BUG_ON(!bch2_keylist_empty(&keylist));
BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);
bch2_btree_iter_node_replace(iter, new_nodes[0]);
@ -1113,7 +1184,7 @@ next:
}
for (i = 0; i < nr_new_nodes; i++)
six_unlock_intent(&new_nodes[i]->lock);
six_unlock_intent(&new_nodes[i]->c.lock);
bch2_btree_update_done(as);
bch2_keylist_free(&keylist, NULL);
@ -1154,11 +1225,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
for (i = 1; i < GC_MERGE_NODES; i++) {
if (!merge[i] ||
!six_relock_intent(&merge[i]->lock, lock_seq[i]))
!six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
break;
if (merge[i]->level != merge[0]->level) {
six_unlock_intent(&merge[i]->lock);
if (merge[i]->c.level != merge[0]->c.level) {
six_unlock_intent(&merge[i]->c.lock);
break;
}
}
@ -1167,11 +1238,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
bch2_coalesce_nodes(c, iter, merge);
for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
lock_seq[i] = merge[i]->lock.state.seq;
six_unlock_intent(&merge[i]->lock);
lock_seq[i] = merge[i]->c.lock.state.seq;
six_unlock_intent(&merge[i]->c.lock);
}
lock_seq[0] = merge[0]->lock.state.seq;
lock_seq[0] = merge[0]->c.lock.state.seq;
if (kthread && kthread_should_stop()) {
bch2_trans_exit(&trans);
@ -1259,7 +1330,14 @@ static int bch2_gc_thread(void *arg)
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
* Full gc is currently incompatible with btree key cache:
*/
#if 0
ret = bch2_gc(c, NULL, false, false);
#else
ret = bch2_gc_gens(c);
#endif
if (ret)
bch_err(c, "btree gc failed: %i", ret);

View File

@ -8,6 +8,7 @@ void bch2_coalesce(struct bch_fs *);
struct journal_keys;
int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
@ -81,7 +82,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id,
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
}
/*

View File

@ -584,8 +584,8 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
struct btree_node_entry *bne;
bool did_sort;
EBUG_ON(!(b->lock.state.seq & 1));
EBUG_ON(iter && iter->l[b->level].b != b);
EBUG_ON(!(b->c.lock.state.seq & 1));
EBUG_ON(iter && iter->l[b->c.level].b != b);
did_sort = btree_node_compact(c, b, iter);
@ -634,8 +634,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n"
"pos ",
write ? "before write " : "",
b->btree_id, b->level,
c->btree_roots[b->btree_id].level);
b->c.btree_id, b->c.level,
c->btree_roots[b->c.btree_id].level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
pr_buf(out, " node offset %u", b->written);
@ -747,11 +747,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->btree_id,
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->level,
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect level");
@ -762,7 +762,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (!write)
compat_btree_node(b->level, b->btree_id, version,
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
@ -783,7 +783,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
"incorrect max key");
if (write)
compat_btree_node(b->level, b->btree_id, version,
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
/* XXX: ideally we would be validating min_key too */
@ -805,7 +805,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
BTREE_ERR_FATAL, c, b, i,
"invalid bkey format: %s", err);
compat_bformat(b->level, b->btree_id, version,
compat_bformat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
@ -851,7 +851,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
/* XXX: validate k->u64s */
if (!write)
bch2_bkey_compat(b->level, b->btree_id, version,
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&b->format, k);
@ -874,7 +874,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
}
if (write)
bch2_bkey_compat(b->level, b->btree_id, version,
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&b->format, k);
@ -1280,8 +1280,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
bch2_btree_set_root_for_read(c, b);
err:
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
return ret;
}
@ -1325,15 +1325,15 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->level, 0);
iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level, 0);
retry:
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto err;
/* has node been freed? */
if (iter->l[b->level].b != b) {
if (iter->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
@ -1764,18 +1764,18 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON(lock_type_held == SIX_LOCK_write);
if (lock_type_held == SIX_LOCK_intent ||
six_lock_tryupgrade(&b->lock)) {
six_lock_tryupgrade(&b->c.lock)) {
__bch2_btree_node_write(c, b, SIX_LOCK_intent);
/* don't cycle lock unnecessarily: */
if (btree_node_just_written(b) &&
six_trylock_write(&b->lock)) {
six_trylock_write(&b->c.lock)) {
bch2_btree_post_write_cleanup(c, b);
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
}
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
six_lock_downgrade(&b->c.lock);
} else {
__bch2_btree_node_write(c, b, SIX_LOCK_read);
}
@ -1845,7 +1845,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
b->level,
b->c.level,
b->written,
!list_empty_careful(&b->write_blocked),
b->will_make_reachable != 0,

View File

@ -114,7 +114,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
break;
}
six_unlock_type(&b->lock, lock_held);
six_unlock_type(&b->c.lock, lock_held);
btree_node_wait_on_io(b);
btree_node_lock_type(c, b, lock_held);
}

View File

@ -4,22 +4,16 @@
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
@ -51,7 +45,7 @@ static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
struct btree *b)
{
return iter->btree_id == b->btree_id &&
return iter->btree_id == b->c.btree_id &&
!btree_iter_pos_before_node(iter, b) &&
!btree_iter_pos_after_node(iter, b);
}
@ -68,11 +62,11 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
struct btree_iter *linked;
unsigned readers = 0;
EBUG_ON(!btree_node_intent_locked(iter, b->level));
EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
trans_for_each_iter(iter->trans, linked)
if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
if (linked->l[b->c.level].b == b &&
btree_node_read_locked(linked, b->c.level))
readers++;
/*
@ -82,10 +76,10 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
* locked:
*/
atomic64_sub(__SIX_VAL(read_lock, readers),
&b->lock.state.counter);
&b->c.lock.state.counter);
btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
atomic64_add(__SIX_VAL(read_lock, readers),
&b->lock.state.counter);
&b->c.lock.state.counter);
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
@ -99,9 +93,9 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
if (race_fault())
return false;
if (six_relock_type(&b->lock, want, iter->l[level].lock_seq) ||
if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
(btree_node_lock_seq_matches(iter, b, level) &&
btree_node_lock_increment(iter, b, level, want))) {
btree_node_lock_increment(iter->trans, b, level, want))) {
mark_btree_node_locked(iter, level, want);
return true;
} else {
@ -125,12 +119,12 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
return false;
if (btree_node_locked(iter, level)
? six_lock_tryupgrade(&b->lock)
: six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
? six_lock_tryupgrade(&b->c.lock)
: six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
goto success;
if (btree_node_lock_seq_matches(iter, b, level) &&
btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_unlock(iter, level);
goto success;
}
@ -162,7 +156,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter,
? 0
: (unsigned long) iter->l[l].b,
is_btree_node(iter, l)
? iter->l[l].b->lock.state.seq
? iter->l[l].b->c.lock.state.seq
: 0);
fail_idx = l;
@ -193,23 +187,21 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter,
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
struct btree_iter *iter,
enum six_lock_type type)
unsigned level, struct btree_iter *iter,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn,
void *p)
{
struct btree_trans *trans = iter->trans;
struct btree_iter *linked;
u64 start_time = local_clock();
bool ret = true;
/* Check if it's safe to block: */
trans_for_each_iter(iter->trans, linked) {
trans_for_each_iter(trans, linked) {
if (!linked->nodes_locked)
continue;
/* Must lock btree nodes in key order: */
if ((cmp_int(iter->btree_id, linked->btree_id) ?:
bkey_cmp(pos, linked->pos)) < 0)
ret = false;
/*
* Can't block taking an intent lock if we have _any_ nodes read
* locked:
@ -224,13 +216,15 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
if (!(iter->trans->nounlock)) {
if (!(trans->nounlock)) {
linked->locks_want = max_t(unsigned,
linked->locks_want,
__fls(linked->nodes_locked) + 1);
btree_iter_get_locks(linked, true, false);
if (!btree_iter_get_locks(linked, true, false))
ret = false;
} else {
ret = false;
}
ret = false;
}
/*
@ -240,14 +234,37 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/
if (linked->btree_id == iter->btree_id &&
level > __fls(linked->nodes_locked)) {
if (!(iter->trans->nounlock)) {
if (!(trans->nounlock)) {
linked->locks_want =
max(level + 1, max_t(unsigned,
linked->locks_want,
iter->locks_want));
btree_iter_get_locks(linked, true, false);
if (!btree_iter_get_locks(linked, true, false))
ret = false;
} else {
ret = false;
}
}
/* Must lock btree nodes in key order: */
if ((cmp_int(iter->btree_id, linked->btree_id) ?:
-cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
ret = false;
if (iter->btree_id == linked->btree_id &&
btree_node_locked(linked, level) &&
bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0)
ret = false;
/*
* Recheck if this is a node we already have locked - since one
* of the get_locks() calls might've successfully
* upgraded/relocked it:
*/
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) >= type) {
six_lock_increment(&b->c.lock, type);
return true;
}
}
@ -256,7 +273,14 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
return false;
}
__btree_node_lock_type(iter->trans->c, b, type);
if (six_trylock_type(&b->c.lock, type))
return true;
if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
return false;
bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
start_time);
return true;
}
@ -267,7 +291,12 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
unsigned l;
for (l = 0; btree_iter_node(iter, l); l++) {
if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
BUG_ON(iter->nodes_locked);
return;
}
for (l = 0; is_btree_node(iter, l); l++) {
if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
!btree_node_locked(iter, l))
continue;
@ -281,7 +310,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans)
{
struct btree_iter *iter;
trans_for_each_iter(trans, iter)
trans_for_each_iter_all(trans, iter)
bch2_btree_iter_verify_locks(iter);
}
#else
@ -289,7 +318,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif
__flatten
static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
{
return btree_iter_get_locks(iter, false, trace);
}
@ -349,31 +378,20 @@ bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
void __bch2_btree_iter_downgrade(struct btree_iter *iter,
unsigned downgrade_to)
{
struct btree_iter *linked;
unsigned l;
unsigned l, new_locks_want = downgrade_to ?:
(iter->flags & BTREE_ITER_INTENT ? 1 : 0);
/*
* We downgrade linked iterators as well because btree_iter_upgrade
* might have had to modify locks_want on linked iterators due to lock
* ordering:
*/
trans_for_each_iter(iter->trans, linked) {
unsigned new_locks_want = downgrade_to ?:
(linked->flags & BTREE_ITER_INTENT ? 1 : 0);
if (iter->locks_want < downgrade_to) {
iter->locks_want = new_locks_want;
if (linked->locks_want <= new_locks_want)
continue;
linked->locks_want = new_locks_want;
while (linked->nodes_locked &&
(l = __fls(linked->nodes_locked)) >= linked->locks_want) {
if (l > linked->level) {
btree_node_unlock(linked, l);
while (iter->nodes_locked &&
(l = __fls(iter->nodes_locked)) >= iter->locks_want) {
if (l > iter->level) {
btree_node_unlock(iter, l);
} else {
if (btree_node_intent_locked(linked, l)) {
six_lock_downgrade(&linked->l[l].b->lock);
linked->nodes_intent_locked ^= 1 << l;
if (btree_node_intent_locked(iter, l)) {
six_lock_downgrade(&iter->l[l].b->c.lock);
iter->nodes_intent_locked ^= 1 << l;
}
break;
}
@ -383,6 +401,14 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter,
bch2_btree_trans_verify_locks(iter->trans);
}
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_iter *iter;
trans_for_each_iter(trans, iter)
bch2_btree_iter_downgrade(iter);
}
/* Btree transaction locking: */
bool bch2_trans_relock(struct btree_trans *trans)
@ -514,7 +540,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
return;
trans_for_each_iter_with_node(trans, b, iter)
bch2_btree_iter_verify_level(iter, b->level);
bch2_btree_iter_verify_level(iter, b->c.level);
}
#else
@ -545,7 +571,7 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
struct btree *b,
struct bkey_packed *where)
{
struct btree_iter_level *l = &iter->l[b->level];
struct btree_iter_level *l = &iter->l[b->c.level];
struct bpos pos = btree_iter_search_key(iter);
if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
@ -565,7 +591,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
trans_for_each_iter_with_node(iter->trans, b, linked) {
__bch2_btree_iter_fix_key_modified(linked, b, where);
bch2_btree_iter_verify_level(linked, b->level);
bch2_btree_iter_verify_level(linked, b->c.level);
}
}
@ -635,7 +661,7 @@ fixup_done:
*/
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
(b->level ||
(b->c.level ||
btree_node_type_is_extents(iter->btree_id))) {
struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
@ -662,7 +688,7 @@ fixup_done:
}
}
if (!b->level &&
if (!b->c.level &&
node_iter == &iter->l[0].iter &&
iter_current_key_modified)
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
@ -678,7 +704,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct btree_iter *linked;
if (node_iter != &iter->l[b->level].iter) {
if (node_iter != &iter->l[b->c.level].iter) {
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
@ -688,9 +714,9 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
trans_for_each_iter_with_node(iter->trans, b, linked) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->level].iter, t,
&linked->l[b->c.level].iter, t,
where, clobber_u64s, new_u64s);
bch2_btree_iter_verify_level(linked, b->level);
bch2_btree_iter_verify_level(linked, b->c.level);
}
}
@ -774,7 +800,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return;
plevel = b->level + 1;
plevel = b->c.level + 1;
if (!btree_iter_node(iter, plevel))
return;
@ -797,7 +823,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
}
if (!parent_locked)
btree_node_unlock(iter, b->level + 1);
btree_node_unlock(iter, b->c.level + 1);
}
static inline void __btree_iter_init(struct btree_iter *iter,
@ -814,14 +840,16 @@ static inline void __btree_iter_init(struct btree_iter *iter,
static inline void btree_iter_node_set(struct btree_iter *iter,
struct btree *b)
{
BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
btree_iter_verify_new_node(iter, b);
EBUG_ON(!btree_iter_pos_in_node(iter, b));
EBUG_ON(b->lock.state.seq & 1);
EBUG_ON(b->c.lock.state.seq & 1);
iter->l[b->level].lock_seq = b->lock.state.seq;
iter->l[b->level].b = b;
__btree_iter_init(iter, b->level);
iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
iter->l[b->c.level].b = b;
__btree_iter_init(iter, b->c.level);
}
/*
@ -834,18 +862,19 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
struct btree_iter *linked;
trans_for_each_iter(iter->trans, linked)
if (btree_iter_pos_in_node(linked, b)) {
if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
btree_iter_pos_in_node(linked, b)) {
/*
* bch2_btree_iter_node_drop() has already been called -
* the old node we're replacing has already been
* unlocked and the pointer invalidated
*/
BUG_ON(btree_node_locked(linked, b->level));
BUG_ON(btree_node_locked(linked, b->c.level));
t = btree_lock_want(linked, b->level);
t = btree_lock_want(linked, b->c.level);
if (t != BTREE_NODE_UNLOCKED) {
six_lock_increment(&b->lock, t);
mark_btree_node_locked(linked, b->level, t);
six_lock_increment(&b->c.lock, t);
mark_btree_node_locked(linked, b->c.level, t);
}
btree_iter_node_set(linked, b);
@ -855,7 +884,7 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
unsigned level = b->level;
unsigned level = b->c.level;
trans_for_each_iter(iter->trans, linked)
if (linked->l[level].b == b) {
@ -873,22 +902,30 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
struct btree_iter *linked;
trans_for_each_iter_with_node(iter->trans, b, linked)
__btree_iter_init(linked, b->level);
__btree_iter_init(linked, b->c.level);
}
static int lock_root_check_fn(struct six_lock *lock, void *p)
{
struct btree *b = container_of(lock, struct btree, c.lock);
struct btree **rootp = p;
return b == *rootp ? 0 : -1;
}
static inline int btree_iter_lock_root(struct btree_iter *iter,
unsigned depth_want)
{
struct bch_fs *c = iter->trans->c;
struct btree *b;
struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
enum six_lock_type lock_type;
unsigned i;
EBUG_ON(iter->nodes_locked);
while (1) {
b = READ_ONCE(c->btree_roots[iter->btree_id].b);
iter->level = READ_ONCE(b->level);
b = READ_ONCE(*rootp);
iter->level = READ_ONCE(b->c.level);
if (unlikely(iter->level < depth_want)) {
/*
@ -905,11 +942,12 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
lock_type = __btree_lock_want(iter, iter->level);
if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
iter, lock_type)))
iter, lock_type,
lock_root_check_fn, rootp)))
return -EINTR;
if (likely(b == c->btree_roots[iter->btree_id].b &&
b->level == iter->level &&
if (likely(b == READ_ONCE(*rootp) &&
b->c.level == iter->level &&
!race_fault())) {
for (i = 0; i < iter->level; i++)
iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
@ -922,7 +960,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
return 0;
}
six_unlock_type(&b->lock, lock_type);
six_unlock_type(&b->c.lock, lock_type);
}
}
@ -1017,24 +1055,28 @@ static void btree_iter_up(struct btree_iter *iter)
static int btree_iter_traverse_one(struct btree_iter *);
static int __btree_iter_traverse_all(struct btree_trans *trans,
struct btree_iter *orig_iter, int ret)
static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
u8 sorted[BTREE_ITER_MAX];
unsigned i, nr_sorted = 0;
if (trans->in_traverse_all)
return -EINTR;
trans->in_traverse_all = true;
retry_all:
nr_sorted = 0;
trans_for_each_iter(trans, iter)
sorted[nr_sorted++] = iter - trans->iters;
sorted[nr_sorted++] = iter->idx;
#define btree_iter_cmp_by_idx(_l, _r) \
btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
retry_all:
bch2_trans_unlock(trans);
if (unlikely(ret == -ENOMEM)) {
@ -1050,11 +1092,6 @@ retry_all:
if (unlikely(ret == -EIO)) {
trans->error = true;
if (orig_iter) {
orig_iter->flags |= BTREE_ITER_ERROR;
orig_iter->l[orig_iter->level].b =
BTREE_ITER_NO_NODE_ERROR;
}
goto out;
}
@ -1062,9 +1099,16 @@ retry_all:
/* Now, redo traversals in correct order: */
for (i = 0; i < nr_sorted; i++) {
iter = &trans->iters[sorted[i]];
unsigned idx = sorted[i];
ret = btree_iter_traverse_one(iter);
/*
* successfully traversing one iterator can cause another to be
* unlinked, in btree_key_cache_fill()
*/
if (!(trans->iters_linked & (1ULL << idx)))
continue;
ret = btree_iter_traverse_one(&trans->iters[idx]);
if (ret)
goto retry_all;
}
@ -1079,12 +1123,14 @@ retry_all:
}
out:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
return ret;
}
int bch2_btree_iter_traverse_all(struct btree_trans *trans)
{
return __btree_iter_traverse_all(trans, NULL, 0);
return __btree_iter_traverse_all(trans, 0);
}
static inline bool btree_iter_good_node(struct btree_iter *iter,
@ -1129,9 +1175,6 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
/*
* if we need interior nodes locked, call btree_iter_relock() to make
* sure we walk back up enough that we lock them:
@ -1140,9 +1183,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
iter->locks_want > 1)
bch2_btree_iter_relock(iter, false);
if (btree_iter_type(iter) == BTREE_ITER_CACHED)
return bch2_btree_iter_traverse_cached(iter);
if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
/*
* XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
* here unnecessary
@ -1176,7 +1225,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
return 0;
iter->level = depth_want;
iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN;
if (ret == -EIO) {
iter->flags |= BTREE_ITER_ERROR;
iter->l[iter->level].b =
BTREE_ITER_NO_NODE_ERROR;
} else {
iter->l[iter->level].b =
BTREE_ITER_NO_NODE_DOWN;
}
return ret;
}
}
@ -1189,12 +1246,13 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
int ret;
ret = bch2_trans_cond_resched(iter->trans) ?:
ret = bch2_trans_cond_resched(trans) ?:
btree_iter_traverse_one(iter);
if (unlikely(ret))
ret = __btree_iter_traverse_all(iter->trans, iter, ret);
ret = __btree_iter_traverse_all(trans, ret);
return ret;
}
@ -1343,6 +1401,13 @@ static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
if (!cmp)
goto out;
if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
btree_node_unlock(iter, 0);
iter->l[0].b = BTREE_ITER_NO_NODE_UP;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
return;
}
l = btree_iter_up_until_good_node(iter, cmp);
if (btree_iter_node(iter, l)) {
@ -1774,6 +1839,26 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
{
struct bkey_cached *ck;
int ret;
bch2_btree_iter_checks(iter, BTREE_ITER_CACHED);
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
ck = (void *) iter->l[0].b;
EBUG_ON(iter->btree_id != ck->key.btree_id ||
bkey_cmp(iter->pos, ck->key.pos));
BUG_ON(!ck->valid);
return bkey_i_to_s_c(ck->k);
}
static inline void bch2_btree_iter_init(struct btree_trans *trans,
struct btree_iter *iter, enum btree_id btree_id,
struct bpos pos, unsigned flags)
@ -1959,10 +2044,11 @@ static inline void btree_iter_copy(struct btree_iter *dst,
*dst = *src;
dst->idx = idx;
dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_locked(dst, i))
six_lock_increment(&dst->l[i].b->lock,
six_lock_increment(&dst->l[i].b->c.lock,
__btree_lock_want(dst, i));
dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
@ -2017,8 +2103,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
iter = best;
}
iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iter->flags &= ~BTREE_ITER_USER_FLAGS;
iter->flags |= flags & BTREE_ITER_USER_FLAGS;
if (iter->flags & BTREE_ITER_INTENT)
bch2_btree_iter_upgrade(iter, 1);
@ -2222,6 +2309,8 @@ int bch2_trans_exit(struct btree_trans *trans)
mutex_unlock(&trans->c->btree_trans_lock);
#endif
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
if (trans->used_mempool)
@ -2244,13 +2333,15 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
trans_for_each_iter(trans, iter) {
if (!iter->nodes_locked)
continue;
pr_buf(out, " iter %s:", bch2_btree_ids[iter->btree_id]);
pr_buf(out, " iter %u %s:",
iter->idx,
bch2_btree_ids[iter->btree_id]);
bch2_bpos_to_text(out, iter->pos);
pr_buf(out, "\n");
@ -2258,8 +2349,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
if (btree_node_locked(iter, l)) {
b = iter->l[l].b;
pr_buf(out, " %p l=%u %s ",
b, l, btree_node_intent_locked(iter, l) ? "i" : "r");
pr_buf(out, " %px %s l=%u ",
b, btree_node_intent_locked(iter, l) ? "i" : "r", l);
bch2_bpos_to_text(out, b->key.k.p);
pr_buf(out, "\n");
}
@ -2268,9 +2359,15 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
b = READ_ONCE(trans->locking);
if (b) {
pr_buf(out, " locking %px l=%u %s:",
b, b->level,
bch2_btree_ids[b->btree_id]);
pr_buf(out, " locking iter %u l=%u %s:",
trans->locking_iter_idx,
trans->locking_level,
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
pr_buf(out, " node %px l=%u %s:",
b, b->c.level,
bch2_btree_ids[b->c.btree_id]);
bch2_bpos_to_text(out, b->key.k.p);
pr_buf(out, "\n");
}

View File

@ -27,13 +27,13 @@ static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
* that write lock. The lock sequence number is incremented by taking
* and releasing write locks and is even when unlocked:
*/
return iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1;
return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
}
static inline struct btree *btree_node_parent(struct btree_iter *iter,
struct btree *b)
{
return btree_iter_node(iter, b->level + 1);
return btree_iter_node(iter, b->c.level + 1);
}
static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
@ -73,8 +73,8 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx)
static inline bool __iter_has_node(const struct btree_iter *iter,
const struct btree *b)
{
return iter->l[b->level].b == b &&
btree_node_lock_seq_matches(iter, b, b->level);
return iter->l[b->c.level].b == b &&
btree_node_lock_seq_matches(iter, b, b->c.level);
}
static inline struct btree_iter *
@ -110,6 +110,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_packed *,
unsigned, unsigned);
bool bch2_btree_iter_relock(struct btree_iter *, bool);
bool bch2_trans_relock(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
@ -136,6 +137,8 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
__bch2_btree_iter_downgrade(iter, 0);
}
void bch2_trans_downgrade(struct btree_trans *);
void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
@ -168,6 +171,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
@ -175,7 +180,9 @@ void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
static inline int btree_iter_cmp(const struct btree_iter *l,
const struct btree_iter *r)
{
return cmp_int(l->btree_id, r->btree_id) ?: bkey_cmp(l->pos, r->pos);
return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
bkey_cmp(l->pos, r->pos);
}
/*
@ -209,9 +216,12 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans)
static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
unsigned flags)
{
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED)
return bch2_btree_iter_peek_cached(iter);
else
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
}
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,

View File

@ -0,0 +1,494 @@
#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include <trace/events/bcachefs.h>
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct bkey_cached *ck = obj;
const struct bkey_cached_key *key = arg->key;
return cmp_int(ck->key.btree_id, key->btree_id) ?:
bkey_cmp(ck->key.pos, key->pos);
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
.head_offset = offsetof(struct bkey_cached, hash),
.key_offset = offsetof(struct bkey_cached, key),
.key_len = sizeof(struct bkey_cached_key),
.obj_cmpfn = bch2_btree_key_cache_cmp_fn,
};
__flatten
static inline struct bkey_cached *
btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
struct bkey_cached_key key = {
.btree_id = btree_id,
.pos = pos,
};
return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
bch2_btree_key_cache_params);
}
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
if (!six_trylock_intent(&ck->c.lock))
return false;
if (!six_trylock_write(&ck->c.lock)) {
six_unlock_intent(&ck->c.lock);
return false;
}
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
return false;
}
return true;
}
static void bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
}
static void bkey_cached_free(struct btree_key_cache *c,
struct bkey_cached *ck)
{
list_move(&ck->list, &c->freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
static struct bkey_cached *
bkey_cached_alloc(struct btree_key_cache *c)
{
struct bkey_cached *ck;
list_for_each_entry(ck, &c->freed, list)
if (bkey_cached_lock_for_evict(ck))
return ck;
list_for_each_entry(ck, &c->clean, list)
if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
return ck;
}
ck = kzalloc(sizeof(*ck), GFP_NOFS);
if (!ck)
return NULL;
INIT_LIST_HEAD(&ck->list);
six_lock_init(&ck->c.lock);
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
return ck;
}
static struct bkey_cached *
btree_key_cache_create(struct btree_key_cache *c,
enum btree_id btree_id,
struct bpos pos)
{
struct bkey_cached *ck;
ck = bkey_cached_alloc(c);
if (!ck)
return ERR_PTR(-ENOMEM);
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
ck->key.pos = pos;
ck->valid = false;
BUG_ON(ck->flags);
if (rhashtable_lookup_insert_fast(&c->table,
&ck->hash,
bch2_btree_key_cache_params)) {
/* We raced with another fill: */
bkey_cached_free(c, ck);
return NULL;
}
list_move(&ck->list, &c->clean);
six_unlock_write(&ck->c.lock);
return ck;
}
static int btree_key_cache_fill(struct btree_trans *trans,
struct btree_iter *ck_iter,
struct bkey_cached *ck)
{
struct btree_iter *iter;
struct bkey_s_c k;
unsigned new_u64s = 0;
struct bkey_i *new_k = NULL;
int ret;
iter = bch2_trans_get_iter(trans, ck->key.btree_id,
ck->key.pos, BTREE_ITER_SLOTS);
if (IS_ERR(iter))
return PTR_ERR(iter);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret) {
bch2_trans_iter_put(trans, iter);
return ret;
}
if (!bch2_btree_node_relock(ck_iter, 0)) {
bch2_trans_iter_put(trans, iter);
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
return -EINTR;
}
if (k.k->u64s > ck->u64s) {
new_u64s = roundup_pow_of_two(k.k->u64s);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
bch2_trans_iter_put(trans, iter);
return -ENOMEM;
}
}
bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
ck->k = new_k;
}
bkey_reassemble(ck->k, k);
ck->valid = true;
bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
/* We're not likely to need this iterator again: */
bch2_trans_iter_free(trans, iter);
return 0;
}
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
const struct btree_iter *iter = p;
return ck->key.btree_id == iter->btree_id &&
!bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}
int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
int ret = 0;
BUG_ON(iter->level);
if (btree_node_locked(iter, 0)) {
ck = (void *) iter->l[0].b;
goto fill;
}
retry:
ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
if (!ck) {
if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
iter->l[0].b = NULL;
return 0;
}
mutex_lock(&c->btree_key_cache.lock);
ck = btree_key_cache_create(&c->btree_key_cache,
iter->btree_id, iter->pos);
mutex_unlock(&c->btree_key_cache.lock);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
if (!ck)
goto retry;
mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
iter->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(iter, 0);
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
bkey_cached_check_fn, iter)) {
if (ck->key.btree_id != iter->btree_id ||
bkey_cmp(ck->key.pos, iter->pos)) {
goto retry;
}
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
if (ck->key.btree_id != iter->btree_id ||
bkey_cmp(ck->key.pos, iter->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
mark_btree_node_locked(iter, 0, lock_want);
}
iter->l[0].lock_seq = ck->c.lock.state.seq;
iter->l[0].b = (void *) ck;
fill:
if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
if (!btree_node_intent_locked(iter, 0))
bch2_btree_iter_upgrade(iter, 1);
if (!btree_node_intent_locked(iter, 0)) {
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
ret = btree_key_cache_fill(trans, iter, ck);
if (ret)
goto err;
}
iter->uptodate = BTREE_ITER_NEED_PEEK;
bch2_btree_iter_downgrade(iter);
return ret;
err:
if (ret != -EINTR) {
btree_node_unlock(iter, 0);
iter->flags |= BTREE_ITER_ERROR;
iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
}
return ret;
}
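Callers treat -EINTR from this function as a transaction restart rather than a hard error. A condensed sketch of the caller-side retry pattern, modelled on btree_key_cache_flush_pos() below; do_update() is a hypothetical stand-in for whatever work follows the traversal:

/* Illustrative only: retry on -EINTR, mirroring the flush path below. */
static int traverse_and_update(struct btree_trans *trans,
			       struct btree_iter *c_iter)
{
	int ret;
retry:
	ret = bch2_btree_iter_traverse(c_iter) ?:
		do_update(trans, c_iter);	/* hypothetical follow-up work */
	if (ret == -EINTR)
		goto retry;			/* locks were dropped: walk back in */
	return ret;
}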
static int btree_key_cache_flush_pos(struct btree_trans *trans,
struct bkey_cached_key key,
u64 journal_seq,
bool evict)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_iter *c_iter = NULL, *b_iter = NULL;
struct bkey_cached *ck;
int ret;
b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_SLOTS|
BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(b_iter);
if (ret)
goto out;
c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(c_iter);
if (ret)
goto out;
retry:
ret = bch2_btree_iter_traverse(c_iter);
if (ret)
goto err;
ck = (void *) c_iter->l[0].b;
if (!ck ||
(journal_seq && ck->journal.seq != journal_seq))
goto out;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
if (!evict)
goto out;
goto evict;
}
ret = bch2_btree_iter_traverse(b_iter) ?:
bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED|
BTREE_INSERT_JOURNAL_RECLAIM);
err:
if (ret == -EINTR)
goto retry;
BUG_ON(ret && !bch2_journal_error(j));
if (ret)
goto out;
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
if (!evict) {
mutex_lock(&c->btree_key_cache.lock);
list_move_tail(&ck->list, &c->btree_key_cache.clean);
mutex_unlock(&c->btree_key_cache.lock);
} else {
evict:
BUG_ON(!btree_node_intent_locked(c_iter, 0));
mark_btree_node_unlocked(c_iter, 0);
c_iter->l[0].b = NULL;
six_lock_write(&ck->c.lock, NULL, NULL);
mutex_lock(&c->btree_key_cache.lock);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
}
out:
bch2_trans_iter_put(trans, b_iter);
bch2_trans_iter_put(trans, c_iter);
return ret;
}
static void btree_key_cache_journal_flush(struct journal *j,
struct journal_entry_pin *pin,
u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
struct btree_trans trans;
six_lock_read(&ck->c.lock, NULL, NULL);
key = READ_ONCE(ck->key);
if (ck->journal.seq != seq ||
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
six_unlock_read(&ck->c.lock);
return;
}
six_unlock_read(&ck->c.lock);
bch2_trans_init(&trans, c, 0, 0);
btree_key_cache_flush_pos(&trans, key, seq, false);
bch2_trans_exit(&trans);
}
/*
* Flush and evict a key from the key cache:
*/
int bch2_btree_key_cache_flush(struct btree_trans *trans,
enum btree_id id, struct bpos pos)
{
struct bch_fs *c = trans->c;
struct bkey_cached_key key = { id, pos };
/* Fastpath - assume it won't be found: */
if (!btree_key_cache_find(c, id, pos))
return 0;
return btree_key_cache_flush_pos(trans, key, 0, true);
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) iter->l[0].b;
BUG_ON(insert->u64s > ck->u64s);
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
int difference;
BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);
difference = jset_u64s(insert->u64s) - ck->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
ck->res.u64s += difference;
}
}
bkey_copy(ck->k, insert);
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
mutex_lock(&c->btree_key_cache.lock);
list_del_init(&ck->list);
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
mutex_unlock(&c->btree_key_cache.lock);
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
&ck->journal, btree_key_cache_journal_flush);
return true;
}
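The pre-reservation shuffle above is what lets journal reclaim later flush this key without blocking: journal space the commit already holds is handed over to the cached entry itself. A worked illustration of the accounting, kept symbolic since the patch implies no concrete sizes:

/*
 * Illustrative accounting for one cached insert:
 *
 *   needed        = jset_u64s(insert->k.u64s)   journal space for this key
 *   ck->res.u64s  = space this cached entry already owns
 *   difference    = needed - ck->res.u64s
 *
 * If difference > 0, that many u64s move from trans->journal_preres
 * (taken at commit time) into ck->res, so the eventual writeback from
 * journal reclaim never has to take a new, possibly blocking, reservation.
 */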
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
enum btree_id id, struct bpos pos)
{
BUG_ON(btree_key_cache_find(trans->c, id, pos));
}
#endif
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
{
struct bkey_cached *ck, *n;
mutex_lock(&c->lock);
list_for_each_entry_safe(ck, n, &c->clean, list) {
kfree(ck->k);
kfree(ck);
}
list_for_each_entry_safe(ck, n, &c->freed, list)
kfree(ck);
mutex_unlock(&c->lock);
rhashtable_destroy(&c->table);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
INIT_LIST_HEAD(&c->clean);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
}

View File

@ -0,0 +1,23 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
int bch2_btree_iter_traverse_cached(struct btree_iter *);
bool bch2_btree_insert_key_cached(struct btree_trans *,
struct btree_iter *, struct bkey_i *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_key_cache_verify_clean(struct btree_trans *,
enum btree_id, struct bpos);
#else
static inline void
bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
enum btree_id id, struct bpos pos) {}
#endif
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
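Taken together with the iterator changes, intended usage looks roughly like the sketch below: updates go through an ordinary transaction with a BTREE_ITER_CACHED iterator, and bch2_btree_key_cache_flush() writes the cached key back to the real btree when the caller needs it gone. This is assembled from functions in this patch for illustration; update_one_key() and new_key are placeholders, not code from the commit.

/* Illustrative: update a key through the cache, then flush it back. */
static int update_one_key(struct btree_trans *trans, enum btree_id id,
			  struct bpos pos, struct bkey_i *new_key)
{
	struct btree_iter *iter;
	int ret;

	iter = bch2_trans_get_iter(trans, id, pos,
				   BTREE_ITER_CACHED|BTREE_ITER_INTENT);
	ret = PTR_ERR_OR_ZERO(iter);
	if (ret)
		return ret;

	bch2_trans_update(trans, iter, new_key, 0);

	ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
		bch2_btree_key_cache_flush(trans, id, pos);

	bch2_trans_iter_put(trans, iter);
	return ret;
}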

View File

@ -102,7 +102,7 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->l[level].b->lock, lock_type);
six_unlock_type(&iter->l[level].b->c.lock, lock_type);
mark_btree_node_unlocked(iter, level);
}
@ -143,14 +143,14 @@ static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
{
u64 start_time = local_clock();
six_lock_type(&b->lock, type);
six_lock_type(&b->c.lock, type, NULL, NULL);
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
}
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
if (!six_trylock_type(&b->lock, type))
if (!six_trylock_type(&b->c.lock, type))
__btree_node_lock_type(c, b, type);
}
@ -158,16 +158,16 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
* Lock a btree node if we already have it locked on one of our linked
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_iter *iter,
static inline bool btree_node_lock_increment(struct btree_trans *trans,
struct btree *b, unsigned level,
enum btree_node_locked_type want)
{
struct btree_iter *linked;
struct btree_iter *iter;
trans_for_each_iter(iter->trans, linked)
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) >= want) {
six_lock_increment(&b->lock, want);
trans_for_each_iter(trans, iter)
if (iter->l[level].b == b &&
btree_node_locked_type(iter, level) >= want) {
six_lock_increment(&b->c.lock, want);
return true;
}
@ -175,26 +175,35 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter,
}
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type);
struct btree_iter *, enum six_lock_type,
six_lock_should_sleep_fn, void *);
static inline bool btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
struct btree_iter *iter,
enum six_lock_type type)
static inline bool btree_node_lock(struct btree *b,
struct bpos pos, unsigned level,
struct btree_iter *iter,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
struct btree_trans *trans = iter->trans;
bool ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
#ifdef CONFIG_BCACHEFS_DEBUG
iter->trans->locking = b;
trans->locking = b;
trans->locking_iter_idx = iter->idx;
trans->locking_pos = pos;
trans->locking_btree_id = iter->btree_id;
trans->locking_level = level;
#endif
ret = likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(iter, b, level, type) ||
__bch2_btree_node_lock(b, pos, level, iter, type);
ret = likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
__bch2_btree_node_lock(b, pos, level, iter, type,
should_sleep_fn, p);
#ifdef CONFIG_BCACHEFS_DEBUG
iter->trans->locking = NULL;
trans->locking = NULL;
#endif
return ret;
}
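The new should_sleep_fn argument is what makes the cached-key locking possible: before blocking on someone else's lock, the six-lock code asks the callback whether waiting is still worthwhile, and (as this patch uses it) a nonzero return makes the lock call fail instead of sleeping; passing NULL keeps the old unconditional-sleep behaviour, as the NULL, NULL callers elsewhere in this diff do. A minimal sketch of such a callback, closely modelled on bkey_cached_check_fn() in btree_key_cache.c (only the function name is invented):

/* Sketch: give up waiting if the object under the lock no longer
 * matches what we were looking for. */
static int my_should_sleep_fn(struct six_lock *lock, void *p)
{
	struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
	const struct bkey_cached_key *want = p;

	return ck->key.btree_id == want->btree_id &&
	       !bkey_cmp(ck->key.pos, want->pos)
		? 0	/* still the object we want: keep waiting */
		: -1;	/* stale: abort, the lock attempt fails */
}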
@ -221,13 +230,13 @@ bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
{
struct btree_iter *linked;
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
EBUG_ON(iter->l[b->c.level].b != b);
EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
trans_for_each_iter_with_node(iter->trans, b, linked)
linked->l[b->level].lock_seq += 2;
linked->l[b->c.level].lock_seq += 2;
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
}
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
@ -236,10 +245,10 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq);
EBUG_ON(iter->l[b->c.level].b != b);
EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq);
if (unlikely(!six_trylock_write(&b->lock)))
if (unlikely(!six_trylock_write(&b->c.lock)))
__bch2_btree_node_lock_write(b, iter);
}

View File

@ -60,17 +60,20 @@ struct btree_alloc {
BKEY_PADDED(k);
};
struct btree_bkey_cached_common {
struct six_lock lock;
u8 level;
u8 btree_id;
};
struct btree {
/* Hottest entries first */
struct btree_bkey_cached_common c;
struct rhash_head hash;
u64 hash_val;
struct six_lock lock;
unsigned long flags;
u16 written;
u8 level;
u8 btree_id;
u8 nsets;
u8 nr_key_bits;
@ -180,6 +183,7 @@ struct btree_node_iter {
enum btree_iter_type {
BTREE_ITER_KEYS,
BTREE_ITER_NODES,
BTREE_ITER_CACHED,
};
#define BTREE_ITER_TYPE ((1 << 2) - 1)
@ -211,6 +215,15 @@ enum btree_iter_type {
#define BTREE_ITER_IS_EXTENTS (1 << 6)
#define BTREE_ITER_ERROR (1 << 7)
#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
#define BTREE_ITER_CACHED_NOFILL (1 << 9)
#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
#define BTREE_ITER_USER_FLAGS \
(BTREE_ITER_SLOTS \
|BTREE_ITER_INTENT \
|BTREE_ITER_PREFETCH \
|BTREE_ITER_CACHED_NOFILL \
|BTREE_ITER_CACHED_NOCREATE)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@ -219,6 +232,14 @@ enum btree_iter_uptodate {
BTREE_ITER_NEED_TRAVERSE = 3,
};
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
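These sentinels record why a level has no node by storing a small non-NULL value in the pointer slot itself. A hedged sketch of how code can tell a sentinel from a real node; the helper name is illustrative, not necessarily what the tree uses:

/* Illustrative: values 1..7 in iter->l[level].b are sentinels; anything
 * larger is assumed to be a real struct btree pointer. */
static inline bool iter_level_is_real_node(const struct btree_iter *iter,
					   unsigned level)
{
	return (unsigned long) iter->l[level].b >
	       (unsigned long) BTREE_ITER_NO_NODE_ERROR;
}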
/*
* @pos - iterator's current position
* @level - current btree depth
@ -256,7 +277,8 @@ struct btree_iter {
unsigned long ip_allocated;
};
static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
static inline enum btree_iter_type
btree_iter_type(const struct btree_iter *iter)
{
return iter->flags & BTREE_ITER_TYPE;
}
@ -266,6 +288,37 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
return iter->l + iter->level;
}
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
struct list_head freed;
struct list_head clean;
};
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
} __packed;
#define BKEY_CACHED_DIRTY 0
struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
u8 u64s;
bool valid;
struct bkey_cached_key key;
struct rhash_head hash;
struct list_head list;
struct journal_preres res;
struct journal_entry_pin journal;
struct bkey_i *k;
};
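The struct starts with the same btree_bkey_cached_common header as struct btree, which is the trick that lets a cached iterator park one of these in iter->l[0].b: generic lock/unlock helpers only ever touch ->c.lock and ->c.level, while code that knows the iterator is cached casts the pointer back, as this patch does throughout. For example:

/* Pattern used with BTREE_ITER_CACHED iterators in this patch: */
struct bkey_cached *ck = (void *) iter->l[0].b;

six_lock_read(&ck->c.lock, NULL, NULL);	/* same lock the generic helpers use */
/* ... read ck->k ... */
six_unlock_read(&ck->c.lock);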
struct btree_insert_entry {
unsigned trigger_flags;
unsigned trans_triggers_run:1;
@ -284,6 +337,10 @@ struct btree_trans {
#ifdef CONFIG_BCACHEFS_DEBUG
struct list_head list;
struct btree *locking;
unsigned locking_iter_idx;
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
pid_t pid;
#endif
unsigned long ip;
@ -300,6 +357,7 @@ struct btree_trans {
unsigned error:1;
unsigned nounlock:1;
unsigned need_reset:1;
unsigned in_traverse_all:1;
unsigned mem_top;
unsigned mem_bytes;
@ -491,7 +549,7 @@ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_
/* Type of keys @b contains: */
static inline enum btree_node_type btree_node_type(struct btree *b)
{
return __btree_node_type(b->level, b->btree_id);
return __btree_node_type(b->c.level, b->c.btree_id);
}
static inline bool btree_node_type_is_extents(enum btree_node_type type)

View File

@ -23,6 +23,7 @@ enum btree_insert_flags {
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
@ -47,8 +48,12 @@ enum btree_insert_flags {
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
/* Indicates that we have pre-reserved space in the journal: */
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)

View File

@ -35,7 +35,7 @@ static void btree_node_interior_verify(struct btree *b)
struct bkey_s_c_btree_ptr_v2 bp;
struct bkey unpacked;
BUG_ON(!b->level);
BUG_ON(!b->c.level);
bch2_btree_node_iter_init_from_start(&iter, b);
@ -135,6 +135,8 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
bch2_btree_node_hash_remove(&c->btree_cache, b);
six_lock_wakeup_all(&b->c.lock);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
@ -150,7 +152,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
bch2_open_buckets_put(c, &ob);
}
@ -161,12 +163,12 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
struct btree_iter *linked;
trans_for_each_iter(iter->trans, linked)
BUG_ON(linked->l[b->level].b == b);
BUG_ON(linked->l[b->c.level].b == b);
six_lock_write(&b->lock);
six_lock_write(&b->c.lock, NULL, NULL);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -265,8 +267,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
b->level = level;
b->btree_id = as->btree_id;
b->c.level = level;
b->c.btree_id = as->btree_id;
memset(&b->nr, 0, sizeof(b->nr));
b->data->magic = cpu_to_le64(bset_magic(c));
@ -319,7 +321,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
{
struct btree *n;
n = bch2_btree_node_alloc(as, b->level);
n = bch2_btree_node_alloc(as, b->c.level);
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
@ -364,7 +366,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
bch2_btree_build_aux_trees(b);
bch2_btree_update_add_new_node(as, b);
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
return b;
}
@ -378,7 +380,7 @@ static void bch2_btree_reserve_put(struct btree_update *as)
while (as->nr_prealloc_nodes) {
struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
if (c->btree_reserve_cache_nr <
ARRAY_SIZE(c->btree_reserve_cache)) {
@ -394,9 +396,9 @@ static void bch2_btree_reserve_put(struct btree_update *as)
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->lock);
six_unlock_intent(&b->c.lock);
}
mutex_unlock(&c->btree_reserve_cache_lock);
@ -527,9 +529,20 @@ static void btree_update_nodes_written(struct btree_update *as)
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
*/
/*
* We can't call into journal reclaim here: we'd block on the journal
* reclaim lock, but we may need to release the open buckets we have
* pinned in order for other btree updates to make forward progress, and
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_JOURNAL_RESERVED,
btree_update_nodes_written_trans(&trans, as));
BUG_ON(ret && !bch2_journal_error(&c->journal));
@ -556,7 +569,7 @@ static void btree_update_nodes_written(struct btree_update *as)
if (!ret && as->b == b) {
struct bset *i = btree_bset_last(b);
BUG_ON(!b->level);
BUG_ON(!b->c.level);
BUG_ON(!btree_node_dirty(b));
i->journal_seq = cpu_to_le64(
@ -567,10 +580,10 @@ static void btree_update_nodes_written(struct btree_update *as)
}
mutex_unlock(&c->btree_interior_update_lock);
six_unlock_write(&b->lock);
six_unlock_write(&b->c.lock);
btree_node_write_if_need(c, b, SIX_LOCK_intent);
six_unlock_intent(&b->lock);
six_unlock_intent(&b->c.lock);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
@ -591,7 +604,7 @@ static void btree_update_nodes_written(struct btree_update *as)
btree_node_lock_type(c, b, SIX_LOCK_read);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
six_unlock_read(&b->c.lock);
}
for (i = 0; i < as->nr_open_buckets; i++)
@ -690,7 +703,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
as->journal_u64s +=
journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
BCH_JSET_ENTRY_btree_root,
b->btree_id, b->level,
b->c.btree_id, b->c.level,
insert, insert->k.u64s);
mutex_lock(&c->btree_interior_update_lock);
@ -801,7 +814,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* operations complete
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
list_del(&p->write_blocked_list);
list_del_init(&p->write_blocked_list);
btree_update_reparent(as, p);
/*
@ -862,8 +875,11 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
{
struct bch_fs *c = trans->c;
struct btree_update *as;
int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
? JOURNAL_RES_GET_RECLAIM : 0;
int ret = 0;
/*
* This check isn't necessary for correctness - it's just to potentially
@ -888,7 +904,7 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
JOURNAL_RES_GET_NONBLOCK);
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret == -EAGAIN) {
if (flags & BTREE_INSERT_NOUNLOCK)
return ERR_PTR(-EINTR);
@ -896,7 +912,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
bch2_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES, 0);
BTREE_UPDATE_JOURNAL_RES,
journal_flags);
if (ret)
return ERR_PTR(ret);
@ -938,7 +955,7 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->level < btree_node_root(c, b)->level ||
(b->c.level < btree_node_root(c, b)->c.level ||
!btree_node_dying(btree_node_root(c, b))));
btree_node_root(c, b) = b;
@ -1006,7 +1023,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
as->journal_u64s +=
journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
BCH_JSET_ENTRY_btree_keys,
b->btree_id, b->level,
b->c.btree_id, b->c.level,
insert, insert->k.u64s);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
@ -1031,7 +1048,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
struct bset *set1, *set2;
struct bkey_packed *k, *prev = NULL;
n2 = bch2_btree_node_alloc(as, n1->level);
n2 = bch2_btree_node_alloc(as, n1->c.level);
bch2_btree_update_add_new_node(as, n2);
n2->data->max_key = n1->data->max_key;
@ -1100,7 +1117,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n1);
bch2_verify_btree_nr_keys(n2);
if (n1->level) {
if (n1->c.level) {
btree_node_interior_verify(n1);
btree_node_interior_verify(n2);
}
@ -1174,7 +1191,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
u64 start_time = local_clock();
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
bch2_btree_interior_update_will_free_node(as, b);
@ -1191,8 +1208,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n2->lock);
six_unlock_write(&n1->lock);
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
bch2_btree_node_write(c, n2, SIX_LOCK_intent);
@ -1206,7 +1223,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
if (!parent) {
/* Depth increases, make a new root */
n3 = __btree_root_alloc(as, b->level + 1);
n3 = __btree_root_alloc(as, b->c.level + 1);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@ -1219,7 +1236,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
trace_btree_compact(c, b);
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->lock);
six_unlock_write(&n1->c.lock);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
@ -1247,7 +1264,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
/* Successful split, update the iterator to point to the new nodes: */
six_lock_increment(&b->lock, SIX_LOCK_intent);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
if (n3)
bch2_btree_iter_node_replace(iter, n3);
@ -1264,10 +1281,10 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_node_free_inmem(c, b, iter);
if (n3)
six_unlock_intent(&n3->lock);
six_unlock_intent(&n3->c.lock);
if (n2)
six_unlock_intent(&n2->lock);
six_unlock_intent(&n1->lock);
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
bch2_btree_trans_verify_locks(iter->trans);
@ -1285,7 +1302,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
struct bkey_packed *k;
/* Don't screw up @iter's position: */
node_iter = iter->l[b->level].iter;
node_iter = iter->l[b->c.level].iter;
/*
* btree_split(), btree_gc_coalesce() will insert keys before
@ -1302,7 +1319,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
btree_update_updated_node(as, b);
trans_for_each_iter_with_node(iter->trans, b, linked)
bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
bch2_btree_trans_verify_iters(iter->trans, b);
}
@ -1328,8 +1345,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
BUG_ON(!b->level);
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
@ -1366,7 +1383,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
* the btree iterator yet, so the merge path's unlock/wait/relock dance
* won't work:
*/
bch2_foreground_maybe_merge(c, iter, b->level,
bch2_foreground_maybe_merge(c, iter, b->c.level,
flags|BTREE_INSERT_NOUNLOCK);
return;
split:
@ -1518,7 +1535,7 @@ retry:
b->sib_u64s[sib] = sib_u64s;
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
six_unlock_intent(&m->lock);
six_unlock_intent(&m->c.lock);
goto out;
}
@ -1548,7 +1565,7 @@ retry:
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
n = bch2_btree_node_alloc(as, b->level);
n = bch2_btree_node_alloc(as, b->c.level);
bch2_btree_update_add_new_node(as, n);
btree_set_min(n, prev->data->min_key);
@ -1561,7 +1578,7 @@ retry:
bch2_btree_sort_into(c, n, next);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
six_unlock_write(&n->c.lock);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
@ -1574,7 +1591,7 @@ retry:
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->lock, SIX_LOCK_intent);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_drop(iter, m);
@ -1585,7 +1602,7 @@ retry:
bch2_btree_node_free_inmem(c, b, iter);
bch2_btree_node_free_inmem(c, m, iter);
six_unlock_intent(&n->lock);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
@ -1607,7 +1624,7 @@ out:
return;
err_cycle_gc_lock:
six_unlock_intent(&m->lock);
six_unlock_intent(&m->c.lock);
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
@ -1620,7 +1637,7 @@ err_cycle_gc_lock:
goto err;
err_unlock:
six_unlock_intent(&m->lock);
six_unlock_intent(&m->c.lock);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
up_read(&c->gc_lock);
err:
@ -1663,7 +1680,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_update_add_new_node(as, n);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
six_unlock_write(&n->c.lock);
trace_btree_gc_rewrite_node(c, b);
@ -1678,11 +1695,11 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->lock, SIX_LOCK_intent);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_replace(iter, n);
bch2_btree_node_free_inmem(c, b, iter);
six_unlock_intent(&n->lock);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
return 0;
@ -1759,7 +1776,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
if (new_hash) {
bkey_copy(&new_hash->key, new_key);
ret = bch2_btree_node_hash_insert(&c->btree_cache,
new_hash, b->level, b->btree_id);
new_hash, b->c.level, b->c.btree_id);
BUG_ON(ret);
}
@ -1885,8 +1902,8 @@ err:
list_move(&new_hash->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
six_unlock_write(&new_hash->c.lock);
six_unlock_intent(&new_hash->c.lock);
}
up_read(&c->gc_lock);
closure_sync(&cl);
@ -1926,8 +1943,8 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
b->level = 0;
b->btree_id = id;
b->c.level = 0;
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
b->key.k.p = POS_MAX;
@ -1942,13 +1959,14 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
b->data->format = bch2_btree_calc_format(b);
btree_node_set_format(b, b->data->format);
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
b->c.level, b->c.btree_id);
BUG_ON(ret);
bch2_btree_set_root_inmem(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)

View File

@ -92,9 +92,9 @@ struct btree_update {
struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_new_nodes;
u8 open_buckets[BTREE_UPDATE_NODES_MAX *
open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
BCH_REPLICAS_MAX];
u8 nr_open_buckets;
open_bucket_idx_t nr_open_buckets;
unsigned journal_u64s;
u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
@ -173,7 +173,7 @@ void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
{
unsigned depth = btree_node_root(c, b)->level + 1;
unsigned depth = btree_node_root(c, b)->c.level + 1;
/*
* Number of nodes we might have to allocate in a worst case btree
@ -181,9 +181,9 @@ static inline unsigned btree_update_reserve_required(struct bch_fs *c,
* a new root, unless we're already at max depth:
*/
if (depth < BTREE_MAX_DEPTH)
return (depth - b->level) * 2 + 1;
return (depth - b->c.level) * 2 + 1;
else
return (depth - b->level) * 2 - 1;
return (depth - b->c.level) * 2 - 1;
}
static inline void btree_node_reset_sib_u64s(struct btree *b)

View File

@ -6,6 +6,7 @@
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "buckets.h"
#include "debug.h"
@ -32,6 +33,9 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
{
bch2_btree_node_lock_write(b, iter);
if (btree_iter_type(iter) == BTREE_ITER_CACHED)
return;
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_btree_iter_reinit_node(iter, b);
@ -135,7 +139,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w && w->journal.seq == seq));
six_unlock_read(&b->lock);
six_unlock_read(&b->c.lock);
}
static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
@ -159,71 +163,35 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
: btree_node_flush1);
}
static inline void __btree_journal_key(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_i *insert)
{
struct journal *j = &trans->c->journal;
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
btree_id, insert);
insert->k.needs_whiteout = needs_whiteout;
bch2_journal_set_has_inode(j, &trans->journal_res,
insert->k.p.inode);
if (trans->journal_seq)
*trans->journal_seq = seq;
}
static void bch2_btree_journal_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter_l(iter)->b;
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
__btree_journal_key(trans, iter->btree_id, insert);
btree_bset_last(b)->journal_seq =
cpu_to_le64(trans->journal_res.seq);
}
bch2_btree_add_journal_pin(c, b,
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
}
/**
* btree_insert_key - insert one key into a leaf node
*/
static void btree_insert_key_leaf(struct btree_trans *trans,
static bool btree_insert_key_leaf(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct btree *b = iter_l(iter)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
insert->k.needs_whiteout = false;
EBUG_ON(!iter->level &&
!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
if (unlikely(!bch2_btree_bset_insert_key(iter, b,
&iter_l(iter)->iter, insert)))
return false;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@ -238,8 +206,11 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_iter_reinit_node(iter, b);
trace_btree_insert_key(c, b, insert);
return true;
}
/* Cached btree updates: */
/* Normal update interface: */
static inline void btree_insert_entry_checks(struct btree_trans *trans,
@ -322,11 +293,60 @@ btree_key_can_insert(struct btree_trans *trans,
return BTREE_INSERT_OK;
}
static enum btree_insert_ret
btree_key_can_insert_cached(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert,
unsigned *u64s)
{
struct bkey_cached *ck = (void *) iter->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
BUG_ON(iter->level);
if (*u64s <= ck->u64s)
return BTREE_INSERT_OK;
new_u64s = roundup_pow_of_two(*u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k)
return -ENOMEM;
ck->u64s = new_u64s;
ck->k = new_k;
return BTREE_INSERT_OK;
}
static inline void do_btree_insert_one(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
btree_insert_key_leaf(trans, iter, insert);
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
bool did_work;
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
insert->k.needs_whiteout = false;
did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED)
? btree_insert_key_leaf(trans, iter, insert)
: bch2_btree_insert_key_cached(trans, iter, insert);
if (!did_work)
return;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
bch2_journal_add_keys(j, &trans->journal_res,
iter->btree_id, insert);
bch2_journal_set_has_inode(j, &trans->journal_res,
insert->k.p.inode);
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
}
static inline bool iter_has_trans_triggers(struct btree_iter *iter)
@ -351,10 +371,16 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
*/
BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
bch2_mark_update(trans, i->iter, i->k, NULL,
i->trigger_flags|BTREE_TRIGGER_GC);
}
}
static inline int
@ -387,7 +413,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
u64s = 0;
u64s += i->k->k.u64s;
ret = btree_key_can_insert(trans, i->iter, i->k, &u64s);
ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
? btree_key_can_insert(trans, i->iter, i->k, &u64s)
: btree_key_can_insert_cached(trans, i->iter, i->k, &u64s);
if (ret) {
*stopped_at = i;
return ret;
@ -411,6 +439,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
JOURNAL_RES_GET_NONBLOCK);
if (ret)
goto err;
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
if (unlikely(trans->extra_journal_entry_u64s)) {
@ -481,7 +511,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK);
JOURNAL_RES_GET_NONBLOCK|
((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
? JOURNAL_RES_GET_RECLAIM : 0));
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
trans->journal_preres_u64s);
@ -495,7 +527,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
* or anything else that might call bch2_trans_relock(), since that
* would just retake the read locks:
*/
trans_for_each_iter_all(trans, iter) {
trans_for_each_iter(trans, iter) {
if (iter->nodes_locked != iter->nodes_intent_locked) {
EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
EBUG_ON(trans->iters_live & (1ULL << iter->idx));
@ -537,14 +569,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
trans->nounlock = true;
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
trans->nounlock = false;
trans_for_each_update2(trans, i)
bch2_btree_iter_downgrade(i->iter);
bch2_trans_downgrade(trans);
return 0;
}
@ -823,6 +855,14 @@ int __bch2_trans_commit(struct btree_trans *trans)
return ret;
}
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!(i->trigger_flags & BTREE_TRIGGER_NORUN))
bch2_btree_key_cache_verify_clean(trans,
i->iter->btree_id, i->iter->pos);
#endif
/*
* Running triggers will append more updates to the list of updates as
* we're walking it:
@ -831,9 +871,9 @@ int __bch2_trans_commit(struct btree_trans *trans)
trans_trigger_run = false;
trans_for_each_update(trans, i) {
if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) {
if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK &&
(ret = bch2_btree_iter_traverse(i->iter)))) {
trace_trans_restart_traverse(trans->ip);
ret = -EINTR;
goto out;
}
@ -895,7 +935,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
BUG_ON(i->iter->locks_want < 1);
u64s = jset_u64s(i->k->k.u64s);
if (0)
if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
trans->journal_u64s += u64s;
}

View File

@ -1455,13 +1455,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
/* trans_mark: */
static int trans_get_key(struct btree_trans *trans,
enum btree_id btree_id, struct bpos pos,
struct btree_iter **iter,
struct bkey_s_c *k)
static struct btree_iter *trans_get_update(struct btree_trans *trans,
enum btree_id btree_id, struct bpos pos,
struct bkey_s_c *k)
{
struct btree_insert_entry *i;
int ret;
trans_for_each_update(trans, i)
if (i->iter->btree_id == btree_id &&
@ -1469,17 +1467,33 @@ static int trans_get_key(struct btree_trans *trans,
? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
bkey_cmp(pos, i->k->k.p) < 0
: !bkey_cmp(pos, i->iter->pos))) {
*iter = i->iter;
*k = bkey_i_to_s_c(i->k);
return 1;
*k = bkey_i_to_s_c(i->k);
return i->iter;
}
return NULL;
}
static int trans_get_key(struct btree_trans *trans,
enum btree_id btree_id, struct bpos pos,
struct btree_iter **iter,
struct bkey_s_c *k)
{
unsigned flags = btree_id != BTREE_ID_ALLOC
? BTREE_ITER_SLOTS
: BTREE_ITER_CACHED;
int ret;
*iter = trans_get_update(trans, btree_id, pos, k);
if (*iter)
return 1;
*iter = bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
flags|BTREE_ITER_INTENT);
if (IS_ERR(*iter))
return PTR_ERR(*iter);
*k = bch2_btree_iter_peek_slot(*iter);
*k = __bch2_btree_iter_peek(*iter, flags);
ret = bkey_err(*k);
if (ret)
bch2_trans_iter_put(trans, *iter);
@ -1492,36 +1506,33 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr));
struct btree_iter *iter;
struct bkey_s_c k_a;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bucket *g;
int ret;
ret = trans_get_key(trans, BTREE_ID_ALLOC,
POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)),
&iter, &k_a);
if (ret < 0)
return ret;
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a);
if (iter) {
u = bch2_alloc_unpack(k_a);
} else {
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
if (k_a.k->type != KEY_TYPE_alloc ||
(!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) {
/*
* During journal replay, and if gc repairs alloc info at
* runtime, the alloc info in the btree might not be up to date
* yet - so, trust the in memory mark - unless we're already
* updating that key:
*/
struct bucket *g;
struct bucket_mark m;
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(g, m);
g = bucket(ca, pos.offset);
u = alloc_mem_to_key(g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
} else {
u = bch2_alloc_unpack(k_a);
}
ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type,
@ -1535,7 +1546,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
goto out;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
a->k.p = pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
out:
@ -1808,6 +1819,13 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
return 0;
if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
struct bkey_cached *ck = (void *) iter->l[0].b;
return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k),
0, 0, BTREE_TRIGGER_OVERWRITE);
}
while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k;
@ -1975,6 +1993,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
int ret = -ENOMEM;
unsigned i;
lockdep_assert_held(&c->state_lock);
memset(&free, 0, sizeof(free));
memset(&free_inc, 0, sizeof(free_inc));
memset(&alloc_heap, 0, sizeof(alloc_heap));
@ -2001,7 +2021,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bch2_copygc_stop(ca);
if (resize) {
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
percpu_down_write(&c->mark_lock);
}
@ -2044,10 +2063,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
nbuckets = ca->mi.nbuckets;
if (resize) {
if (resize)
up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
if (start_copygc &&
bch2_copygc_start(c, ca))

View File

@ -39,6 +39,7 @@ struct bucket {
u16 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
};

View File

@ -52,8 +52,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
bkey_copy(&v->key, &b->key);
v->written = 0;
v->level = b->level;
v->btree_id = b->btree_id;
v->c.level = b->c.level;
v->c.btree_id = b->c.btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);
if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),

View File

@ -37,7 +37,7 @@ void bch2_io_error_work(struct work_struct *work)
struct bch_fs *c = ca->fs;
bool dev;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
BCH_FORCE_IF_DEGRADED);
if (dev
@ -47,7 +47,7 @@ void bch2_io_error_work(struct work_struct *work)
bch_err(ca,
"too many IO errors, setting %s RO",
dev ? "device" : "filesystem");
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca)

View File

@ -1315,16 +1315,16 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons
if (IS_ERR(c))
return c;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
if (!test_bit(BCH_FS_STARTED, &c->flags)) {
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
closure_put(&c->cl);
pr_err("err mounting %s: incomplete filesystem", dev_name);
return ERR_PTR(-EINVAL);
}
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
return c;
@ -1373,7 +1373,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
return ret;
if (opts.read_only != c->opts.read_only) {
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
if (opts.read_only) {
bch2_fs_read_only(c);
@ -1383,7 +1383,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
ret = bch2_fs_read_write(c);
if (ret) {
bch_err(c, "error going rw: %i", ret);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return -EINVAL;
}
@ -1392,7 +1392,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
c->opts.read_only = opts.read_only;
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
}
if (opts.errors >= 0)

View File

@ -500,6 +500,9 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
up(&c->io_in_flight);
if (op->end_io) {
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
@ -1258,6 +1261,12 @@ void bch2_write(struct closure *cl)
goto err;
}
/*
* Can't ratelimit copygc - we'd deadlock:
*/
if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
down(&c->io_in_flight);
bch2_increment_clock(c, bio_sectors(bio), WRITE);
data_len = min_t(u64, bio->bi_iter.bi_size,

View File

@ -428,9 +428,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
static bool journal_preres_available(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
unsigned new_u64s,
unsigned flags)
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s);
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
@ -440,13 +441,14 @@ static bool journal_preres_available(struct journal *j,
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
unsigned new_u64s,
unsigned flags)
{
int ret;
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s));
journal_preres_available(j, res, new_u64s, flags));
return ret;
}
@ -985,9 +987,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
u64 last_seq = cur_seq, nr, seq;
if (!list_empty(journal_entries))
last_seq = le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
last_seq = le64_to_cpu(list_last_entry(journal_entries,
struct journal_replay, list)->j.last_seq);
nr = cur_seq - last_seq;
@ -1016,8 +1017,10 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
list_for_each_entry(i, journal_entries, list) {
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
BUG_ON(seq < last_seq || seq >= cur_seq);
if (seq < last_seq)
continue;
journal_seq_pin(j, seq)->devs = i->devs;
}

View File

@ -299,6 +299,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
#define JOURNAL_RES_GET_RESERVED (1 << 2)
#define JOURNAL_RES_GET_RECLAIM (1 << 3)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
@ -406,11 +407,12 @@ static inline void bch2_journal_preres_put(struct journal *j,
}
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned);
struct journal_preres *, unsigned, unsigned);
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
unsigned new_u64s,
unsigned flags)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
@ -421,7 +423,15 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
new.reserved += d;
if (new.reserved > new.remaining)
/*
* If we're being called from the journal reclaim path, we have
* to unconditionally give out the pre-reservation, there's
* nothing else sensible we can do - otherwise we'd recurse back
* into the reclaim path and deadlock:
*/
if (!(flags & JOURNAL_RES_GET_RECLAIM) &&
new.reserved > new.remaining)
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
@ -438,13 +448,13 @@ static inline int bch2_journal_preres_get(struct journal *j,
if (new_u64s <= res->u64s)
return 0;
if (bch2_journal_preres_get_fast(j, res, new_u64s))
if (bch2_journal_preres_get_fast(j, res, new_u64s, flags))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
return -EAGAIN;
return __bch2_journal_preres_get(j, res, new_u64s);
return __bch2_journal_preres_get(j, res, new_u64s, flags);
}
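The new JOURNAL_RES_GET_RECLAIM flag exists because journal reclaim now issues btree updates of its own (flushing the btree key cache back to the btree), and those updates must never be refused a pre-reservation, or reclaim would recurse into itself. A hedged sketch of a caller on the reclaim path; journal_reclaim_update() is hypothetical, not a function from this commit:

/* Hedged sketch; journal_reclaim_update() is hypothetical. Updates issued
 * from the reclaim path pass JOURNAL_RES_GET_RECLAIM so the fast path
 * cannot refuse them and send us back into reclaim. */
#include "journal.h"

static int journal_reclaim_update(struct journal *j,
				  struct journal_preres *res,
				  unsigned u64s)
{
	int ret = bch2_journal_preres_get(j, res, u64s,
					  JOURNAL_RES_GET_RECLAIM);
	if (ret)
		return ret;

	/* ... do the btree update that frees up journal space ... */

	bch2_journal_preres_put(j, res);
	return 0;
}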
/* journal_entry_res: */


@ -41,19 +41,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
list)->j.last_seq
: 0;
/* Is this entry older than the range we need? */
if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
goto out;
}
if (!c->opts.read_entire_journal) {
/* Is this entry older than the range we need? */
if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
goto out;
}
/* Drop entries we don't need anymore */
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
/* Drop entries we don't need anymore */
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
}
}
list_for_each_entry_reverse(i, jlist->head, list) {


@ -28,17 +28,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja,
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned available = (journal_space_from(ja, from) -
ja->cur_idx - 1 + ja->nr) % ja->nr;
/*
* Allocator startup needs some journal space before we can do journal
* replay:
*/
if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
--available;
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
@ -354,6 +346,37 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq,
journal_wake(j);
}
void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
if (journal_pin_active(pin) && pin->seq < seq)
return;
spin_lock(&j->lock);
if (pin->seq != seq) {
bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
} else {
struct journal_entry_pin_list *pin_list =
journal_seq_pin(j, seq);
/*
* If the pin is already pinning the right sequence number, it
* still might've already been flushed:
*/
list_move(&pin->list, &pin_list->list);
}
spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
}
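bch2_journal_pin_update() (re)points a pin at a journal sequence number but never moves an already-active pin forward to a newer sequence, so calling it on every update keeps the pin at the oldest entry the caller still depends on. A minimal usage sketch; my_note_dirty() is illustrative, and my_flush is assumed to be defined elsewhere with this tree's journal_pin_flush_fn signature:

/* Hedged sketch; my_flush is assumed to exist elsewhere with the tree's
 * journal_pin_flush_fn signature. */
#include "journal_reclaim.h"

extern journal_pin_flush_fn my_flush;

static void my_note_dirty(struct journal *j, struct journal_entry_pin *pin,
			  u64 seq)
{
	/*
	 * A no-op when the pin already holds an older (smaller) sequence
	 * number, so repeated calls leave the pin at the oldest seq that
	 * still needs my_flush().
	 */
	bch2_journal_pin_update(j, seq, pin, my_flush);
}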
void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *dst,
struct journal_entry_pin *src,
@ -393,6 +416,9 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
return NULL;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)


@ -42,6 +42,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
__bch2_journal_pin_add(j, seq, pin, flush_fn);
}
void bch2_journal_pin_update(struct journal *, u64,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_copy(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,


@ -125,6 +125,7 @@ union journal_preres_state {
enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,


@ -178,9 +178,12 @@ next:
}
continue;
nomatch:
if (m->ctxt)
if (m->ctxt) {
BUG_ON(k.k->p.offset <= iter->pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
atomic64_add(k.k->p.offset - iter->pos.offset,
&m->ctxt->stats->sectors_raced);
}
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
bch2_btree_iter_next_slot(iter);


@ -8,6 +8,7 @@ struct bch_move_stats {
struct bpos pos;
atomic64_t keys_moved;
atomic64_t keys_raced;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_raced;


@ -78,7 +78,17 @@ static bool __copygc_pred(struct bch_dev *ca,
ssize_t i = eytzinger0_find_le(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, &search);
#if 0
/* eytzinger search verify code: */
ssize_t j = -1, k;
for (k = 0; k < h->used; k++)
if (h->data[k].offset <= ptr->offset &&
(j < 0 || h->data[k].offset > h->data[j].offset))
j = k;
BUG_ON(i != j);
#endif
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
ptr->gen == h->data[i].gen);
@ -203,9 +213,12 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
if (sectors_not_moved && !ret)
bch_warn_ratelimited(c,
"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);
buckets_not_moved, buckets_to_move,
atomic64_read(&move_stats.sectors_moved),
atomic64_read(&move_stats.keys_raced),
atomic64_read(&move_stats.sectors_raced));
trace_copygc(ca,
atomic64_read(&move_stats.sectors_moved), sectors_not_moved,


@ -265,6 +265,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don't free journal entries/keys after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Read all journal entries, not just dirty ones")\
x(noexcl, u8, \
OPT_MOUNT, \
OPT_BOOL(), \


@ -188,7 +188,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i
iter->b = b;
bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
bch2_journal_iter_init(&iter->journal, journal_keys,
b->btree_id, b->level, b->data->min_key);
b->c.btree_id, b->c.level, b->data->min_key);
}
/* Walk btree, overlaying keys from the journal: */
@ -206,11 +206,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
ret = key_fn(c, btree_id, b->level, k);
ret = key_fn(c, btree_id, b->c.level, k);
if (ret)
break;
if (b->level) {
if (b->c.level) {
struct btree *child;
BKEY_PADDED(k) tmp;
@ -219,9 +219,9 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
bch2_btree_and_journal_iter_advance(&iter);
if (b->level > 0) {
if (b->c.level > 0) {
child = bch2_btree_node_get_noiter(c, &tmp.k,
b->btree_id, b->level - 1);
b->c.btree_id, b->c.level - 1);
ret = PTR_ERR_OR_ZERO(child);
if (ret)
break;
@ -229,7 +229,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
ret = (node_fn ? node_fn(c, b) : 0) ?:
bch2_btree_and_journal_walk_recurse(c, child,
journal_keys, btree_id, node_fn, key_fn);
six_unlock_read(&child->lock);
six_unlock_read(&child->c.lock);
if (ret)
break;
@ -253,12 +253,12 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_k
if (btree_node_fake(b))
return 0;
six_lock_read(&b->lock);
six_lock_read(&b->c.lock, NULL, NULL);
ret = (node_fn ? node_fn(c, b) : 0) ?:
bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id,
node_fn, key_fn) ?:
key_fn(c, btree_id, b->level + 1, bkey_i_to_s_c(&b->key));
six_unlock_read(&b->lock);
key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key));
six_unlock_read(&b->c.lock);
return ret;
}
@ -292,17 +292,6 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
cmp_int(l->journal_offset, r->journal_offset);
}
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(r->level, l->level) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->k->k.p, r->k->k.p);
}
void bch2_journal_keys_free(struct journal_keys *keys)
{
kvfree(keys->d);
@ -319,20 +308,30 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
struct journal_key *src, *dst;
size_t nr_keys = 0;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
if (list_empty(journal_entries))
return keys;
keys.journal_seq_base =
le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
le64_to_cpu(list_last_entry(journal_entries,
struct journal_replay, list)->j.last_seq);
list_for_each_entry(p, journal_entries, list) {
if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
continue;
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
}
keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
if (!keys.d)
goto err;
list_for_each_entry(p, journal_entries, list)
list_for_each_entry(p, journal_entries, list) {
if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
continue;
for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
@ -342,6 +341,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
};
}
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
@ -507,11 +507,48 @@ static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
__bch2_journal_replay_key(&trans, id, level, k));
}
static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
{
struct btree_iter *iter;
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(iter) ?:
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
return ret;
}
static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
return bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY,
__bch2_alloc_replay_key(&trans, k));
}
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(r->level, l->level) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->k->k.p, r->k->k.p);
}
static int bch2_journal_replay(struct bch_fs *c,
struct journal_keys keys)
{
struct journal *j = &c->journal;
struct journal_key *i;
u64 seq;
int ret;
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
@ -519,26 +556,63 @@ static int bch2_journal_replay(struct bch_fs *c,
if (keys.nr)
replay_now_at(j, keys.journal_seq_base);
seq = j->replay_journal_seq;
/*
* First replay updates to the alloc btree - these will only update the
* btree key cache:
*/
for_each_journal_key(keys, i) {
if (!i->level)
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
if (i->level)
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
ret);
return ret;
}
cond_resched();
if (!i->level && i->btree_id == BTREE_ID_ALLOC) {
j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
ret = bch2_alloc_replay_key(c, i->k);
if (ret)
goto err;
}
}
/*
* Next replay updates to interior btree nodes:
*/
for_each_journal_key(keys, i) {
cond_resched();
if (i->level) {
j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (ret)
goto err;
}
}
/*
* Now that the btree is in a consistent state, we can start journal
* reclaim (which will be flushing entries from the btree key cache back
* to the btree):
*/
set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
j->replay_journal_seq = seq;
/*
* Now replay leaf node updates:
*/
for_each_journal_key(keys, i) {
cond_resched();
if (i->level || i->btree_id == BTREE_ID_ALLOC)
continue;
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
ret = i->k->k.size
? bch2_extent_replay_key(c, i->btree_id, i->k)
: bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (ret)
goto err;
}
replay_now_at(j, j->replay_journal_seq_end);
@ -547,6 +621,9 @@ static int bch2_journal_replay(struct bch_fs *c,
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
return bch2_journal_error(j);
err:
bch_err(c, "journal replay: error %d while replaying key", ret);
return ret;
}
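Journal replay is now done in three passes: leaf-level ALLOC keys first (these only touch the new btree key cache), then interior-node updates, and only then, after JOURNAL_RECLAIM_STARTED and BCH_FS_BTREE_INTERIOR_REPLAY_DONE are set, the remaining leaf keys. A hedged restatement of that ordering as a phase function over journal keys (illustration only, not code from the commit):

/* Hedged illustration of the replay ordering implemented above. */
#include "recovery.h"	/* assumed location of struct journal_key */

enum replay_phase {
	REPLAY_ALLOC,		/* leaf ALLOC keys, via the btree key cache */
	REPLAY_INTERIOR,	/* interior btree node updates */
	REPLAY_LEAF,		/* everything else, once reclaim is started */
};

static enum replay_phase replay_phase(const struct journal_key *k)
{
	if (!k->level && k->btree_id == BTREE_ID_ALLOC)
		return REPLAY_ALLOC;
	if (k->level)
		return REPLAY_INTERIOR;
	return REPLAY_LEAF;
}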
static bool journal_empty(struct list_head *journal)
@ -568,6 +645,9 @@ verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
int ret = 0;
list_for_each_entry(i, journal, list) {
if (le64_to_cpu(i->j.seq) < start_seq)
continue;
fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
seq, le64_to_cpu(i->j.seq) - 1,
@ -1169,6 +1249,9 @@ int bch2_fs_initialize(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);


@ -13,6 +13,7 @@
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "chardev.h"
@ -372,9 +373,9 @@ static void bch2_fs_read_only_work(struct work_struct *work)
struct bch_fs *c =
container_of(work, struct bch_fs, read_only_work);
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
bch2_fs_read_only(c);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
}
static void bch2_fs_read_only_async(struct bch_fs *c)
@ -455,16 +456,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
ret = bch2_fs_allocator_start(c);
if (ret) {
bch_err(c, "error initializing allocator");
goto err;
}
set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
}
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
@ -521,6 +512,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_fs_io_exit(c);
bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
@ -575,9 +567,9 @@ void bch2_fs_stop(struct bch_fs *c)
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
bch2_fs_read_only(c);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
@ -649,7 +641,7 @@ static const char *bch2_fs_online(struct bch_fs *c)
bch2_opts_create_sysfs_files(&c->opts_dir))
return "error creating sysfs objects";
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
@ -659,7 +651,7 @@ static const char *bch2_fs_online(struct bch_fs *c)
list_add(&c->list, &bch_fs_list);
err = NULL;
err:
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return err;
}
@ -681,7 +673,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->minor = -1;
c->disk_sb.fs_sb = true;
mutex_init(&c->state_lock);
init_rwsem(&c->state_lock);
mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_root_lock);
@ -692,6 +684,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
@ -724,6 +717,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
seqcount_init(&c->usage_lock);
sema_init(&c->io_in_flight, 64);
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
@ -785,6 +780,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_journal_init(&c->journal) ||
bch2_fs_replicas_init(c) ||
bch2_fs_btree_cache_init(c) ||
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
bch2_fs_btree_iter_init(c) ||
bch2_fs_btree_interior_update_init(c) ||
bch2_fs_io_init(c) ||
@ -871,7 +867,7 @@ int bch2_fs_start(struct bch_fs *c)
unsigned i;
int ret = -EINVAL;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
@ -921,7 +917,7 @@ int bch2_fs_start(struct bch_fs *c)
print_mount_opts(c);
ret = 0;
out:
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return ret;
err:
switch (ret) {
@ -1421,22 +1417,47 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
{
int ret;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
ret = __bch2_dev_set_state(c, ca, new_state, flags);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return ret;
}
/* Device add/removal: */
int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
struct btree_trans trans;
size_t i;
int ret;
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < ca->mi.nbuckets; i++) {
ret = bch2_btree_key_cache_flush(&trans,
BTREE_ID_ALLOC, POS(ca->dev_idx, i));
if (ret)
break;
}
bch2_trans_exit(&trans);
if (ret)
return ret;
return bch2_btree_delete_range(c, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
NULL);
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_sb_field_members *mi;
unsigned dev_idx = ca->dev_idx, data;
int ret = -EINVAL;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
/*
* We consume a reference to ca->ref, regardless of whether we succeed
@ -1463,10 +1484,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
NULL);
ret = bch2_dev_remove_alloc(c, ca);
if (ret) {
bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
@ -1526,13 +1544,13 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
!percpu_ref_is_zero(&ca->io_ref))
__bch2_dev_read_write(c, ca);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return ret;
}
@ -1608,7 +1626,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_usage_clear(ca);
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
err = "insufficient space in new superblock";
@ -1669,12 +1687,12 @@ have_slot:
goto err_late;
}
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return 0;
err_unlock:
mutex_unlock(&c->sb_lock);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
err:
if (ca)
bch2_dev_free(ca);
@ -1697,11 +1715,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
const char *err;
int ret;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return ret;
}
@ -1732,10 +1750,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return 0;
err:
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
bch2_free_super(&sb);
bch_err(c, "error bringing %s online: %s", path, err);
return -EINVAL;
@ -1743,23 +1761,23 @@ err:
int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
if (!bch2_dev_is_online(ca)) {
bch_err(ca, "Already offline");
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return 0;
}
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
bch_err(ca, "Cannot offline required disk");
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return -EINVAL;
}
__bch2_dev_offline(c, ca);
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return 0;
}
@ -1768,7 +1786,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
struct bch_member *mi;
int ret = 0;
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
if (nbuckets < ca->mi.nbuckets) {
bch_err(ca, "Cannot shrink yet");
@ -1799,7 +1817,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bch2_recalc_capacity(c);
err:
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
return ret;
}
@ -1878,13 +1896,13 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err;
err = "bch2_dev_online() error";
mutex_lock(&c->state_lock);
down_write(&c->state_lock);
for (i = 0; i < nr_devices; i++)
if (bch2_dev_attach_bdev(c, &sb[i])) {
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
goto err_print;
}
mutex_unlock(&c->state_lock);
up_write(&c->state_lock);
err = "insufficient devices";
if (!bch2_fs_may_start(c))


@ -134,7 +134,6 @@ do { \
write_attribute(trigger_journal_flush);
write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
write_attribute(trigger_alloc_write);
write_attribute(prune_cache);
rw_attribute(btree_gc_periodic);
@ -427,7 +426,7 @@ SHOW(bch2_fs)
return 0;
}
STORE(__bch2_fs)
STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -485,13 +484,17 @@ STORE(__bch2_fs)
if (attr == &sysfs_trigger_btree_coalesce)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
*/
#if 0
down_read(&c->state_lock);
bch2_gc(c, NULL, false, false);
if (attr == &sysfs_trigger_alloc_write) {
bool wrote;
bch2_alloc_write(c, 0, &wrote);
up_read(&c->state_lock);
#else
bch2_gc_gens(c);
#endif
}
if (attr == &sysfs_prune_cache) {
@ -501,6 +504,7 @@ STORE(__bch2_fs)
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
}
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@ -522,17 +526,6 @@ STORE(__bch2_fs)
#endif
return size;
}
STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
mutex_lock(&c->state_lock);
size = __bch2_fs_store(kobj, attr, buf, size);
mutex_unlock(&c->state_lock);
return size;
}
SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
@ -587,7 +580,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_trigger_alloc_write,
&sysfs_prune_cache,
&sysfs_copy_gc_enabled,

191
linux/semaphore.c Normal file

@ -0,0 +1,191 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
* This file implements counting semaphores.
* A counting semaphore may be acquired 'n' times before sleeping.
* See mutex.c for single-acquisition sleeping locks which enforce
* rules which allow code to be debugged more easily.
*/
/*
* Some notes on the implementation:
*
* The spinlock controls access to the other members of the semaphore.
* down_trylock() and up() can be called from interrupt context, so we
* have to disable interrupts when taking the lock. It turns out various
* parts of the kernel expect to be able to use down() on a semaphore in
* interrupt context when they know it will succeed, so we have to use
* irqsave variants for down(), down_interruptible() and down_killable()
* too.
*
* The ->count variable represents how many more tasks can acquire this
* semaphore. If it's zero, there may be tasks waiting on the wait_list.
*/
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
/**
* down - acquire the semaphore
* @sem: the semaphore to be acquired
*
* Acquires the semaphore. If no more tasks are allowed to acquire the
* semaphore, calling this function will put the task to sleep until the
* semaphore is released.
*
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
void down(struct semaphore *sem)
{
unsigned long flags;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(down);
/**
* down_trylock - try to acquire the semaphore, without waiting
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
* been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
*
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
int down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
sem->count = count;
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
}
EXPORT_SYMBOL(down_trylock);
/**
* down_timeout - acquire the semaphore within a specified time
* @sem: the semaphore to be acquired
* @timeout: how long to wait before failing
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
int down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
EXPORT_SYMBOL(down_timeout);
/**
* up - release the semaphore
* @sem: the semaphore to release
*
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
void up(struct semaphore *sem)
{
unsigned long flags;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(up);
/* Functions for the contended case */
struct semaphore_waiter {
struct list_head list;
struct task_struct *task;
bool up;
};
/*
* Because this function is inlined, the 'state' parameter will be
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
static inline int __sched __down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = current;
waiter.up = false;
for (;;) {
if (unlikely(timeout <= 0))
goto timed_out;
__set_current_state(state);
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
timed_out:
list_del(&waiter.list);
return -ETIME;
}
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
static noinline void __sched __up(struct semaphore *sem)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
wake_up_process(waiter->task);
}
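The shim implements just the operations bcachefs needs here (down(), down_trylock(), down_timeout() and up()); note the inverted down_trylock() return convention called out in the comment above. A short hedged usage sketch with illustrative names:

/* Hedged usage sketch for the semaphore shim; names are illustrative. */
#include <linux/semaphore.h>

static struct semaphore throttle;

static void throttle_init(void)
{
	sema_init(&throttle, 4);		/* at most 4 concurrent holders */
}

static void do_throttled_work(void (*work)(void))
{
	down(&throttle);			/* sleeps while 4 holders are active */
	work();
	up(&throttle);				/* wakes one waiter, if any */
}

static int try_throttled_work(void (*work)(void))
{
	if (down_trylock(&throttle))		/* returns 1 on failure, unlike mutex_trylock() */
		return 0;
	work();
	up(&throttle);
	return 1;
}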


@ -267,15 +267,21 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
#endif
noinline
static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
struct six_lock_waiter wait;
int ret = 0;
u64 v;
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
return ret;
if (six_optimistic_spin(lock, type))
return;
return 0;
lock_contended(&lock->dep_map, _RET_IP_);
@ -292,6 +298,10 @@ static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type t
raw_spin_unlock(&lock->wait_lock);
}
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
break;
v = READ_ONCE(lock->state.v);
do {
new.v = old.v = v;
@ -311,7 +321,8 @@ static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type t
schedule();
}
six_set_owner(lock, type, old);
if (!ret)
six_set_owner(lock, type, old);
__set_current_state(TASK_RUNNING);
@ -320,18 +331,28 @@ static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type t
list_del_init(&wait.list);
raw_spin_unlock(&lock->wait_lock);
}
return ret;
}
__always_inline
static void __six_lock_type(struct six_lock *lock, enum six_lock_type type)
static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
int ret;
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 0);
if (!do_six_trylock_type(lock, type))
__six_lock_type_slowpath(lock, type);
ret = do_six_trylock_type(lock, type) ? 0
: __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
lock_acquired(&lock->dep_map, _RET_IP_);
if (ret && type != SIX_LOCK_write)
six_release(&lock->dep_map);
if (!ret)
lock_acquired(&lock->dep_map, _RET_IP_);
return ret;
}
static inline void six_lock_wakeup(struct six_lock *lock,
@ -417,9 +438,10 @@ bool six_relock_##type(struct six_lock *lock, u32 seq) \
} \
EXPORT_SYMBOL_GPL(six_relock_##type); \
\
void six_lock_##type(struct six_lock *lock) \
int six_lock_##type(struct six_lock *lock, \
six_lock_should_sleep_fn should_sleep_fn, void *p) \
{ \
__six_lock_type(lock, SIX_LOCK_##type); \
return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
} \
EXPORT_SYMBOL_GPL(six_lock_##type); \
\
@ -514,3 +536,18 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
}
}
EXPORT_SYMBOL_GPL(six_lock_increment);
void six_lock_wakeup_all(struct six_lock *lock)
{
struct six_lock_waiter *w;
raw_spin_lock(&lock->wait_lock);
list_for_each_entry(w, &lock->wait_list[0], list)
wake_up_process(w->task);
list_for_each_entry(w, &lock->wait_list[1], list)
wake_up_process(w->task);
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
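The six_lock_read/intent/write entry points now take a six_lock_should_sleep_fn and return an int: the callback is consulted on the slow path, and a nonzero return aborts the acquisition and is passed back to the caller with the lock not held; callers that never need to abort simply pass NULL (as bch2_btree_and_journal_walk() does earlier in this diff). A hedged usage sketch; the my_* names and the choice of -EINTR are illustrative:

/* Hedged sketch of the new six_lock interface; my_* names and the use of
 * -EINTR are illustrative, not from the tree. */
#include <errno.h>
#include <linux/six.h>

struct my_ctx {
	int abort;
};

static int my_should_sleep(struct six_lock *lock, void *p)
{
	struct my_ctx *ctx = p;

	/* nonzero: give up instead of sleeping; six_lock_read() returns
	 * this value and the lock is not taken */
	return ctx->abort ? -EINTR : 0;
}

static int my_locked_op(struct six_lock *lock, struct my_ctx *ctx)
{
	int ret = six_lock_read(lock, my_should_sleep, ctx);
	if (ret)
		return ret;		/* aborted, lock not held */

	/* ... read-locked section ... */

	six_unlock_read(lock);
	return 0;
}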