Update bcachefs sources to 75e8a078b8 bcachefs: improved flush_held_btree_writes()

Kent Overstreet 2019-02-28 21:34:16 -05:00
parent 17c5215c1c
commit a4eb187a6f
31 changed files with 1221 additions and 1037 deletions

View File

@ -1 +1 @@
09a546543006b60d44c4c51e7b40cd3ec7837a5e
75e8a078b85703322fcf558f75a6845c0ef5dbb0

View File

@ -319,6 +319,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
struct bkey_i_extent *e;
BKEY_PADDED(k) k;
u64 b = sector_to_bucket(ca, physical);
struct bucket_mark m;
struct disk_reservation res;
unsigned sectors;
int ret;
@ -337,7 +338,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
.gen = bucket(ca, b)->mark.gen,
});
bucket_set_dirty(ca, b);
bucket_cmpxchg(bucket(ca, b), m, m.dirty = true);
ret = bch2_disk_reservation_get(c, &res, sectors, 1,
BCH_DISK_RESERVATION_NOFAIL);

View File

@ -128,6 +128,34 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
{
struct bkey_alloc_unpacked ret = { .gen = a->gen };
const void *d = a->data;
unsigned idx = 0;
#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
BCH_ALLOC_FIELDS()
#undef x
return ret;
}
static void bch2_alloc_pack(struct bkey_i_alloc *dst,
const struct bkey_alloc_unpacked src)
{
unsigned idx = 0;
void *d = dst->v.data;
dst->v.fields = 0;
dst->v.gen = src.gen;
#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
BCH_ALLOC_FIELDS()
#undef x
set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
@ -173,15 +201,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
{
const void *d = a->data;
unsigned idx = 0;
unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
struct bucket_mark m;
g->_mark.gen = a->gen;
g->gen_valid = 1;
g->io_time[READ] = get_alloc_field(a, &d, idx++);
g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
g->_mark.data_type = get_alloc_field(a, &d, idx++);
g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++);
g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
data_type = get_alloc_field(a, &d, idx++);
dirty_sectors = get_alloc_field(a, &d, idx++);
cached_sectors = get_alloc_field(a, &d, idx++);
g->oldest_gen = get_alloc_field(a, &d, idx++);
bucket_cmpxchg(g, m, ({
m.gen = a->gen;
m.data_type = data_type;
m.dirty_sectors = dirty_sectors;
m.cached_sectors = cached_sectors;
}));
g->gen_valid = 1;
}
static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
@ -199,6 +236,7 @@ static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
put_alloc_field(a, &d, idx++, m.data_type);
put_alloc_field(a, &d, idx++, m.dirty_sectors);
put_alloc_field(a, &d, idx++, m.cached_sectors);
put_alloc_field(a, &d, idx++, g->oldest_gen);
set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
}
@ -315,6 +353,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_NOMARK|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret)
@ -358,7 +397,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
? 0
: bch2_btree_insert_at(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY,
BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK,
BTREE_INSERT_ENTRY(&iter, k));
err:
bch2_btree_iter_unlock(&iter);
@ -824,6 +864,142 @@ static inline long next_alloc_bucket(struct bch_dev *ca)
return -1;
}
/*
* returns sequence number of most recent journal entry that updated this
* bucket:
*/
static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
{
if (m.journal_seq_valid) {
u64 journal_seq = atomic64_read(&c->journal.seq);
u64 bucket_seq = journal_seq;
bucket_seq &= ~((u64) U16_MAX);
bucket_seq |= m.journal_seq;
if (bucket_seq > journal_seq)
bucket_seq -= 1 << 16;
return bucket_seq;
} else {
return 0;
}
}
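
Note: the 16-bit splice in bucket_journal_seq() above is easy to misread, so here is a minimal standalone sketch (plain C, no bcachefs types; the helper name and the example numbers are made up) of the same reconstruction, with one worked case:

#include <stdint.h>
#include <stdio.h>

/* Rebuild a full 64-bit sequence number from its stored low 16 bits, given
 * the current journal sequence, mirroring bucket_journal_seq() above. If
 * splicing the low bits in produces a value ahead of the current sequence,
 * the low 16 bits have wrapped since the bucket was written, so step back
 * one 2^16 epoch. */
static uint64_t reconstruct_seq(uint64_t journal_seq, uint16_t low16)
{
        uint64_t seq = (journal_seq & ~(uint64_t) UINT16_MAX) | low16;

        if (seq > journal_seq)
                seq -= 1 << 16;
        return seq;
}

int main(void)
{
        /* current seq 0x10005, bucket stored low bits 0xfff0: the naive
         * splice gives 0x1fff0 (> 0x10005), so we recover 0xfff0 */
        printf("0x%llx\n",
               (unsigned long long) reconstruct_seq(0x10005, 0xfff0));
        return 0;
}
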
static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca,
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
#if 0
__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
/* hack: */
__BKEY_PADDED(k, 8) alloc_key;
#endif
struct bkey_i_alloc *a;
struct bkey_alloc_unpacked u;
struct bucket_mark m;
struct bkey_s_c k;
bool invalidating_cached_data;
size_t b;
int ret;
BUG_ON(!ca->alloc_heap.used ||
!ca->alloc_heap.data[0].nr);
b = ca->alloc_heap.data[0].bucket;
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read_preempt_disable(&c->mark_lock);
spin_lock(&c->freelist_lock);
verify_not_on_freelist(c, ca, b);
BUG_ON(!fifo_push(&ca->free_inc, b));
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
m = bucket(ca, b)->mark;
spin_unlock(&c->freelist_lock);
percpu_up_read_preempt_enable(&c->mark_lock);
bch2_btree_iter_cond_resched(iter);
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
k = bch2_btree_iter_peek_slot(iter);
ret = btree_iter_err(k);
if (ret)
return ret;
if (k.k && k.k->type == KEY_TYPE_alloc)
u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
else
memset(&u, 0, sizeof(u));
invalidating_cached_data = u.cached_sectors != 0;
//BUG_ON(u.dirty_sectors);
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = c->bucket_clock[READ].hand;
u.write_time = c->bucket_clock[WRITE].hand;
u.gen++;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
ret = bch2_btree_insert_at(c, NULL,
invalidating_cached_data ? journal_seq : NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret == -EINTR)
goto retry;
if (!ret) {
/* remove from alloc_heap: */
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
top->bucket++;
top->nr--;
if (!top->nr)
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
/*
* Make sure we flush the last journal entry that updated this
* bucket (i.e. deleting the last reference) before writing to
* this bucket again:
*/
*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
} else {
size_t b2;
/* remove from free_inc: */
percpu_down_read_preempt_disable(&c->mark_lock);
spin_lock(&c->freelist_lock);
bch2_mark_alloc_bucket(c, ca, b, false,
gc_pos_alloc(c, NULL), 0);
BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
BUG_ON(b != b2);
spin_unlock(&c->freelist_lock);
percpu_up_read_preempt_enable(&c->mark_lock);
}
return ret;
}
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket, u64 *flush_seq)
{
@ -844,18 +1020,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
percpu_up_read_preempt_enable(&c->mark_lock);
if (m.journal_seq_valid) {
u64 journal_seq = atomic64_read(&c->journal.seq);
u64 bucket_seq = journal_seq;
bucket_seq &= ~((u64) U16_MAX);
bucket_seq |= m.journal_seq;
if (bucket_seq > journal_seq)
bucket_seq -= 1 << 16;
*flush_seq = max(*flush_seq, bucket_seq);
}
*flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
return m.cached_sectors != 0;
}
@ -868,7 +1033,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
struct btree_iter iter;
u64 journal_seq = 0;
int ret = 0;
long b;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@ -876,14 +1040,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
!fifo_full(&ca->free_inc) &&
(b = next_alloc_bucket(ca)) >= 0) {
bool must_flush =
bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
ret = __bch2_alloc_write_key(c, ca, b, &iter,
must_flush ? &journal_seq : NULL,
!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
}
ca->alloc_heap.used)
ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq,
BTREE_INSERT_GC_LOCK_HELD|
(!fifo_empty(&ca->free_inc)
? BTREE_INSERT_NOWAIT : 0));
bch2_btree_iter_unlock(&iter);
@ -1305,24 +1466,16 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
}
static void flush_held_btree_writes(struct bch_fs *c)
static bool flush_done(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool nodes_blocked;
bool nodes_unwritten;
size_t i;
struct closure cl;
closure_init_stack(&cl);
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
closure_wait(&c->btree_interior_update_wait, &cl);
nodes_blocked = false;
nodes_unwritten = false;
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
@ -1334,24 +1487,25 @@ again:
six_unlock_read(&b->lock);
goto again;
} else {
nodes_blocked = true;
nodes_unwritten = true;
}
}
rcu_read_unlock();
if (c->btree_roots_dirty)
if (c->btree_roots_dirty) {
bch2_journal_meta(&c->journal);
if (nodes_blocked) {
closure_sync(&cl);
goto again;
}
closure_wake_up(&c->btree_interior_update_wait);
closure_sync(&cl);
return !nodes_unwritten &&
!bch2_btree_interior_updates_nr_pending(c);
}
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
static void flush_held_btree_writes(struct bch_fs *c)
{
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
closure_wait_event(&c->btree_interior_update_wait, flush_done(c));
}
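
Note: the improvement named in the commit subject is visible above — the open-coded retry loop with manual closure_wait()/closure_sync() becomes a flush_done() predicate driven by a single closure_wait_event(). A minimal standalone sketch of that shape, using pthreads as a stand-in (assumption: this is not the kernel closure API, just the same wait-on-predicate structure):

#include <pthread.h>
#include <stdbool.h>

struct flusher {
        pthread_mutex_t lock;
        pthread_cond_t  wait;
        unsigned        nodes_unwritten;
        unsigned        updates_pending;
};

/* analogous to flush_done(): everything written, no interior updates left */
static bool flush_done(struct flusher *f)
{
        return !f->nodes_unwritten && !f->updates_pending;
}

/* analogous to closure_wait_event(&c->btree_interior_update_wait, flush_done(c)) */
static void wait_for_flush(struct flusher *f)
{
        pthread_mutex_lock(&f->lock);
        while (!flush_done(f))
                pthread_cond_wait(&f->wait, &f->lock);
        pthread_mutex_unlock(&f->lock);
}

/* writers call this after completing a node write or interior update */
static void note_progress(struct flusher *f)
{
        pthread_mutex_lock(&f->lock);
        pthread_cond_broadcast(&f->wait);
        pthread_mutex_unlock(&f->lock);
}

int main(void)
{
        struct flusher f = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .wait = PTHREAD_COND_INITIALIZER,
        };

        note_progress(&f);
        wait_for_flush(&f);     /* returns immediately: nothing is pending */
        return 0;
}
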
static void allocator_start_issue_discards(struct bch_fs *c)
@ -1470,7 +1624,6 @@ not_enough:
&journal_seq);
fifo_push(&ca->free[RESERVE_BTREE], bu);
bucket_set_dirty(ca, bu);
}
}
@ -1517,7 +1670,6 @@ int bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
bool wrote;
int ret;
down_read(&c->gc_lock);
@ -1536,8 +1688,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
}
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
return bch2_alloc_write(c, false, &wrote);
return 0;
}
void bch2_fs_allocator_background_init(struct bch_fs *c)

View File

@ -5,6 +5,15 @@
#include "alloc_types.h"
#include "debug.h"
struct bkey_alloc_unpacked {
u8 gen;
#define x(_name, _bits) u##_bits _name;
BCH_ALLOC_FIELDS()
#undef x
};
struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);

View File

@ -723,7 +723,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
u64 stranded = c->write_points_nr * c->bucket_size_max;
u64 free = bch2_fs_sectors_free(c);
u64 free = bch2_fs_usage_read_short(c).free;
return stranded * factor > free;
}

View File

@ -396,8 +396,6 @@ struct bch_dev {
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_nouse;
unsigned long *buckets_written;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];

View File

@ -821,11 +821,12 @@ struct bch_alloc {
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS() \
x(read_time, 2) \
x(write_time, 2) \
x(data_type, 1) \
x(dirty_sectors, 2) \
x(cached_sectors, 2)
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(oldest_gen, 8)
enum {
#define x(name, bytes) BCH_ALLOC_FIELD_##name,
@ -835,12 +836,12 @@ enum {
};
static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes,
#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
BCH_ALLOC_FIELDS()
#undef x
};
#define x(name, bytes) + bytes
#define x(name, bits) + (bits / 8)
static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
DIV_ROUND_UP(offsetof(struct bch_alloc, data)
BCH_ALLOC_FIELDS(), sizeof(u64));
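
Note: BCH_ALLOC_FIELDS() above is an x-macro — each use re-expands x() differently, which is how bch2_alloc_unpack()/bch2_alloc_pack(), the per-field size table, and BKEY_ALLOC_VAL_U64s_MAX all stay in sync with one field list. A minimal standalone sketch of the pattern (the demo field list and names are made up, not the real on-disk fields):

#include <stdio.h>

/* x-macro: the single source of truth for field names and widths (bits) */
#define DEMO_FIELDS()                   \
        x(read_time,    16)             \
        x(write_time,   16)             \
        x(oldest_gen,    8)

/* expansion 1: an unpacked struct with one member per field */
struct demo_unpacked {
#define x(_name, _bits) unsigned _name;
        DEMO_FIELDS()
#undef x
};

/* expansion 2: enum of field indices */
enum {
#define x(_name, _bits) DEMO_FIELD_##_name,
        DEMO_FIELDS()
#undef x
        DEMO_FIELD_NR,
};

/* expansion 3: per-field size table, derived from the same list */
static const unsigned demo_field_bytes[] = {
#define x(_name, _bits) [DEMO_FIELD_##_name] = (_bits) / 8,
        DEMO_FIELDS()
#undef x
};

int main(void)
{
        unsigned i, total = 0;

        for (i = 0; i < DEMO_FIELD_NR; i++)
                total += demo_field_bytes[i];
        printf("%u fields, %u bytes max\n", DEMO_FIELD_NR, total);
        return 0;
}
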

View File

@ -138,24 +138,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
struct bucket *g = PTR_BUCKET(ca, ptr);
struct bucket *g = PTR_BUCKET(ca, ptr, true);
struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
if (mustfix_fsck_err_on(!g->gen_valid, c,
"found ptr with missing gen in alloc btree,\n"
"type %u gen %u",
k.k->type, ptr->gen)) {
g->_mark.gen = ptr->gen;
g->gen_valid = 1;
bucket_set_dirty(ca, b);
g2->_mark.gen = g->_mark.gen = ptr->gen;
g2->_mark.dirty = g->_mark.dirty = true;
g2->gen_valid = g->gen_valid = true;
}
if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
"%u ptr gen in the future: %u > %u",
k.k->type, ptr->gen, g->mark.gen)) {
g->_mark.gen = ptr->gen;
g->gen_valid = 1;
bucket_set_dirty(ca, b);
g2->_mark.gen = g->_mark.gen = ptr->gen;
g2->_mark.dirty = g->_mark.dirty = true;
g2->gen_valid = g->gen_valid = true;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
}
@ -163,10 +163,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
struct bucket *g = PTR_BUCKET(ca, ptr, true);
if (gen_after(ca->oldest_gens[b], ptr->gen))
ca->oldest_gens[b] = ptr->gen;
if (gen_after(g->oldest_gen, ptr->gen))
g->oldest_gen = ptr->gen;
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
@ -230,12 +230,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
bch2_verify_btree_nr_keys(b);
gc_pos_set(c, gc_pos_btree_node(b));
ret = btree_gc_mark_node(c, b, &max_stale, initial);
if (ret)
break;
gc_pos_set(c, gc_pos_btree_node(b));
if (!initial) {
if (max_stale > 64)
bch2_btree_node_rewrite(c, &iter,
@ -483,88 +483,38 @@ static void bch2_gc_free(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
{
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
c->ec_stripes_heap.used = 0;
while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
(src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
*dst = *src;
if (dst->alive)
bch2_stripes_heap_insert(c, dst, dst_iter.pos);
genradix_iter_advance(&dst_iter, &c->stripes[0]);
genradix_iter_advance(&src_iter, &c->stripes[1]);
}
}
for_each_member_device(ca, c, i) {
struct bucket_array *src = __bucket_array(ca, 1);
memcpy(__bucket_array(ca, 0), src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * src->nbuckets);
};
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
*dst = *src;
}
{
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
memcpy(&dst->s.gc_start[0],
&src->s.gc_start[0],
nr * sizeof(u64) - offsetof(typeof(*dst), s.gc_start));
}
}
static void bch2_gc_done(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
bool verify = !initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
unsigned i;
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
bch_err(c, _msg ": got %llu, should be %llu, fixing" \
, ##__VA_ARGS__, dst->_f, src->_f); \
if (verify) \
bch_err(c, _msg ": got %llu, should be %llu, fixing"\
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
bch_err_ratelimited(c, "stripe %zu has wrong "_msg \
": got %u, should be %u, fixing", \
dst_iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
if (verify) \
bch_err_ratelimited(c, "stripe %zu has wrong "_msg\
": got %u, should be %u, fixing", \
dst_iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
dst->dirty = true; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
": got %u, should be %u, fixing", \
i, b, dst->b[b].mark._f, src->b[b].mark._f); \
if (verify) \
bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
": got %u, should be %u, fixing", i, b, \
dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
dst->b[b]._mark.dirty = true; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@ -573,12 +523,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
percpu_down_write(&c->mark_lock);
if (initial &&
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) {
bch2_gc_done_nocheck(c);
goto out;
}
{
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
@ -629,6 +573,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
dst->b[b].oldest_gen = src->b[b].oldest_gen;
dst->b[b]._mark.dirty = true;
}
}
};
@ -641,44 +590,46 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
copy_dev_field(buckets_alloc, "buckets_alloc");
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets[b], "buckets[%s]",
bch2_data_types[b]);
copy_dev_field(buckets_alloc, "buckets_alloc");
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(sectors[b],
"sectors[%s]", bch2_data_types[b]);
copy_dev_field(sectors_fragmented,
"sectors_fragmented");
copy_dev_field(sectors[b], "sectors[%s]",
bch2_data_types[b]);
copy_dev_field(sectors_fragmented, "sectors_fragmented");
}
{
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *dst = (void *)
bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
copy_fs_field(s.hidden, "hidden");
copy_fs_field(s.data, "data");
copy_fs_field(s.cached, "cached");
copy_fs_field(s.reserved, "reserved");
copy_fs_field(s.nr_inodes, "nr_inodes");
copy_fs_field(hidden, "hidden");
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes, "nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
/*
* XXX: print out replicas entry
*/
copy_fs_field(data[i], "data[%i]", i);
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
}
}
out:
percpu_up_write(&c->mark_lock);
#undef copy_fs_field
@ -693,19 +644,18 @@ static int bch2_gc_start(struct bch_fs *c)
struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
/*
* indicate to stripe code that we need to allocate for the gc stripes
* radix tree, too
*/
gc_pos_set(c, gc_phase(GC_PHASE_START));
percpu_down_write(&c->mark_lock);
BUG_ON(c->usage[1]);
c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
sizeof(u64) * c->replicas.nr,
sizeof(u64),
GFP_KERNEL);
c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
sizeof(u64), GFP_KERNEL);
percpu_up_write(&c->mark_lock);
if (!c->usage[1])
@ -740,8 +690,12 @@ static int bch2_gc_start(struct bch_fs *c)
dst->first_bucket = src->first_bucket;
dst->nbuckets = src->nbuckets;
for (b = 0; b < src->nbuckets; b++)
dst->b[b]._mark.gen = src->b[b].mark.gen;
for (b = 0; b < src->nbuckets; b++) {
dst->b[b]._mark.gen =
dst->b[b].oldest_gen =
src->b[b].mark.gen;
dst->b[b].gen_valid = src->b[b].gen_valid;
}
};
percpu_up_write(&c->mark_lock);
@ -800,6 +754,8 @@ out:
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
bch2_gc_free(c);
goto again;
}

View File

@ -455,6 +455,7 @@ static inline bool btree_node_is_extents(struct btree *b)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
switch (type) {
case BKEY_TYPE_ALLOC:
case BKEY_TYPE_BTREE:
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_INODES:
@ -489,7 +490,6 @@ enum btree_insert_ret {
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_GC_LOCK,
BTREE_INSERT_NEED_MARK_REPLICAS,
};

View File

@ -81,6 +81,7 @@ enum {
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_NOMARK,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
@ -107,12 +108,12 @@ enum {
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
/*
* Insert is for journal replay: don't get journal reservations, or mark extents
* (bch_mark_key)
*/
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
/* Don't call bch2_mark_key: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
/* Don't block on allocation failure (for new btree nodes: */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)

View File

@ -483,7 +483,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
struct btree *b;
struct disk_reservation disk_res = { 0, 0 };
unsigned sectors = nr_nodes * c->opts.btree_node_size;
int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
int ret, disk_res_flags = 0;
if (flags & BTREE_INSERT_NOFAIL)
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
@ -1086,8 +1086,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&old->key),
fs_usage);
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
@ -1188,8 +1187,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
bkey_disassemble(b, k, &tmp),
fs_usage);
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
gc_pos_btree_node(b));
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
@ -1564,7 +1562,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
if (!down_read_trylock(&c->gc_lock)) {
if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
!down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
return -EINTR;
@ -1607,7 +1606,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
*/
__bch2_btree_iter_downgrade(iter, 1);
out:
up_read(&c->gc_lock);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
@ -1685,7 +1685,8 @@ retry:
}
/* We're changing btree topology, doesn't mix with gc: */
if (!down_read_trylock(&c->gc_lock))
if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
!down_read_trylock(&c->gc_lock))
goto err_cycle_gc_lock;
if (!bch2_btree_iter_upgrade(iter, U8_MAX,
@ -1745,7 +1746,8 @@ retry:
bch2_btree_update_done(as);
up_read(&c->gc_lock);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
up_read(&c->gc_lock);
out:
bch2_btree_iter_verify_locks(iter);
@ -1776,7 +1778,8 @@ err_cycle_gc_lock:
err_unlock:
six_unlock_intent(&m->lock);
up_read(&c->gc_lock);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
up_read(&c->gc_lock);
err:
BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
@ -1942,8 +1945,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
BCH_DISK_RESERVATION_NOFAIL|
BCH_DISK_RESERVATION_GC_LOCK_HELD);
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
@ -1989,8 +1991,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
fs_usage);
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);

View File

@ -415,6 +415,25 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
btree_iter_cmp(l.iter, r.iter);
}
static bool btree_trans_relock(struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_iter(trans, i)
return bch2_btree_iter_relock(i->iter);
return true;
}
static void btree_trans_unlock(struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_iter(trans, i) {
bch2_btree_iter_unlock(i->iter);
break;
}
}
/* Normal update interface: */
static enum btree_insert_ret
@ -466,49 +485,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
struct btree_iter *linked;
unsigned u64s;
int ret;
retry:
trans_for_each_iter(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
/* reserve space for deferred updates */
__trans_for_each_entry(trans, i, i->deferred) {
}
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
while ((ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
struct btree_iter *iter = NULL;
trans_for_each_iter(trans, i)
iter = i->iter;
if (iter)
bch2_btree_iter_unlock(iter);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
if (iter && !bch2_btree_iter_relock(iter)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
}
if (ret)
return ret;
}
multi_lock_write(c, trans);
if (race_fault()) {
@ -536,6 +518,36 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
}
}
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK);
if (likely(!ret))
goto got_journal_res;
if (ret != -EAGAIN)
goto out;
multi_unlock_write(trans);
btree_trans_unlock(trans);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
if (!btree_trans_relock(trans)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
goto retry;
}
got_journal_res:
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_entry(trans, i)
@ -623,6 +635,9 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
trans_for_each_entry(trans, i)
@ -715,18 +730,6 @@ err:
ret = -EINTR;
}
break;
case BTREE_INSERT_NEED_GC_LOCK:
ret = -EINTR;
if (!down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
bch2_btree_iter_unlock(trans->entries[0].iter);
down_read(&c->gc_lock);
}
up_read(&c->gc_lock);
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;

View File

@ -116,14 +116,14 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
unsigned i, nr;
unsigned i;
percpu_down_write(&c->mark_lock);
nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr;
usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr);
usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0],
fs_usage_u64s(c));
for (i = 0; i < BCH_REPLICAS_MAX; i++)
usage->s.reserved += usage->persistent_reserved[i];
usage->reserved += usage->persistent_reserved[i];
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
@ -132,10 +132,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
switch (e->data_type) {
case BCH_DATA_BTREE:
case BCH_DATA_USER:
usage->s.data += usage->data[i];
usage->data += usage->replicas[i];
break;
case BCH_DATA_CACHED:
usage->s.cached += usage->data[i];
usage->cached += usage->replicas[i];
break;
}
}
@ -143,44 +143,38 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
#define bch2_usage_read_raw(_stats) \
({ \
typeof(*this_cpu_ptr(_stats)) _acc; \
\
memset(&_acc, 0, sizeof(_acc)); \
acc_u64s_percpu((u64 *) &_acc, \
(u64 __percpu *) _stats, \
sizeof(_acc) / sizeof(u64)); \
\
_acc; \
})
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_raw(ca->usage[0]);
struct bch_dev_usage ret;
memset(&ret, 0, sizeof(ret));
acc_u64s_percpu((u64 *) &ret,
(u64 __percpu *) ca->usage[0],
sizeof(ret) / sizeof(u64));
return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage *ret;
unsigned nr = READ_ONCE(c->replicas.nr);
unsigned v, u64s = fs_usage_u64s(c);
retry:
ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
ret = kzalloc(u64s * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
return NULL;
percpu_down_read_preempt_disable(&c->mark_lock);
if (unlikely(nr < c->replicas.nr)) {
nr = c->replicas.nr;
v = fs_usage_u64s(c);
if (unlikely(u64s != v)) {
u64s = v;
percpu_up_read_preempt_enable(&c->mark_lock);
kfree(ret);
goto retry;
}
acc_u64s_percpu((u64 *) ret,
(u64 __percpu *) c->usage[0],
sizeof(*ret) / sizeof(u64) + nr);
acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
return ret;
}
@ -197,27 +191,44 @@ static u64 avail_factor(u64 r)
return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
{
return min(fs_usage.s.hidden +
fs_usage.s.data +
reserve_factor(fs_usage.s.reserved +
fs_usage.s.online_reserved),
return min(fs_usage->hidden +
fs_usage->data +
reserve_factor(fs_usage->reserved +
fs_usage->online_reserved),
c->capacity);
}
static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
struct bch_fs_usage_short ret;
u64 data, reserved;
ret.capacity = c->capacity -
percpu_u64_get(&c->usage[0]->hidden);
data = percpu_u64_get(&c->usage[0]->data);
reserved = percpu_u64_get(&c->usage[0]->reserved) +
percpu_u64_get(&c->usage[0]->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
return ret;
}
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
struct bch_fs_usage_summarized usage =
bch2_usage_read_raw(&c->usage[0]->s);
struct bch_fs_usage_short ret;
ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
ret.used = min(ret.capacity, usage.data +
reserve_factor(usage.reserved +
usage.online_reserved));
ret.nr_inodes = usage.nr_inodes;
percpu_down_read_preempt_disable(&c->mark_lock);
ret = __bch2_fs_usage_read_short(c);
percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
}
@ -254,10 +265,9 @@ static bool bucket_became_unavailable(struct bucket_mark old,
int bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
struct disk_reservation *disk_res)
{
s64 added = fs_usage->s.data + fs_usage->s.reserved;
s64 added = fs_usage->data + fs_usage->reserved;
s64 should_not_have_added;
int ret = 0;
@ -277,19 +287,11 @@ int bch2_fs_usage_apply(struct bch_fs *c,
if (added > 0) {
disk_res->sectors -= added;
fs_usage->s.online_reserved -= added;
fs_usage->online_reserved -= added;
}
acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
(u64 *) fs_usage,
sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
if (gc_visited(c, gc_pos)) {
BUG_ON(!c->usage[1]);
acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
(u64 *) fs_usage,
sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
}
(u64 *) fs_usage, fs_usage_u64s(c));
return ret;
}
@ -300,7 +302,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
int nr, s64 size)
{
if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
fs_usage->s.hidden += size;
fs_usage->hidden += size;
dev_usage->buckets[type] += nr;
}
@ -384,10 +386,10 @@ static inline void update_replicas(struct bch_fs *c,
BUG_ON(!sectors);
if (r->data_type == BCH_DATA_CACHED)
fs_usage->s.cached += sectors;
fs_usage->cached += sectors;
else
fs_usage->s.data += sectors;
fs_usage->data[idx] += sectors;
fs_usage->data += sectors;
fs_usage->replicas[idx] += sectors;
}
static inline void update_cached_sectors(struct bch_fs *c,
@ -401,15 +403,28 @@ static inline void update_cached_sectors(struct bch_fs *c,
update_replicas(c, fs_usage, &r.e, sectors);
}
static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old,
bool gc)
#define do_mark_fn(fn, c, pos, flags, ...) \
({ \
int gc, ret = 0; \
\
percpu_rwsem_assert_held(&c->mark_lock); \
\
for (gc = 0; gc < 2 && !ret; gc++) \
if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \
(gc && gc_visited(c, pos))) \
ret = fn(c, __VA_ARGS__, gc); \
ret; \
})
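
Note: do_mark_fn() above centralizes the "update the live copy, and also GC's shadow copy if GC has already passed this position" dispatch that was previously open-coded in each caller. A minimal standalone sketch of that shape (simplified stand-ins for the real types and for gc_visited()):

#include <stdbool.h>
#include <stdio.h>

#define MARK_GC (1 << 0)

/* two copies of the accounting: [0] = live, [1] = GC's shadow copy */
static unsigned long usage[2];

static bool gc_visited_pos(void)
{
        return true;    /* stand-in for gc_visited(c, pos) */
}

static int mark_one(unsigned long sectors, bool gc)
{
        usage[gc] += sectors;
        return 0;
}

/* mirrors do_mark_fn(): run against the copy selected by flags, and
 * additionally against the GC copy when GC has already walked past this
 * position, so the two stay consistent */
static int do_mark(unsigned flags, unsigned long sectors)
{
        int gc, ret = 0;

        for (gc = 0; gc < 2 && !ret; gc++)
                if (!gc == !(flags & MARK_GC) ||
                    (gc && gc_visited_pos()))
                        ret = mark_one(sectors, gc);
        return ret;
}

int main(void)
{
        do_mark(0, 128);        /* updates usage[0], and usage[1] if GC passed */
        printf("%lu %lu\n", usage[0], usage[1]);
        return 0;
}
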
static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *ret,
bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
struct bucket_mark old, new;
*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = true;
@ -420,26 +435,29 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++;
}));
if (old->cached_sectors)
if (old.cached_sectors)
update_cached_sectors(c, fs_usage, ca->dev_idx,
-old->cached_sectors);
-((s64) old.cached_sectors));
if (!gc)
*ret = old;
return 0;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
percpu_rwsem_assert_held(&c->mark_lock);
__bch2_invalidate_bucket(c, ca, b, old, false);
do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
ca, b, old);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
@ -451,20 +469,70 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
return 0;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
percpu_rwsem_assert_held(&c->mark_lock);
do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
ca, b, owned_by_allocator);
}
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
bool inserting,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags,
bool gc)
{
struct bkey_alloc_unpacked u;
struct bch_dev *ca;
struct bucket *g;
struct bucket_mark old, m;
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
if (!inserting)
return 0;
/*
* alloc btree is read in by bch2_alloc_read, not gc:
*/
if (flags & BCH_BUCKET_MARK_GC)
return 0;
u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
ca = bch_dev_bkey_exists(c, k.k->p.inode);
g = __bucket(ca, k.k->p.offset, gc);
/*
* this should currently only be getting called from the bucket
* invalidate path:
*/
BUG_ON(u.dirty_sectors);
BUG_ON(u.cached_sectors);
BUG_ON(!g->mark.owned_by_allocator);
old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
m.gen = u.gen;
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
}));
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
if (old.cached_sectors) {
update_cached_sectors(c, fs_usage, ca->dev_idx,
-old.cached_sectors);
trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
old.cached_sectors);
}
return 0;
}
#define checked_add(a, b) \
@ -474,9 +542,9 @@ do { \
BUG_ON((a) != _res); \
} while (0)
static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, bool gc)
static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
@ -490,6 +558,8 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
return 0;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@ -501,15 +571,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
if (likely(c)) {
percpu_rwsem_assert_held(&c->mark_lock);
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
true);
do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
ca, b, type, sectors);
} else {
struct bucket *g;
struct bucket_mark new;
@ -553,7 +616,7 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
* loop, to avoid racing with the start of gc clearing all the marks - GC does
* that with the gc pos seqlock held.
*/
static void bch2_mark_pointer(struct bch_fs *c,
static bool bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
@ -581,7 +644,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
EBUG_ON(!p.ptr.cached &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
return;
return true;
}
if (!p.ptr.cached)
@ -612,6 +675,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
return false;
}
static int bch2_mark_stripe_ptr(struct bch_fs *c,
@ -694,13 +759,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
: ptr_disk_sectors_delta(p, sectors);
bch2_mark_pointer(c, p, disk_sectors, data_type,
fs_usage, journal_seq, flags, gc);
bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
fs_usage, journal_seq, flags, gc);
if (p.ptr.cached) {
update_cached_sectors(c, fs_usage, p.ptr.dev,
disk_sectors);
if (disk_sectors && !stale)
update_cached_sectors(c, fs_usage, p.ptr.dev,
disk_sectors);
} else if (!p.ec_nr) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
@ -826,30 +891,31 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
unsigned journal_seq, unsigned flags,
bool gc)
{
int ret = 0;
if (!fs_usage || gc)
fs_usage = this_cpu_ptr(c->usage[gc]);
switch (k.k->type) {
case KEY_TYPE_alloc:
return bch2_mark_alloc(c, k, inserting,
fs_usage, journal_seq, flags, gc);
case KEY_TYPE_btree_ptr:
ret = bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
fs_usage, journal_seq, flags, gc);
break;
return bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
fs_usage, journal_seq, flags, gc);
case KEY_TYPE_extent:
ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
fs_usage, journal_seq, flags, gc);
break;
return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
fs_usage, journal_seq, flags, gc);
case KEY_TYPE_stripe:
ret = bch2_mark_stripe(c, k, inserting,
fs_usage, journal_seq, flags, gc);
break;
return bch2_mark_stripe(c, k, inserting,
fs_usage, journal_seq, flags, gc);
case KEY_TYPE_inode:
if (inserting)
fs_usage->s.nr_inodes++;
fs_usage->nr_inodes++;
else
fs_usage->s.nr_inodes--;
break;
fs_usage->nr_inodes--;
return 0;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
@ -857,15 +923,13 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
replicas = clamp_t(unsigned, replicas, 1,
ARRAY_SIZE(fs_usage->persistent_reserved));
fs_usage->s.reserved += sectors;
fs_usage->reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
break;
return 0;
}
default:
break;
return 0;
}
return ret;
}
int bch2_mark_key_locked(struct bch_fs *c,
@ -875,26 +939,9 @@ int bch2_mark_key_locked(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
int ret;
if (!(flags & BCH_BUCKET_MARK_GC)) {
ret = __bch2_mark_key(c, k, inserting, sectors,
fs_usage ?: this_cpu_ptr(c->usage[0]),
journal_seq, flags, false);
if (ret)
return ret;
}
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos)) {
ret = __bch2_mark_key(c, k, inserting, sectors,
this_cpu_ptr(c->usage[1]),
journal_seq, flags, true);
if (ret)
return ret;
}
return 0;
return do_mark_fn(__bch2_mark_key, c, pos, flags,
k, inserting, sectors, fs_usage,
journal_seq, flags);
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
@ -932,7 +979,7 @@ void bch2_mark_update(struct btree_insert *trans,
percpu_down_read_preempt_disable(&c->mark_lock);
fs_usage = bch2_fs_usage_get_scratch(c);
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
if (!(trans->flags & BTREE_INSERT_NOMARK))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
@ -985,7 +1032,7 @@ void bch2_mark_update(struct btree_insert *trans,
bch2_btree_node_iter_advance(&node_iter, b);
}
if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) &&
!warned_disk_usage &&
!xchg(&warned_disk_usage, 1)) {
char buf[200];
@ -1026,13 +1073,13 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_u64_set(&c->pcpu->sectors_available, 0);
return avail_factor(bch2_fs_sectors_free(c));
return avail_factor(__bch2_fs_usage_read_short(c).free);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read_preempt_disable(&c->mark_lock);
this_cpu_sub(c->usage[0]->s.online_reserved,
this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
percpu_up_read_preempt_enable(&c->mark_lock);
@ -1071,38 +1118,22 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
out:
pcpu->sectors_available -= sectors;
this_cpu_add(c->usage[0]->s.online_reserved, sectors);
this_cpu_add(c->usage[0]->online_reserved, sectors);
res->sectors += sectors;
percpu_up_read_preempt_enable(&c->mark_lock);
return 0;
recalculate:
/*
* GC recalculates sectors_available when it starts, so that hopefully
* we don't normally end up blocking here:
*/
/*
* Piss fuck, we can be called from extent_insert_fixup() with btree
* locks held:
*/
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
down_read(&c->gc_lock);
else if (!down_read_trylock(&c->gc_lock))
return -EINTR;
}
percpu_down_write(&c->mark_lock);
sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
this_cpu_add(c->usage[0]->s.online_reserved, sectors);
this_cpu_add(c->usage[0]->online_reserved, sectors);
res->sectors += sectors;
ret = 0;
} else {
@ -1112,9 +1143,6 @@ recalculate:
percpu_up_write(&c->mark_lock);
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
up_read(&c->gc_lock);
return ret;
}
@ -1135,7 +1163,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_nouse = NULL;
unsigned long *buckets_written = NULL;
u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
@ -1161,8 +1188,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
!(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
@ -1197,9 +1222,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(buckets->b,
old_buckets->b,
n * sizeof(struct bucket));
memcpy(oldest_gens,
ca->oldest_gens,
n * sizeof(u8));
memcpy(buckets_nouse,
ca->buckets_nouse,
BITS_TO_LONGS(n) * sizeof(unsigned long));
@ -1211,7 +1233,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_nouse, buckets_nouse);
swap(ca->buckets_written, buckets_written);
@ -1255,8 +1276,6 @@ err:
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(buckets_written,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(oldest_gens,
nbuckets * sizeof(u8));
if (buckets)
call_rcu(&old_buckets->rcu, buckets_free_rcu);
@ -1276,7 +1295,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));

View File

@ -16,13 +16,14 @@
#define bucket_cmpxchg(g, new, expr) \
({ \
struct bucket *_g = g; \
u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
(new).v.counter = _old.v.counter = _v; \
expr; \
} while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
} while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \
_old.v.counter, \
(new).v.counter)) != _old.v.counter);\
_old; \
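
Note: bucket_cmpxchg() (the hunk above fixes it to use the locally captured _g consistently) is a lockless read-modify-write: snapshot the packed 64-bit mark, apply expr to a private copy, and retry the compare-and-swap until no concurrent update intervened. A standalone C11 sketch of the same retry loop (stdatomic instead of the kernel's atomic64; the mark layout here is made up):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* a packed mark: several small fields sharing one 64-bit word */
union mark {
        struct {
                uint32_t dirty_sectors;
                uint16_t gen;
                uint16_t flags;
        };
        uint64_t v;
};

static _Atomic uint64_t bucket_mark;

/* retry loop equivalent to bucket_cmpxchg(g, new, expr): reread, apply the
 * update to a private copy, and publish it only if nobody raced us */
static union mark mark_update(uint32_t add_sectors)
{
        union mark old, new;

        old.v = atomic_load(&bucket_mark);
        do {
                new = old;
                new.dirty_sectors += add_sectors;       /* the "expr" */
        } while (!atomic_compare_exchange_weak(&bucket_mark, &old.v, new.v));

        return old;     /* caller sees the pre-update mark, like _old */
}

int main(void)
{
        mark_update(8);
        union mark m = { .v = atomic_load(&bucket_mark) };
        printf("dirty_sectors=%u\n", (unsigned) m.dirty_sectors);
        return 0;
}
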
@ -56,18 +57,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
static inline void bucket_set_dirty(struct bch_dev *ca, size_t b)
{
struct bucket *g;
struct bucket_mark m;
rcu_read_lock();
g = bucket(ca, b);
bucket_cmpxchg(g, m, m.dirty = true);
rcu_read_unlock();
}
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
@ -86,7 +75,9 @@ static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
{
return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
struct bucket *g = bucket(ca, b);
return g->mark.gen - g->oldest_gen;
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@ -96,9 +87,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
}
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
const struct bch_extent_ptr *ptr,
bool gc)
{
return bucket(ca, PTR_BUCKET_NR(ca, ptr));
return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
}
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
@ -219,31 +211,28 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr);
}
static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
{
struct bch_fs_usage *ret;
ret = this_cpu_ptr(c->usage_scratch);
memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch);
memset(ret, 0, fs_usage_u64s(c) * sizeof(u64));
return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
static inline u64 bch2_fs_sectors_free(struct bch_fs *c)
{
struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
return usage.capacity - usage.used;
}
/* key/bucket marking: */
void bch2_bucket_seq_cleanup(struct bch_fs *);
@ -257,8 +246,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_GC (1 << 1)
#define BCH_BUCKET_MARK_GC (1 << 0)
#define BCH_BUCKET_MARK_NOATOMIC (1 << 1)
int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
@ -268,7 +257,7 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
struct disk_reservation *);
/* disk reservations: */
@ -282,8 +271,6 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
int bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,

View File

@ -38,6 +38,7 @@ struct bucket {
};
u16 io_time[2];
u8 oldest_gen;
unsigned gen_valid:1;
};
@ -62,35 +63,33 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
/* summarized: */
struct bch_fs_usage_summarized {
u64 online_reserved;
u64 online_reserved;
/* fields after online_reserved are cleared/recalculated by gc: */
u64 gc_start[0];
/* fields after online_reserved are cleared/recalculated by gc: */
u64 gc_start[0];
u64 hidden;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
u64 hidden;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
/* XXX: add stats for compression ratio */
/* XXX: add stats for compression ratio */
#if 0
u64 uncompressed;
u64 compressed;
u64 uncompressed;
u64 compressed;
#endif
} s;
/* broken out: */
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 data[];
u64 replicas[];
};
struct bch_fs_usage_short {
u64 capacity;
u64 used;
u64 free;
u64 nr_inodes;
};
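
Note: struct bch_fs_usage now ends in a flexible array member (replicas[]), one u64 per entry in c->replicas, so its size is not a compile-time constant — which is why fs_usage_u64s() and the kzalloc(u64s * sizeof(u64)) in bch2_fs_usage_read() size it at runtime, and why accumulation can treat it as a flat array of u64s. A small standalone sketch of allocating and sizing such a struct (made-up field names):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct usage {
        uint64_t hidden;
        uint64_t data;
        uint64_t cached;
        uint64_t replicas[];    /* one counter per replicas entry */
};

/* total size in u64s: fixed header plus the variable replicas tail */
static unsigned usage_u64s(unsigned nr_replicas)
{
        return sizeof(struct usage) / sizeof(uint64_t) + nr_replicas;
}

int main(void)
{
        unsigned nr = 4, u64s = usage_u64s(nr);
        struct usage *u = calloc(u64s, sizeof(uint64_t));

        if (!u)
                return 1;

        u->data = 100;
        u->replicas[nr - 1] = 7;

        /* summing two of these is just a flat add over u64s words, which is
         * how the acc_u64s_percpu() calls above treat it */
        printf("%u u64s total\n", u64s);
        free(u);
        return 0;
}
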

View File

@ -402,10 +402,10 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (!src)
return -ENOMEM;
percpu_up_read_preempt_enable(&c->mark_lock);
dst.used = bch2_fs_sectors_used(c, src);
dst.online_reserved = src->online_reserved;
dst.used = bch2_fs_sectors_used(c, *src);
dst.online_reserved = src->s.online_reserved;
percpu_up_read_preempt_enable(&c->mark_lock);
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
dst.persistent_reserved[i] =

View File

@ -979,10 +979,8 @@ bch2_extent_can_insert(struct btree_insert *trans,
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bch2_extent_is_compressed(k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
if (trans->flags & BTREE_INSERT_NOFAIL)
flags |= BCH_DISK_RESERVATION_NOFAIL;
int flags = trans->flags & BTREE_INSERT_NOFAIL
? BCH_DISK_RESERVATION_NOFAIL : 0;
switch (bch2_disk_reservation_add(trans->c,
trans->disk_res,
@ -991,8 +989,6 @@ bch2_extent_can_insert(struct btree_insert *trans,
break;
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
case -EINTR:
return BTREE_INSERT_NEED_GC_LOCK;
default:
BUG();
}

View File

@ -100,7 +100,7 @@ do { \
({ \
bool _r = !fifo_empty((fifo)); \
if (_r) \
(i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \
(i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
_r; \
})

View File

@ -17,23 +17,14 @@
#include <trace/events/bcachefs.h>
static bool journal_entry_is_open(struct journal *j)
static bool __journal_entry_is_open(union journal_res_state state)
{
return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
static bool journal_entry_is_open(struct journal *j)
{
struct journal_buf *w = journal_prev_buf(j);
atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
bch2_time_stats_update(j->delay_time,
j->need_write_time);
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
return __journal_entry_is_open(j->reservations);
}
static void journal_pin_new_entry(struct journal *j, int count)
@ -77,39 +68,71 @@ static inline bool journal_entry_empty(struct jset *j)
return true;
}
static enum {
JOURNAL_ENTRY_ERROR,
JOURNAL_ENTRY_INUSE,
JOURNAL_ENTRY_CLOSED,
JOURNAL_UNLOCKED,
} journal_buf_switch(struct journal *j, bool need_write_just_set)
void bch2_journal_halt(struct journal *j)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
return;
new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
}
/* journal entry close/open: */
void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
{
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
bch2_time_stats_update(j->delay_time,
j->need_write_time);
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
/*
* Returns true if journal entry is now closed:
*/
static bool __journal_entry_close(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
bool set_need_write = false;
unsigned sectors;
lockdep_assert_held(&j->lock);
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
return JOURNAL_ENTRY_CLOSED;
return true;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
/* this entry will never be written: */
closure_wake_up(&buf->wait);
return JOURNAL_ENTRY_ERROR;
return true;
}
if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
set_bit(JOURNAL_NEED_WRITE, &j->flags);
j->need_write_time = local_clock();
set_need_write = true;
}
if (new.prev_buf_unwritten)
return JOURNAL_ENTRY_INUSE;
/*
* avoid race between setting buf->data->u64s and
* journal_res_put starting write:
*/
journal_state_inc(&new);
return false;
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
@ -119,15 +142,12 @@ static enum {
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
j->prev_buf_sectors =
vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) *
c->opts.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
buf->sectors = sectors;
bkey_extent_init(&buf->key);
@ -150,7 +170,6 @@ static enum {
* Hence, we want update/set last_seq on the current journal entry right
* before we open a new one:
*/
bch2_journal_reclaim_fast(j);
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
if (journal_entry_empty(buf->data))
@ -163,32 +182,22 @@ static enum {
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
spin_unlock(&j->lock);
/* ugh - might be called from __journal_res_get() under wait_event() */
__set_current_state(TASK_RUNNING);
bch2_journal_buf_put(j, old.idx, need_write_just_set);
bch2_journal_space_available(j);
return JOURNAL_UNLOCKED;
bch2_journal_buf_put(j, old.idx, set_need_write);
return true;
}
void bch2_journal_halt(struct journal *j)
static bool journal_entry_close(struct journal *j)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
bool ret;
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
return;
spin_lock(&j->lock);
ret = __journal_entry_close(j);
spin_unlock(&j->lock);
new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
closure_wake_up(&journal_prev_buf(j)->wait);
return ret;
}
/*
@ -196,46 +205,39 @@ void bch2_journal_halt(struct journal *j)
* journal reservation - journal entry is open means journal is dirty:
*
* returns:
* 1: success
* 0: journal currently full (must wait)
* -EROFS: insufficient rw devices
* -EIO: journal error
* 0: success
* -ENOSPC: journal currently full, must invoke reclaim
* -EAGAIN: journal blocked, must wait
* -EROFS: insufficient rw devices or journal error
*/
static int journal_entry_open(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
ssize_t u64s;
int sectors;
int u64s;
u64 v;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
if (!fifo_free(&j->pin))
return 0;
if (j->blocked)
return -EAGAIN;
sectors = bch2_journal_entry_sectors(j);
if (sectors <= 0)
return sectors;
if (j->cur_entry_error)
return j->cur_entry_error;
BUG_ON(!j->cur_entry_sectors);
buf->disk_sectors = sectors;
buf->u64s_reserved = j->entry_u64s_reserved;
buf->disk_sectors = j->cur_entry_sectors;
buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
sectors = min_t(unsigned, sectors, buf->size >> 9);
j->cur_buf_sectors = sectors;
u64s = (sectors << 9) / sizeof(u64);
/* Subtract the journal header */
u64s -= sizeof(struct jset) / sizeof(u64);
u64s -= buf->u64s_reserved;
u64s = max_t(ssize_t, 0L, u64s);
BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
u64s = (int) (buf->sectors << 9) / sizeof(u64) -
journal_entry_overhead(j);
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
return 0;
return -ENOSPC;
/*
* Must be set before marking the journal entry as open:
@ -246,11 +248,14 @@ static int journal_entry_open(struct journal *j)
do {
old.v = new.v = v;
EBUG_ON(journal_state_count(new, new.idx));
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
return -EIO;
return -EROFS;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
@ -263,37 +268,22 @@ static int journal_entry_open(struct journal *j)
&j->write_work,
msecs_to_jiffies(j->write_delay_ms));
journal_wake(j);
return 1;
return 0;
}
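/*
* A minimal sketch (not part of the patch) of how a caller acts on these
* return codes, mirroring __journal_res_get() below:
*
*	ret = journal_entry_open(j);
*	if (!ret)
*		goto retry;	// entry is open, retry the fast path
*	if (ret == -ENOSPC)	// journal full: kick reclaim, then wait
*		bch2_journal_reclaim_work(&j->reclaim_work.work);
*	if (ret == -EAGAIN)
*		;		// journal blocked, just wait
*	if (ret == -EROFS)
*		return ret;	// journal error / too few rw devices
*/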
static bool __journal_entry_close(struct journal *j)
static bool journal_quiesced(struct journal *j)
{
bool set_need_write;
union journal_res_state state = READ_ONCE(j->reservations);
bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
if (!journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return true;
}
set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags);
if (set_need_write)
j->need_write_time = local_clock();
switch (journal_buf_switch(j, set_need_write)) {
case JOURNAL_ENTRY_INUSE:
spin_unlock(&j->lock);
return false;
default:
spin_unlock(&j->lock);
case JOURNAL_UNLOCKED:
return true;
}
if (!ret)
journal_entry_close(j);
return ret;
}
static bool journal_entry_close(struct journal *j)
static void journal_quiesce(struct journal *j)
{
spin_lock(&j->lock);
return __journal_entry_close(j);
wait_event(j->wait, journal_quiesced(j));
}
static void journal_write_work(struct work_struct *work)
@ -337,7 +327,11 @@ retry:
if (journal_res_get_fast(j, res, flags))
return 0;
if (bch2_journal_error(j))
return -EROFS;
spin_lock(&j->lock);
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call journal_entry_close()
@ -355,56 +349,43 @@ retry:
*/
buf = journal_cur_buf(j);
if (journal_entry_is_open(j) &&
buf->size >> 9 < buf->disk_sectors &&
buf->size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->size << 1);
buf->buf_size >> 9 < buf->disk_sectors &&
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
/*
* Close the current journal entry if necessary, then try to start a new
* one:
*/
switch (journal_buf_switch(j, false)) {
case JOURNAL_ENTRY_ERROR:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
if (journal_entry_is_open(j) &&
!__journal_entry_close(j)) {
/*
* The current journal entry is still open, but we failed to get
* a journal reservation because there's not enough space in it,
* and we can't close it and start another because we haven't
* finished writing out the previous entry:
* We failed to get a reservation on the current open journal
* entry because it's full, and we can't close it because
* there's still a previous one in flight:
*/
spin_unlock(&j->lock);
trace_journal_entry_full(c);
goto blocked;
case JOURNAL_ENTRY_CLOSED:
break;
case JOURNAL_UNLOCKED:
goto retry;
ret = -EAGAIN;
} else {
ret = journal_entry_open(j);
}
/* We now have a new, closed journal buf - see if we can open it: */
ret = journal_entry_open(j);
if ((ret == -EAGAIN || ret == -ENOSPC) &&
!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
spin_unlock(&j->lock);
if (ret < 0)
return ret;
if (ret)
if (!ret)
goto retry;
if (ret == -ENOSPC) {
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
trace_journal_full(c);
if (!(flags & JOURNAL_RES_GET_NONBLOCK))
bch2_journal_reclaim_work(&j->reclaim_work.work);
ret = -EAGAIN;
}
/* Journal's full, we have to wait */
/*
* Direct reclaim - can't rely on reclaim from work item
* due to freezing:
*/
bch2_journal_reclaim_work(&j->reclaim_work.work);
trace_journal_full(c);
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return -EAGAIN;
return ret;
}
/*
@ -422,7 +403,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
{
int ret;
wait_event(j->wait,
closure_wait_event(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
@ -441,9 +422,9 @@ void bch2_journal_entry_res_resize(struct journal *j,
j->entry_u64s_reserved += d;
if (d <= 0)
goto out_unlock;
goto out;
j->cur_entry_u64s -= d;
j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
smp_mb();
state = READ_ONCE(j->reservations);
@ -454,15 +435,12 @@ void bch2_journal_entry_res_resize(struct journal *j,
* Not enough room in current journal entry, have to flush it:
*/
__journal_entry_close(j);
goto out;
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
journal_cur_buf(j)->u64s_reserved += d;
out_unlock:
spin_unlock(&j->lock);
out:
spin_unlock(&j->lock);
res->u64s += d;
return;
}
/* journal flushing: */
@ -492,47 +470,47 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
int ret;
retry:
spin_lock(&j->lock);
if (seq < journal_cur_seq(j) ||
/*
* Can't try to open more than one sequence number ahead:
*/
BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
if (journal_cur_seq(j) > seq ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 0;
}
if (journal_cur_seq(j) < seq) {
switch (journal_buf_switch(j, false)) {
case JOURNAL_ENTRY_ERROR:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
trace_journal_entry_full(c);
goto blocked;
case JOURNAL_ENTRY_CLOSED:
break;
case JOURNAL_UNLOCKED:
goto retry;
}
if (journal_cur_seq(j) < seq &&
!__journal_entry_close(j)) {
/* haven't finished writing out the previous one: */
trace_journal_entry_full(c);
ret = -EAGAIN;
} else {
BUG_ON(journal_cur_seq(j) != seq);
ret = journal_entry_open(j);
}
BUG_ON(journal_cur_seq(j) < seq);
ret = journal_entry_open(j);
if (ret) {
spin_unlock(&j->lock);
return ret < 0 ? ret : 0;
}
blocked:
if (!j->res_get_blocked_start)
if ((ret == -EAGAIN || ret == -ENOSPC) &&
!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
closure_wait(&j->async_wait, cl);
if (ret == -EAGAIN || ret == -ENOSPC)
closure_wait(&j->async_wait, cl);
spin_unlock(&j->lock);
bch2_journal_reclaim_work(&j->reclaim_work.work);
return -EAGAIN;
if (ret == -ENOSPC) {
trace_journal_full(c);
bch2_journal_reclaim_work(&j->reclaim_work.work);
ret = -EAGAIN;
}
return ret;
}
static int journal_seq_error(struct journal *j, u64 seq)
@ -615,8 +593,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
else
spin_unlock(&j->lock);
spin_unlock(&j->lock);
}
static int journal_seq_flushed(struct journal *j, u64 seq)
@ -628,8 +605,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
else
spin_unlock(&j->lock);
spin_unlock(&j->lock);
return ret;
}
@ -721,6 +697,26 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
{
spin_lock(&j->lock);
j->blocked--;
spin_unlock(&j->lock);
journal_wake(j);
}
void bch2_journal_block(struct journal *j)
{
spin_lock(&j->lock);
j->blocked++;
spin_unlock(&j->lock);
journal_quiesce(j);
}
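/*
* Usage sketch (the caller is not shown in this patch): bch2_journal_block()
* and bch2_journal_unblock() bracket work that must not race with new
* journal entries being opened:
*
*	bch2_journal_block(&c->journal);	// quiesce: no open entry,
*						// no unwritten previous buf
*	// ... work that relies on the journal being quiet ...
*	bch2_journal_unblock(&c->journal);	// allow journal_entry_open() again
*/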
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@ -743,7 +739,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
nr + sizeof(*journal_buckets) / sizeof(u64));
if (!journal_buckets)
goto err;
@ -806,9 +802,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
ja->nr++;
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
0);
if (c) {
spin_unlock(&c->journal.lock);
@ -859,7 +855,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
*/
if (bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
@ -930,8 +926,7 @@ void bch2_fs_journal_stop(struct journal *j)
c->btree_roots_dirty)
bch2_journal_meta(j);
BUG_ON(journal_entry_is_open(j) ||
j->reservations.prev_buf_unwritten);
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_NOT_EMPTY, &j->flags));
@ -957,7 +952,7 @@ void bch2_fs_journal_start(struct journal *j)
journal_pin_new_entry(j, 0);
/*
* journal_buf_switch() only inits the next journal entry when it
* __journal_entry_close() only inits the next journal entry when it
* closes an open journal entry - the very first journal entry gets
* initialized here:
*/
@ -966,6 +961,7 @@ void bch2_fs_journal_start(struct journal *j)
c->last_bucket_seq_cleanup = journal_cur_seq(j);
bch2_journal_space_available(j);
spin_unlock(&j->lock);
/*
@ -975,7 +971,7 @@ void bch2_fs_journal_start(struct journal *j)
*/
bch2_journal_seq_blacklist_write(j);
queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
}
/* init/exit: */
@ -1021,8 +1017,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
kvpfree(j->buf[1].data, j->buf[1].size);
kvpfree(j->buf[0].data, j->buf[0].size);
kvpfree(j->buf[1].data, j->buf[1].buf_size);
kvpfree(j->buf[0].data, j->buf[0].buf_size);
free_fifo(&j->pin);
}
@ -1046,8 +1042,8 @@ int bch2_fs_journal_init(struct journal *j)
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
@ -1060,8 +1056,8 @@ int bch2_fs_journal_init(struct journal *j)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
!(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
!(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
!(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
@ -1078,35 +1074,54 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state *s = &j->reservations;
union journal_res_state s;
struct bch_dev *ca;
unsigned iter;
rcu_read_lock();
spin_lock(&j->lock);
s = READ_ONCE(j->reservations);
pr_buf(&out,
"active journal entries:\t%llu\n"
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"reservation count:\t%u\n"
"reservation offset:\t%u\n"
"current entry u64s:\t%u\n"
"io in flight:\t\t%i\n"
"need write:\t\t%i\n"
"dirty:\t\t\t%i\n"
"replay done:\t\t%i\n",
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
journal_last_seq(j),
j->last_seq_ondisk,
journal_state_count(*s, s->idx),
s->cur_entry_offset,
j->cur_entry_u64s,
s->prev_buf_unwritten,
j->last_seq_ondisk);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
pr_buf(&out, "error\n");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
pr_buf(&out, "closed\n");
break;
default:
pr_buf(&out, "%u/%u\n",
s.cur_entry_offset,
j->cur_entry_u64s);
break;
}
pr_buf(&out,
"current entry refs:\t%u\n"
"prev entry unwritten:\t",
journal_state_count(s, s.idx));
if (s.prev_buf_unwritten)
pr_buf(&out, "yes, ref %u\n",
journal_state_count(s, !s.idx));
else
pr_buf(&out, "no\n");
pr_buf(&out,
"need write:\t\t%i\n"
"replay done:\t\t%i\n",
test_bit(JOURNAL_NEED_WRITE, &j->flags),
journal_entry_is_open(j),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
for_each_member_device_rcu(ca, c, iter,
@ -1119,9 +1134,12 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
pr_buf(&out,
"dev %u:\n"
"\tnr\t\t%u\n"
"\tavailable\t%u:%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
iter, ja->nr,
bch2_journal_dev_buckets_available(j, ja),
ja->sectors_free,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
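/*
* With the reworked format above, the debug output reads roughly as follows
* (all values illustrative only):
*
*	active journal entries:	3
*	seq:			12345
*	last_seq:		12343
*	last_seq_ondisk:	12342
*	current entry:		128/512
*	current entry refs:	1
*	prev entry unwritten:	no
*	need write:		0
*	replay done:		1
*	dev 0:
*		nr		8
*		available	3:496
*		cur_idx		6 (seq 12345)
*		last_idx	2 (seq 12340)
*/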


@ -178,6 +178,11 @@ static inline unsigned jset_u64s(unsigned u64s)
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
static inline int journal_entry_overhead(struct journal *j)
{
return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
}
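/*
* Sketch of how this is consumed (taken from journal_entry_open()): the
* space available to reservations in an open entry is the buffer size minus
* this overhead, clamped to the representable range:
*
*	u64s = (int) (buf->sectors << 9) / sizeof(u64) -
*		journal_entry_overhead(j);
*	u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
*/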
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
@ -222,7 +227,7 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *
id, 0, k, k->k.u64s);
}
void bch2_journal_buf_put_slowpath(struct journal *, bool);
void __bch2_journal_buf_put(struct journal *, bool);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
bool need_write_just_set)
@ -233,17 +238,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
.buf0_count = idx == 0,
.buf1_count = idx == 1,
}).v, &j->reservations.counter);
EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
/*
* Do not initiate a journal write if the journal is in an error state
* (previous journal entry write may have failed)
*/
if (s.idx != idx &&
!journal_state_count(s, idx) &&
s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
bch2_journal_buf_put_slowpath(j, need_write_just_set);
if (!journal_state_count(s, idx)) {
EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
__bch2_journal_buf_put(j, need_write_just_set);
}
}
/*
@ -291,6 +289,8 @@ static inline int journal_res_get_fast(struct journal *j,
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
EBUG_ON(!journal_state_count(new, new.idx));
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
@ -330,6 +330,8 @@ out:
return 0;
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,
struct journal_entry_res *,
unsigned);
@ -367,6 +369,9 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
}
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);


@ -825,7 +825,6 @@ fsck_err:
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
struct journal_entry_pin_list *pin_list;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
@ -854,7 +853,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK);
}
if (ret) {
@ -866,10 +866,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
cond_resched();
}
pin_list = journal_seq_pin(j, j->replay_journal_seq);
if (atomic_dec_and_test(&pin_list->count))
journal_wake(j);
bch2_journal_pin_put(j, j->replay_journal_seq);
}
j->replay_journal_seq = 0;
@ -884,82 +881,6 @@ err:
/* journal write: */
static unsigned journal_dev_buckets_available(struct journal *j,
struct journal_device *ja)
{
unsigned next = (ja->cur_idx + 1) % ja->nr;
unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
if (available &&
journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
--available;
return available;
}
/* returns number of sectors available for next journal entry: */
int bch2_journal_entry_sectors(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned sectors_available = UINT_MAX;
unsigned i, nr_online = 0, nr_devs = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_this_device, sectors_this_device;
if (!ja->nr)
continue;
buckets_this_device = journal_dev_buckets_available(j, ja);
sectors_this_device = ja->sectors_free;
nr_online++;
/*
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
if (j->prev_buf_sectors >= sectors_this_device) {
if (!buckets_this_device)
continue;
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
sectors_this_device -= j->prev_buf_sectors;
if (buckets_this_device)
sectors_this_device = ca->mi.bucket_size;
if (!sectors_this_device)
continue;
sectors_available = min(sectors_available,
sectors_this_device);
nr_devs++;
}
rcu_read_unlock();
if (nr_online < c->opts.metadata_replicas_required)
return -EROFS;
if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
return 0;
return sectors_available;
}
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
struct dev_alloc_list *devs_sorted,
@ -1033,7 +954,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_JOURNAL]);
spin_lock(&j->lock);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@ -1049,7 +969,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
journal_dev_buckets_available(j, ja)) {
bch2_journal_dev_buckets_available(j, ja)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
}
@ -1058,10 +978,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
done:
if (replicas >= replicas_want)
j->prev_buf_sectors = 0;
spin_unlock(&j->lock);
rcu_read_unlock();
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
@ -1116,17 +1032,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
if (buf->size >= new_size)
if (buf->buf_size >= new_size)
return;
new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
if (!new_buf)
return;
memcpy(new_buf, buf->data, buf->size);
kvpfree(buf->data, buf->size);
memcpy(new_buf, buf->data, buf->buf_size);
kvpfree(buf->data, buf->buf_size);
buf->data = new_buf;
buf->size = new_size;
buf->buf_size = new_size;
}
static void journal_write_done(struct closure *cl)
@ -1166,7 +1082,7 @@ static void journal_write_done(struct closure *cl)
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
out:
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
@ -1220,20 +1136,22 @@ void bch2_journal_write(struct closure *cl)
struct bch_extent_ptr *ptr;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s;
int ret;
bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
start = vstruct_last(w->data);
start = vstruct_last(jset);
end = bch2_journal_super_entries_add_common(c, start);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
le32_add_cpu(&w->data->u64s, u64s);
BUG_ON(vstruct_sectors(jset, c->block_bits) >
w->disk_sectors);
le32_add_cpu(&jset->u64s, u64s);
BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
journal_write_compact(jset);
@ -1271,12 +1189,28 @@ void bch2_journal_write(struct closure *cl)
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
BUG_ON(sectors > j->prev_buf_sectors);
BUG_ON(sectors > w->sectors);
bytes = vstruct_bytes(w->data);
memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
bytes = vstruct_bytes(jset);
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
if (journal_write_alloc(j, w, sectors)) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w, sectors);
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
*/
w->sectors = 0;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
spin_unlock(&j->lock);
if (ret) {
bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
@ -1316,7 +1250,7 @@ void bch2_journal_write(struct closure *cl)
trace_journal_write(bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
for_each_rw_member(ca, c, i)


@ -39,7 +39,6 @@ int bch2_journal_read(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);
#endif /* _BCACHEFS_JOURNAL_IO_H */


@ -1,15 +1,213 @@
#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
/* Free space calculations: */
unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned next = (ja->cur_idx + 1) % ja->nr;
unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
/*
* Allocator startup needs some journal space before we can do journal
* replay:
*/
if (available &&
test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
available--;
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
if (available &&
journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
--available;
return available;
}
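/*
* Worked example (hypothetical numbers): with ja->nr = 8, ja->cur_idx = 6
* and ja->last_idx = 2, next = 7 and available = (2 + 8 - 7) % 8 = 3,
* before the two conditional decrements above are applied.
*/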
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned sectors_next_entry = UINT_MAX;
unsigned sectors_total = UINT_MAX;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs = 0;
unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
? journal_prev_buf(j)->sectors
: 0;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_this_device, sectors_this_device;
if (!ja->nr)
continue;
nr_online++;
buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
sectors_this_device = ja->sectors_free;
/*
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
if (unwritten_sectors >= sectors_this_device) {
if (!buckets_this_device)
continue;
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
sectors_this_device -= unwritten_sectors;
if (sectors_this_device < ca->mi.bucket_size &&
buckets_this_device) {
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
if (!sectors_this_device)
continue;
sectors_next_entry = min(sectors_next_entry,
sectors_this_device);
sectors_total = min(sectors_total,
buckets_this_device * ca->mi.bucket_size +
sectors_this_device);
max_entry_size = min_t(unsigned, max_entry_size,
ca->mi.bucket_size);
nr_devs++;
}
rcu_read_unlock();
if (nr_online < c->opts.metadata_replicas_required) {
ret = -EROFS;
sectors_next_entry = 0;
} else if (!sectors_next_entry ||
nr_devs < min_t(unsigned, nr_online,
c->opts.metadata_replicas)) {
ret = -ENOSPC;
sectors_next_entry = 0;
} else if (!fifo_free(&j->pin)) {
ret = -ENOSPC;
sectors_next_entry = 0;
}
j->cur_entry_sectors = sectors_next_entry;
j->cur_entry_error = ret;
if (!ret)
journal_wake(j);
}
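/*
* Net effect (restating the logic above): j->cur_entry_sectors is the space
* journal_entry_open() may use for the next entry (0 if nothing can be
* opened right now), and j->cur_entry_error is 0, -ENOSPC (not enough space
* on enough devices, or the pin fifo is full) or -EROFS (fewer than
* metadata_replicas_required journal devices online).
*/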
/* Discards - last part of journal reclaim: */
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
spin_unlock(&j->lock);
return ret;
}
/*
* Advance ja->last_idx as long as it points to buckets that are no longer
* dirty, issuing discards if necessary:
*/
static void journal_do_discards(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned iter;
mutex_lock(&j->reclaim_lock);
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
if (ca->mi.discard &&
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
bch2_journal_space_available(j);
spin_unlock(&j->lock);
}
}
mutex_unlock(&j->reclaim_lock);
}
/*
* Journal entry pinning - machinery for holding a reference on a given journal
* entry, holding it open to ensure it gets replayed during recovery:
*/
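/*
* Rough usage sketch (not taken from this patch): the owner of a dirty
* object takes a pin on the sequence number its keys were journalled at,
* and reclaim calls the pin's flush_fn when that entry needs to be freed:
*
*	bch2_journal_pin_add(j, seq, pin, flush_fn);
*	...
*	bch2_journal_pin_flush(j, pin);	// wait if it is currently being flushed
*/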
static void bch2_journal_reclaim_fast(struct journal *j)
{
struct journal_entry_pin_list temp;
bool popped = false;
lockdep_assert_held(&j->lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
if (popped)
bch2_journal_space_available(j);
}
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
if (atomic_dec_and_test(&pin_list->count)) {
spin_lock(&j->lock);
bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
}
}
static inline void __journal_pin_add(struct journal *j,
u64 seq,
struct journal_entry_pin *pin,
@ -24,10 +222,7 @@ static inline void __journal_pin_add(struct journal *j,
pin->seq = seq;
pin->flush = flush_fn;
if (flush_fn)
list_add(&pin->list, &pin_list->list);
else
INIT_LIST_HEAD(&pin->list);
list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
/*
* If the journal is currently full, we might want to call flush_fn
@ -129,88 +324,55 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
* data off of a specific device:
*/
/**
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
*
* Called from IO submission context, does not block. Cleans up after btree
* write completions by advancing the journal pin and each cache's last_idx,
* kicking off discards and background reclaim as necessary.
*/
void bch2_journal_reclaim_fast(struct journal *j)
{
struct journal_entry_pin_list temp;
bool popped = false;
lockdep_assert_held(&j->lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
if (popped)
journal_wake(j);
}
static void journal_pin_mark_flushing(struct journal *j,
struct journal_entry_pin *pin,
u64 seq)
{
lockdep_assert_held(&j->reclaim_lock);
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
BUG_ON(j->flush_in_progress);
j->flush_in_progress = pin;
}
static void journal_pin_flush(struct journal *j,
struct journal_entry_pin *pin,
u64 seq)
{
pin->flush(j, pin, seq);
BUG_ON(j->flush_in_progress != pin);
j->flush_in_progress = NULL;
wake_up(&j->pin_flush_wait);
}
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
/* no need to iterate over empty fifo entries: */
bch2_journal_reclaim_fast(j);
spin_lock(&j->lock);
BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count));
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
if (*seq > seq_to_flush ||
if (*seq > max_seq ||
(ret = list_first_entry_or_null(&pin_list->list,
struct journal_entry_pin, list)))
break;
return ret;
}
if (ret) {
list_move(&ret->list, &pin_list->flushed);
BUG_ON(j->flush_in_progress);
j->flush_in_progress = ret;
j->last_flushed = jiffies;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
(ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
spin_unlock(&j->lock);
return ret;
}
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
unsigned min_nr)
{
struct journal_entry_pin *pin;
u64 seq;
lockdep_assert_held(&j->reclaim_lock);
while ((pin = journal_get_next_pin(j, min_nr
? U64_MAX : seq_to_flush, &seq))) {
if (min_nr)
min_nr--;
pin->flush(j, pin, seq);
BUG_ON(j->flush_in_progress != pin);
j->flush_in_progress = NULL;
wake_up(&j->pin_flush_wait);
}
}
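/*
* While min_nr is nonzero, pins are flushed regardless of seq_to_flush
* (journal_get_next_pin() is passed U64_MAX); this is how the reclaim work
* item below guarantees that at least one pin is flushed once
* j->reclaim_delay_ms has elapsed since the last flush.
*/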
/**
* bch2_journal_reclaim_work - free up journal buckets
*
@ -235,104 +397,44 @@ void bch2_journal_reclaim_work(struct work_struct *work)
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_dev *ca;
struct journal_entry_pin *pin;
u64 seq, seq_to_flush = 0;
unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
unsigned iter, bucket_to_flush, min_nr = 0;
u64 seq_to_flush = 0;
journal_do_discards(j);
mutex_lock(&j->reclaim_lock);
spin_lock(&j->lock);
/*
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
*/
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
while (should_discard_bucket(j, ja)) {
if (!reclaim_lock_held) {
/*
* ugh:
* might be called from __journal_res_get()
* under wait_event() - have to go back to
* TASK_RUNNING before doing something that
* would block, but only if we're doing work:
*/
__set_current_state(TASK_RUNNING);
mutex_lock(&j->reclaim_lock);
reclaim_lock_held = true;
/* recheck under reclaim_lock: */
continue;
}
if (ca->mi.discard &&
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
journal_wake(j);
}
/*
* Write out enough btree nodes to free up 50% journal
* buckets
*/
spin_lock(&j->lock);
/* Try to keep the journal at most half full: */
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
}
/* Also flush if the pin fifo is more than half full */
spin_lock(&j->lock);
seq_to_flush = max_t(s64, seq_to_flush,
(s64) journal_cur_seq(j) -
(j->pin.size >> 1));
spin_unlock(&j->lock);
/*
* If it's been longer than j->reclaim_delay_ms since we last flushed,
* make sure to flush at least one journal pin:
*/
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
need_flush = time_after(jiffies, next_flush);
if (time_after(jiffies, j->last_flushed +
msecs_to_jiffies(j->reclaim_delay_ms)))
min_nr = 1;
while ((pin = journal_get_next_pin(j, need_flush
? U64_MAX
: seq_to_flush, &seq))) {
if (!reclaim_lock_held) {
spin_unlock(&j->lock);
__set_current_state(TASK_RUNNING);
mutex_lock(&j->reclaim_lock);
reclaim_lock_held = true;
spin_lock(&j->lock);
continue;
}
journal_flush_pins(j, seq_to_flush, min_nr);
journal_pin_mark_flushing(j, pin, seq);
spin_unlock(&j->lock);
journal_pin_flush(j, pin, seq);
need_flush = false;
j->last_flushed = jiffies;
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
if (reclaim_lock_held)
mutex_unlock(&j->reclaim_lock);
mutex_unlock(&j->reclaim_lock);
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
@ -341,8 +443,6 @@ void bch2_journal_reclaim_work(struct work_struct *work)
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
struct journal_entry_pin *pin;
u64 pin_seq;
int ret;
ret = bch2_journal_error(j);
@ -350,16 +450,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
return ret;
mutex_lock(&j->reclaim_lock);
journal_flush_pins(j, seq_to_flush, 0);
spin_lock(&j->lock);
while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) {
journal_pin_mark_flushing(j, pin, pin_seq);
spin_unlock(&j->lock);
journal_pin_flush(j, pin, pin_seq);
spin_lock(&j->lock);
}
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers


@ -3,6 +3,10 @@
#define JOURNAL_PIN (32 * 1024)
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->seq != 0;
@ -16,6 +20,8 @@ journal_seq_pin(struct journal *j, u64 seq)
return &j->pin.data[seq & j->pin.mask];
}
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
@ -27,7 +33,6 @@ void bch2_journal_pin_add_if_older(struct journal *,
journal_pin_flush_fn);
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
void bch2_journal_flush_pins(struct journal *, u64);


@ -21,8 +21,10 @@ struct journal_buf {
struct closure_waitlist wait;
unsigned size;
unsigned disk_sectors;
unsigned buf_size; /* size in bytes of @data */
unsigned sectors; /* maximum size for current entry */
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
@ -128,9 +130,20 @@ struct journal {
unsigned long flags;
union journal_res_state reservations;
/* Max size of current journal entry */
unsigned cur_entry_u64s;
unsigned prev_buf_sectors;
unsigned cur_buf_sectors;
unsigned cur_entry_sectors;
/*
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
int cur_entry_error;
/* Reserved space in journal entry to be used just prior to write */
unsigned entry_u64s_reserved;
unsigned buf_size_want;
/*
@ -141,6 +154,9 @@ struct journal {
spinlock_t lock;
/* if nonzero, we may not open a new journal entry: */
unsigned blocked;
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
@ -155,9 +171,6 @@ struct journal {
u64 seq_ondisk;
u64 last_seq_ondisk;
/* Reserved space in journal entry to be used just prior to write */
unsigned entry_u64s_reserved;
/*
* FIFO of journal entries whose btree updates have not yet been
* written out.


@ -82,7 +82,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v));
break;
case FS_USAGE_INODES:
percpu_u64_set(&c->usage[0]->s.nr_inodes,
percpu_u64_set(&c->usage[0]->nr_inodes,
le64_to_cpu(u->v));
break;
case FS_USAGE_KEY_VERSION:
@ -406,22 +406,19 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
ret = bch2_gc(c, &journal, true);
if (ret)
goto err;
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i)
if (bch2_dev_journal_alloc(ca)) {
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
}
/*
* journal_res_get() will crash if called before this has


@ -244,14 +244,14 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
*dst = *src;
for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
if (!src->data[src_idx])
if (!src->replicas[src_idx])
continue;
dst_idx = __replicas_entry_idx(dst_r,
cpu_replicas_entry(src_r, src_idx));
BUG_ON(dst_idx < 0);
dst->data[dst_idx] = src->data[src_idx];
dst->replicas[dst_idx] = src->replicas[src_idx];
}
}
@ -261,39 +261,37 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
struct bch_fs_usage __percpu *new_scratch = NULL;
unsigned bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
unsigned i;
int ret = -ENOMEM;
for (i = 0; i < 3; i++) {
if (i < 2 && !c->usage[i])
continue;
if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO)) ||
(c->usage[1] &&
!(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO))) ||
!(new_scratch = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO)))
goto err;
new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO);
if (!new_usage[i])
goto err;
}
if (c->usage[0])
__replicas_table_update(new_usage[0], new_r,
c->usage[0], &c->replicas);
if (c->usage[1])
__replicas_table_update(new_usage[1], new_r,
c->usage[1], &c->replicas);
for (i = 0; i < 2; i++) {
if (!c->usage[i])
continue;
__replicas_table_update(new_usage[i], new_r,
c->usage[i], &c->replicas);
swap(c->usage[i], new_usage[i]);
}
swap(c->usage_scratch, new_usage[2]);
swap(c->replicas, *new_r);
swap(c->usage[0], new_usage[0]);
swap(c->usage[1], new_usage[1]);
swap(c->usage_scratch, new_scratch);
swap(c->replicas, *new_r);
ret = 0;
err:
for (i = 0; i < 3; i++)
free_percpu(new_usage[i]);
free_percpu(new_scratch);
free_percpu(new_usage[1]);
free_percpu(new_usage[0]);
return ret;
}
@ -456,7 +454,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
if (__replicas_has_entry(&c->replicas_gc, e))
continue;
v = percpu_u64_get(&c->usage[0]->data[i]);
v = percpu_u64_get(&c->usage[0]->replicas[i]);
if (!v)
continue;
@ -557,7 +555,7 @@ int bch2_replicas_set_usage(struct bch_fs *c,
BUG_ON(ret < 0);
}
percpu_u64_set(&c->usage[0]->data[idx], sectors);
percpu_u64_set(&c->usage[0]->replicas[idx], sectors);
return 0;
}
@ -974,5 +972,6 @@ int bch2_fs_replicas_init(struct bch_fs *c)
{
c->journal.entry_u64s_reserved +=
reserve_journal_replicas(c, &c->replicas);
return 0;
return replicas_table_update(c, &c->replicas);
}


@ -125,7 +125,7 @@ struct bch_hash_desc {
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
};
static inline struct btree_iter *
static __always_inline struct btree_iter *
bch2_hash_lookup(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@ -159,7 +159,7 @@ bch2_hash_lookup(struct btree_trans *trans,
return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
}
static inline struct btree_iter *
static __always_inline struct btree_iter *
bch2_hash_hole(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@ -185,10 +185,11 @@ bch2_hash_hole(struct btree_trans *trans,
return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
}
static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *start)
static __always_inline
int bch2_hash_needs_whiteout(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *start)
{
struct btree_iter *iter;
struct bkey_s_c k;
@ -211,10 +212,11 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
return btree_iter_err(k);
}
static inline int __bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, struct bkey_i *insert, int flags)
static __always_inline
int __bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, struct bkey_i *insert, int flags)
{
struct btree_iter *iter, *slot = NULL;
struct bkey_s_c k;
@ -276,10 +278,11 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc,
inode, insert, flags));
}
static inline int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter)
static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter)
{
struct bkey_i *delete;
int ret;
@ -300,10 +303,11 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans,
return 0;
}
static inline int bch2_hash_delete(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, const void *key)
static __always_inline
int bch2_hash_delete(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, const void *key)
{
struct btree_iter *iter;


@ -136,7 +136,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
sb->bio = bio;
}
new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order);
if (!new_sb)
return -ENOMEM;
@ -923,7 +923,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_down_read_preempt_disable(&c->mark_lock);
{
u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes);
u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
@ -970,7 +970,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
u64 sectors = percpu_u64_get(&c->usage[0]->data[i]);
u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);


@ -567,7 +567,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
unsigned i, iter_size, fs_usage_size;
unsigned i, iter_size;
const char *err;
pr_verbose_init(opts, "");
@ -661,9 +661,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct btree_node_iter_set);
fs_usage_size = sizeof(struct bch_fs_usage) +
sizeof(u64) * c->replicas.nr;
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc",
@ -680,8 +677,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
!(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||


@ -243,17 +243,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity);
pr_buf(&out, "hidden:\t\t\t\t%llu\n",
fs_usage->s.hidden);
fs_usage->hidden);
pr_buf(&out, "data:\t\t\t\t%llu\n",
fs_usage->s.data);
fs_usage->data);
pr_buf(&out, "cached:\t\t\t\t%llu\n",
fs_usage->s.cached);
fs_usage->cached);
pr_buf(&out, "reserved:\t\t\t%llu\n",
fs_usage->s.reserved);
fs_usage->reserved);
pr_buf(&out, "nr_inodes:\t\t\t%llu\n",
fs_usage->s.nr_inodes);
fs_usage->nr_inodes);
pr_buf(&out, "online reserved:\t\t%llu\n",
fs_usage->s.online_reserved);
fs_usage->online_reserved);
for (i = 0;
i < ARRAY_SIZE(fs_usage->persistent_reserved);
@ -269,7 +269,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
pr_buf(&out, "\t");
bch2_replicas_entry_to_text(&out, e);
pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]);
}
percpu_up_read_preempt_enable(&c->mark_lock);