Update bcachefs sources to 26409a8f75 bcachefs: Journal updates to dev usage

Kent Overstreet 2021-02-02 14:26:28 -05:00
parent 7eef5f46dd
commit 4064aa126e
31 changed files with 804 additions and 704 deletions

View File

@ -1 +1 @@
ea3414eed52e5d90c248453e84b2dcd91c960306
26409a8f755b8faa620a49796d7935566204daaf

View File

@ -572,14 +572,10 @@ int cmd_list_journal(int argc, char *argv[])
printf("journal entry %8llu\n"
" version %8u\n"
" last seq %8llu\n"
" read clock %8u\n"
" write clock %8u\n"
,
le64_to_cpu(p->j.seq),
le32_to_cpu(p->j.version),
le64_to_cpu(p->j.last_seq),
le16_to_cpu(p->j.read_clock),
le16_to_cpu(p->j.write_clock));
le64_to_cpu(p->j.last_seq));
for_each_jset_key(k, _n, entry, &p->j) {
char buf[200];

View File

@ -623,8 +623,6 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
printf(" flags: %x", le32_to_cpu(clean->flags));
printf(" read clock: %x", le16_to_cpu(clean->read_clock));
printf(" write clock: %x", le16_to_cpu(clean->write_clock));
printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq));
}

View File

@ -14,6 +14,7 @@
#include "ec.h"
#include "error.h"
#include "recovery.h"
#include "varint.h"
#include <linux/kthread.h>
#include <linux/math64.h>
@ -24,15 +25,12 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static const char * const bch2_alloc_field_names[] = {
#define x(name, bytes) #name,
BCH_ALLOC_FIELDS()
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1()
#undef x
NULL
};
static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
/* Persistent alloc info: */
static inline u64 get_alloc_field(const struct bch_alloc *a,
const void **p, unsigned field)
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
const void **p, unsigned field)
{
unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
u64 v;
if (!(a->fields & (1 << field)))
@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
return v;
}
static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
unsigned field, u64 v)
static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
unsigned field, u64 v)
{
unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
if (!v)
return;
@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
struct bkey_s_c k)
{
const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
const void *d = in->data;
unsigned idx = 0;
out->gen = in->gen;
#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
BCH_ALLOC_FIELDS_V1()
#undef x
}
static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
struct bkey_i_alloc *a = bkey_alloc_init(&dst->k);
void *d = a->v.data;
unsigned bytes, idx = 0;
a->k.p = POS(src.dev, src.bucket);
a->v.fields = 0;
a->v.gen = src.gen;
#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
BCH_ALLOC_FIELDS_V1()
#undef x
bytes = (void *) d - (void *) &a->v;
set_bkey_val_bytes(&a->k, bytes);
memset_u64s_tail(&a->v, 0, bytes);
}
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
struct bkey_s_c k)
{
struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
const u8 *in = a.v->data;
const u8 *end = bkey_val_end(a);
unsigned fieldnr = 0;
int ret;
u64 v;
out->gen = a.v->gen;
out->oldest_gen = a.v->oldest_gen;
out->data_type = a.v->data_type;
#define x(_name, _bits) \
if (fieldnr < a.v->nr_fields) { \
ret = bch2_varint_decode(in, end, &v); \
if (ret < 0) \
return ret; \
in += ret; \
} else { \
v = 0; \
} \
out->_name = v; \
if (v != out->_name) \
return -1; \
fieldnr++;
BCH_ALLOC_FIELDS_V2()
#undef x
return 0;
}
static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
u8 *out = a->v.data;
u8 *end = (void *) &dst[1];
u8 *last_nonzero_field = out;
unsigned bytes;
a->k.p = POS(src.dev, src.bucket);
a->v.gen = src.gen;
a->v.oldest_gen = src.oldest_gen;
a->v.data_type = src.data_type;
#define x(_name, _bits) \
nr_fields++; \
\
if (src._name) { \
out += bch2_varint_encode(out, src._name); \
\
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
} else { \
*out++ = 0; \
}
BCH_ALLOC_FIELDS_V2()
#undef x
BUG_ON(out > end);
out = last_nonzero_field;
a->v.nr_fields = last_nonzero_fieldnr;
bytes = (u8 *) out - (u8 *) &a->v;
set_bkey_val_bytes(&a->k, bytes);
memset_u64s_tail(&a->v, 0, bytes);
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
struct bkey_alloc_unpacked ret = { .gen = 0 };
struct bkey_alloc_unpacked ret = {
.dev = k.k->p.inode,
.bucket = k.k->p.offset,
.gen = 0,
};
if (k.k->type == KEY_TYPE_alloc) {
const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
const void *d = a->data;
unsigned idx = 0;
if (k.k->type == KEY_TYPE_alloc_v2)
bch2_alloc_unpack_v2(&ret, k);
else if (k.k->type == KEY_TYPE_alloc)
bch2_alloc_unpack_v1(&ret, k);
ret.gen = a->gen;
#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
BCH_ALLOC_FIELDS()
#undef x
}
return ret;
}
void bch2_alloc_pack(struct bkey_i_alloc *dst,
void bch2_alloc_pack(struct bch_fs *c,
struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
unsigned idx = 0;
void *d = dst->v.data;
unsigned bytes;
dst->v.fields = 0;
dst->v.gen = src.gen;
#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
BCH_ALLOC_FIELDS()
#undef x
bytes = (void *) d - (void *) &dst->v;
set_bkey_val_bytes(&dst->k, bytes);
memset_u64s_tail(&dst->v, 0, bytes);
if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))
bch2_alloc_pack_v2(dst, src);
else
bch2_alloc_pack_v1(dst, src);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
if (a->fields & (1 << i))
bytes += BCH_ALLOC_FIELD_BYTES[i];
bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
return DIV_ROUND_UP(bytes, sizeof(u64));
}
const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
const void *d = a.v->data;
unsigned i;
struct bkey_alloc_unpacked u;
pr_buf(out, "gen %u", a.v->gen);
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
return "invalid device";
for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
if (a.v->fields & (1 << i))
pr_buf(out, " %s %llu",
bch2_alloc_field_names[i],
get_alloc_field(a.v, &d, i));
if (bch2_alloc_unpack_v2(&u, k))
return "unpack error";
return NULL;
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
pr_buf(out, "gen %u oldest_gen %u data_type %u",
u.gen, u.oldest_gen, u.data_type);
#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
BCH_ALLOC_FIELDS_V2()
#undef x
}
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@ -213,7 +315,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
struct bucket *g;
struct bkey_alloc_unpacked u;
if (level || k.k->type != KEY_TYPE_alloc)
if (level ||
(k.k->type != KEY_TYPE_alloc &&
k.k->type != KEY_TYPE_alloc_v2))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct bch_dev *ca;
unsigned i;
int ret = 0;
int ret;
down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->bucket_clock[READ].lock);
mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0;
}
@ -281,8 +363,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
__BKEY_PADDED(k, 8) alloc_key; /* hack: */
struct bkey_i_alloc *a;
struct bkey_alloc_buf a;
int ret;
retry:
bch2_trans_begin(trans);
@ -303,17 +384,14 @@ retry:
ca = bch_dev_bkey_exists(c, iter->pos.inode);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
new_u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, new_u);
bch2_trans_update(trans, iter, &a->k_i,
bch2_alloc_pack(c, &a, new_u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
@ -358,114 +436,6 @@ err:
/* Bucket IO clocks: */
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
u16 max_last_io = 0;
unsigned i;
lockdep_assert_held(&c->bucket_clock[rw].lock);
/* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
ca->max_last_bucket_io[rw] = max_last_io;
/* Recalculate global max_last_io: */
max_last_io = 0;
for_each_member_device(ca, c, i)
max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
clock->max_last_io = max_last_io;
}
static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
unsigned i;
trace_rescale_prios(c);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
g->io_time[rw] = clock->hand -
bucket_last_io(c, g, rw) / 2;
bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
}
static inline u64 bucket_clock_freq(u64 capacity)
{
return max(capacity >> 10, 2028ULL);
}
static void bch2_inc_clock_hand(struct io_timer *timer)
{
struct bucket_clock *clock = container_of(timer,
struct bucket_clock, rescale);
struct bch_fs *c = container_of(clock,
struct bch_fs, bucket_clock[clock->rw]);
struct bch_dev *ca;
u64 capacity;
unsigned i;
mutex_lock(&clock->lock);
/* if clock cannot be advanced more, rescale prio */
if (clock->max_last_io >= U16_MAX - 2)
bch2_rescale_bucket_io_times(c, clock->rw);
BUG_ON(clock->max_last_io >= U16_MAX - 2);
for_each_member_device(ca, c, i)
ca->max_last_bucket_io[clock->rw]++;
clock->max_last_io++;
clock->hand++;
mutex_unlock(&clock->lock);
capacity = READ_ONCE(c->capacity);
if (!capacity)
return;
/*
* we only increment when 0.1% of the filesystem capacity has been read
* or written too, this determines if it's time
*
* XXX: we shouldn't really be going off of the capacity of devices in
* RW mode (that will be 0 when we're RO, yet we can still service
* reads)
*/
timer->expire += bucket_clock_freq(capacity);
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
clock->hand = 1;
clock->rw = rw;
clock->rescale.fn = bch2_inc_clock_hand;
clock->rescale.expire = bucket_clock_freq(c->capacity);
mutex_init(&clock->lock);
}
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
@ -473,9 +443,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
struct btree_iter *iter;
struct bucket *g;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
u16 *time;
u64 *time, now;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@ -486,28 +456,25 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret)
goto out;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
u = alloc_mem_to_key(g, READ_ONCE(g->mark));
u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
time = rw == READ ? &u.read_time : &u.write_time;
if (*time == c->bucket_clock[rw].hand)
now = atomic64_read(&c->io_clock[rw].now);
if (*time == now)
goto out;
*time = c->bucket_clock[rw].hand;
*time = now;
bch2_alloc_pack(a, u);
ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
bch2_alloc_pack(c, a, u);
ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
bch2_trans_iter_put(trans, iter);
@ -576,23 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
size_t bucket,
struct bucket_mark mark)
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m)
{
u8 gc_gen;
if (!is_available_bucket(mark))
if (!is_available_bucket(m))
return false;
if (mark.owned_by_allocator)
if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
test_bit(bucket, ca->buckets_nouse))
test_bit(b, ca->buckets_nouse))
return false;
gc_gen = bucket_gc_gen(ca, bucket);
gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
@ -606,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
*
*
* - We take into account the read prio of the bucket, which gives us an
* indication of how hot the data is -- we scale the prio so that the prio
* farthest from the clock is worth 1/8th of the closest.
*
* - The number of sectors of cached data in the bucket, which gives us an
* indication of the cost in cache misses this eviction will cause.
*
* - If hotness * sectors used compares equal, we pick the bucket with the
* smallest bucket_gc_gen() - since incrementing the same bucket's generation
* number repeatedly forces us to run mark and sweep gc to avoid generation
* number wraparound.
*/
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark m)
static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
u64 now, u64 last_seq_ondisk)
{
unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
unsigned max_last_io = ca->max_last_bucket_io[READ];
unsigned used = bucket_sectors_used(m);
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
if (used) {
/*
* Prefer to keep buckets that have been read more recently, and
* buckets that have more data in them:
*/
u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
(hotness + 1) * bucket_sectors_used(m);
unsigned long needs_journal_commit =
bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
return (data_wantness << 9) |
(needs_journal_commit << 8) |
(bucket_gc_gen(ca, b) / 16);
return -last_read_scaled;
} else {
/*
* Prefer to use buckets with smaller gc_gen so that we don't
* have to walk the btree and recalculate oldest_gen - but shift
* off the low bits so that buckets will still have equal sort
* keys when there's only a small difference, so that we can
* keep sequential buckets together:
*/
return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
(bucket_gc_gen(g) >> 4);
}
}
static inline int bucket_alloc_cmp(alloc_heap *h,
@ -665,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
ca->alloc_heap.used = 0;
mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
bch2_recalc_oldest_io(c, ca, READ);
ca->alloc_heap.used = 0;
now = atomic64_read(&c->io_clock[READ].now);
last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@ -682,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
unsigned long key = bucket_sort_key(c, ca, b, m);
struct bucket *g = &buckets->b[b];
struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@ -718,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
}
up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@ -863,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
#if 0
__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
/* hack: */
__BKEY_PADDED(k, 8) alloc_key;
#endif
struct bch_fs *c = trans->c;
struct bkey_i_alloc *a;
struct bkey_alloc_buf a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
@ -920,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
ret = bch2_btree_iter_traverse(iter);
@ -931,7 +878,7 @@ retry:
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(g, m);
u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
@ -941,14 +888,11 @@ retry:
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = c->bucket_clock[READ].hand;
u.write_time = c->bucket_clock[WRITE].hand;
u.read_time = atomic64_read(&c->io_clock[READ].now);
u.write_time = atomic64_read(&c->io_clock[WRITE].now);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i,
bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
@ -1455,8 +1399,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
bch2_bucket_clock_init(c, READ);
bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
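
A rough illustration of the v2 packing scheme used by bch2_alloc_pack_v2()/bch2_alloc_unpack_v2() above: each field is stored as a variable-length integer and trailing zero fields are dropped, with nr_fields recording how many were kept. The plain LEB128-style varint below is only a stand-in for bch2_varint_encode()/bch2_varint_decode(), whose actual byte format is not part of this diff.

/* Sketch only -- not bcachefs's on-disk varint encoding. */
#include <stdint.h>
#include <stddef.h>

static size_t varint_encode(uint8_t *out, uint64_t v)
{
	size_t n = 0;

	do {
		out[n++] = (v & 0x7f) | (v > 0x7f ? 0x80 : 0);
		v >>= 7;
	} while (v);

	return n;
}

static size_t pack_fields(uint8_t *out, const uint64_t *fields,
			  unsigned nr, unsigned *nr_fields)
{
	uint8_t *p = out, *last_nonzero = out;
	unsigned last_nonzero_fieldnr = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (fields[i]) {
			p += varint_encode(p, fields[i]);
			last_nonzero = p;
			last_nonzero_fieldnr = i + 1;
		} else {
			*p++ = 0;	/* zero field: a single zero byte */
		}
	}

	/* trailing zero fields are truncated, like bch2_alloc_pack_v2() */
	*nr_fields = last_nonzero_fieldnr;
	return last_nonzero - out;
}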

View File

@ -7,12 +7,33 @@
#include "debug.h"
struct bkey_alloc_unpacked {
u64 bucket;
u8 dev;
u8 gen;
u8 oldest_gen;
u8 data_type;
#define x(_name, _bits) u##_bits _name;
BCH_ALLOC_FIELDS()
BCH_ALLOC_FIELDS_V2()
#undef x
};
struct bkey_alloc_buf {
struct bkey_i k;
union {
struct {
#define x(_name, _bits) + _bits / 8
u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
#undef x
} _v1;
struct {
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
#undef x
} _v2;
};
} __attribute__((packed, aligned(8)));
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
{
return l.gen != r.gen
#define x(_name, _bits) || l._name != r._name
BCH_ALLOC_FIELDS()
return l.gen != r.gen ||
l.oldest_gen != r.oldest_gen ||
l.data_type != r.data_type
#define x(_name, ...) || l._name != r._name
BCH_ALLOC_FIELDS_V2()
#undef x
;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
void bch2_alloc_pack(struct bkey_i_alloc *,
void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
const struct bkey_alloc_unpacked);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
static inline struct bkey_alloc_unpacked
alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
alloc_mem_to_key(struct btree_iter *iter,
struct bucket *g, struct bucket_mark m)
{
return (struct bkey_alloc_unpacked) {
.dev = iter->pos.inode,
.bucket = iter->pos.offset,
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
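
The net effect of these header changes on callers: the old bkey_i_alloc plus __BKEY_PADDED hack is replaced by struct bkey_alloc_buf (sized for either format), alloc_mem_to_key() now takes the iterator so it can fill in .dev/.bucket, and bch2_alloc_pack() picks v1 or v2 from the alloc_v2 feature bit. A condensed sketch of the flow the rest of this commit converts callers to; the helper name is hypothetical, and locking, buffer allocation via bch2_trans_kmalloc() and error handling are elided:

/* Hypothetical helper, illustrating the calling pattern only. */
static int alloc_update_one_bucket(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bucket *g, struct bucket_mark m)
{
	struct bch_fs *c = trans->c;
	struct bkey_alloc_buf a;
	struct bkey_alloc_unpacked u;

	u = alloc_mem_to_key(iter, g, m);	/* unpack in-memory state */

	/* ...modify whichever fields need updating... */
	u.read_time = atomic64_read(&c->io_clock[READ].now);

	bch2_alloc_pack(c, &a, u);		/* packs as v1 or v2 */
	return bch2_trans_update(trans, iter, &a.k, 0);
}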

View File

@ -10,30 +10,6 @@
struct ec_bucket_buf;
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
*
* Goes with the bucket read/write prios: when we read or write to a
* bucket we reset the bucket's prio to the current hand; thus hand -
* prio = time since bucket was last read/written.
*
* The units are some amount (bytes/sectors) of data read/written, and
* the units can change on the fly if we need to rescale to fit
* everything in a u16 - your only guarantee is that the units are
* consistent.
*/
u16 hand;
u16 max_last_io;
int rw;
struct io_timer rescale;
struct mutex lock;
};
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,

View File

@ -429,7 +429,9 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
struct bch_dev_usage *usage_base;
struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@ -451,9 +453,6 @@ struct bch_dev {
size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 max_last_bucket_io[2];
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
@ -473,6 +472,7 @@ struct bch_dev {
atomic64_t rebalance_work;
struct journal_device journal;
u64 prev_journal_sector;
struct work_struct io_error_work;
@ -584,6 +584,8 @@ struct bch_fs {
struct journal_entry_res replicas_journal_res;
struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
@ -691,14 +693,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage *usage_scratch;
/*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */

View File

@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k)
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19)
x(indirect_inline_data, 19) \
x(alloc_v2, 20)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
idx:51;
redundancy:4,
idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:51,
__u64 idx:47,
redundancy:4,
block:8,
type:5;
#endif
@ -799,35 +802,40 @@ struct bch_alloc {
__u8 data[];
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS() \
#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(oldest_gen, 8)
x(oldest_gen, 8) \
x(stripe, 32) \
x(stripe_redundancy, 8)
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(stripe, 32) \
x(stripe_redundancy, 8)
enum {
#define x(name, bytes) BCH_ALLOC_FIELD_##name,
BCH_ALLOC_FIELDS()
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
BCH_ALLOC_FIELD_NR
};
static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
BCH_ALLOC_FIELDS()
#undef x
};
#define x(name, bits) + (bits / 8)
static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
DIV_ROUND_UP(offsetof(struct bch_alloc, data)
BCH_ALLOC_FIELDS(), sizeof(u64));
#undef x
#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
/* Quotas: */
enum quota_types {
@ -1131,8 +1139,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
__le16 read_clock;
__le16 write_clock;
__le16 _read_clock; /* no longer used */
__le16 _write_clock;
__le64 journal_seq;
union {
@ -1305,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
/*
* Features:
@ -1332,7 +1341,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15) \
x(journal_no_flush, 16)
x(journal_no_flush, 16) \
x(alloc_v2, 17)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
@ -1340,7 +1350,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush))
(1ULL << BCH_FEATURE_journal_no_flush)| \
(1ULL << BCH_FEATURE_alloc_v2))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@ -1493,7 +1504,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
x(data_usage, 6)
x(data_usage, 6) \
x(clock, 7) \
x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -1541,6 +1554,30 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
struct jset_entry_clock {
struct jset_entry entry;
__u8 rw;
__u8 pad[7];
__le64 time;
} __attribute__((packed));
struct jset_entry_dev_usage_type {
__le64 buckets;
__le64 sectors;
__le64 fragmented;
} __attribute__((packed));
struct jset_entry_dev_usage {
struct jset_entry entry;
__le32 dev;
__u32 pad;
__le64 buckets_ec;
__le64 buckets_unavailable;
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@ -1563,8 +1600,8 @@ struct jset {
__u8 encrypted_start[0];
__le16 read_clock;
__le16 write_clock;
__le16 _read_clock; /* no longer used */
__le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;

View File

@ -530,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
BKEY_VAL_ACCESSORS(btree_ptr_v2);
BKEY_VAL_ACCESSORS(indirect_inline_data);
BKEY_VAL_ACCESSORS(alloc_v2);
/* byte order helpers */

View File

@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_dev *ca;
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
}
}
for_each_member_device(ca, c, i) {
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
@ -801,13 +804,24 @@ static int bch2_gc_done(struct bch_fs *c,
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
{
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((void *) ca->usage_gc,
dev_usage_u64s());
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
}
};
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
bch2_dev_usage_from_buckets(c);
{
unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *dst = c->usage_base;
@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
bch_err(c, "error allocating ca->usage[gc]");
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
@ -1489,7 +1503,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
@ -1510,7 +1524,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
if (atomic_long_read(&clock->now) >= next)
if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
@ -1522,7 +1536,7 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
last = atomic_long_read(&clock->now);
last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*

View File

@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
wp = bch2_alloc_sectors_start(c,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,

View File

@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
for_each_member_device(ca, c, i) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
usage->hidden += (dev.d[BCH_DATA_sb].buckets +
dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
percpu_up_write(&c->mark_lock);
}
@ -189,14 +198,27 @@ out_pool:
return ret;
}
static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
unsigned journal_seq,
bool gc)
{
return this_cpu_ptr(gc
? ca->usage_gc
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
unsigned seq, i, u64s = dev_usage_u64s();
memset(&ret, 0, sizeof(ret));
acc_u64s_percpu((u64 *) &ret,
(u64 __percpu *) ca->usage[0],
sizeof(ret) / sizeof(u64));
do {
seq = read_seqcount_begin(&c->usage_lock);
memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
@ -261,7 +283,8 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
unsigned u64s = fs_usage_u64s(c);
struct bch_dev *ca;
unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
(u64 __percpu *) ca->usage[idx], u64s);
percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
}
rcu_read_unlock();
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
u = this_cpu_ptr(ca->usage[gc]);
u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
__flatten
void bch2_dev_usage_from_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_mark old = { .v.counter = 0 };
struct bucket_array *buckets;
struct bucket *g;
unsigned i;
int cpu;
c->usage_base->hidden = 0;
for_each_member_device(ca, c, i) {
for_each_possible_cpu(cpu)
memset(per_cpu_ptr(ca->usage[0], cpu), 0,
sizeof(*ca->usage[0]));
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
bch2_dev_usage_update(c, ca, c->usage_base,
old, g->mark, false);
}
}
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
/*
* XXX: this is wrong, this means we'll be doing updates to the percpu
* buckets_alloc counter that don't have an open journal buffer and
* we'll race with the machinery that accumulates that to ca->usage_base
*/
bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@ -685,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
if (new.k->type != KEY_TYPE_alloc)
if (new.k->type != KEY_TYPE_alloc &&
new.k->type != KEY_TYPE_alloc_v2)
return 0;
/*
@ -708,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
m.stripe = u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
@ -715,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
g->stripe = u.stripe;
g->stripe_redundancy = u.stripe_redundancy;
/*
* need to know if we're getting called from the invalidate path or
@ -778,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
old, new, gc);
old, new, 0, gc);
return 0;
}
@ -915,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
unsigned ptr_idx,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags,
bool enabled)
u64 journal_seq, unsigned flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
@ -932,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
char buf[200];
int ret;
if (enabled)
g->ec_redundancy = s->nr_redundant;
if (g->stripe && g->stripe != k.k->p.offset) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EINVAL;
}
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@ -941,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
if (new.stripe && enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
if (!new.stripe && !enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
new.stripe = enabled;
if ((flags & BTREE_TRIGGER_GC) && parity) {
new.data_type = enabled ? BCH_DATA_parity : 0;
new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
if (parity) {
new.data_type = BCH_DATA_parity;
new.dirty_sectors = le16_to_cpu(s->sectors);
}
if (journal_seq) {
@ -966,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
}
}));
if (!enabled)
g->ec_redundancy = 0;
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
@ -1036,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@ -1163,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
unsigned i;
int ret;
BUG_ON(gc && old_s);
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
@ -1170,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
if (!new_s) {
/* Deleting: */
for (i = 0; i < old_s->nr_blocks; i++) {
ret = bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
if (!gc && m->on_heap) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
spin_unlock(&c->ec_stripes_heap_lock);
}
if (gc)
update_replicas(c, fs_usage, &m->r.e,
-((s64) m->sectors * m->nr_redundant));
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
for (i = 0; i < new_s->nr_blocks; i++) {
if (!old_s ||
memcmp(new_s->ptrs + i,
old_s->ptrs + i,
sizeof(struct bch_extent_ptr))) {
if (old_s) {
bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
ret = bucket_set_stripe(c, new, i, fs_usage,
journal_seq, flags, true);
if (ret)
return ret;
}
}
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
@ -1220,27 +1193,13 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
unsigned s = stripe_blockcount_get(new_s, i);
/*
* gc recalculates this field from stripe ptr
* references:
*/
if (!gc)
m->block_sectors[i] = s;
m->blocks_nonempty += !!s;
m->block_sectors[i] =
stripe_blockcount_get(new_s, i);
m->blocks_nonempty += !!m->block_sectors[i];
}
if (gc && old_s)
update_replicas(c, fs_usage, &m->r.e,
-((s64) m->sectors * m->nr_redundant));
bch2_bkey_to_replicas(&m->r.e, new);
if (gc)
update_replicas(c, fs_usage, &m->r.e,
((s64) m->sectors * m->nr_redundant));
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
@ -1248,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
}
if (gc) {
/*
* gc recalculates this field from stripe ptr
* references:
*/
memset(m->block_sectors, 0, sizeof(m->block_sectors));
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
ret = mark_stripe_bucket(c, new, i, fs_usage,
journal_seq, flags);
if (ret)
return ret;
}
update_replicas(c, fs_usage, &m->r.e,
((s64) m->sectors * m->nr_redundant));
}
return 0;
}
@ -1271,6 +1249,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
switch (k.k->type) {
case KEY_TYPE_alloc:
case KEY_TYPE_alloc_v2:
ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
@ -1539,9 +1518,10 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
static struct bkey_alloc_buf *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@ -1549,8 +1529,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_alloc_buf *a;
int ret;
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
if (IS_ERR(a))
return a;
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
*u = bch2_alloc_unpack(k);
@ -1562,17 +1547,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
return ret;
return ERR_PTR(ret);
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
*u = alloc_mem_to_key(g, READ_ONCE(g->mark));
*u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
*_iter = iter;
return 0;
return a;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
@ -1582,27 +1567,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
int ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@ -1713,34 +1691,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
}
static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
const struct bch_extent_ptr *ptr,
s64 sectors, bool parity)
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
{
struct bkey_i_alloc *a;
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct bkey_alloc_buf *a;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
int ret;
bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
int ret = 0;
ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
if (parity) {
s64 sectors = le16_to_cpu(s.v->sectors);
if (deleting)
sectors = -sectors;
u.dirty_sectors += sectors;
u.data_type = u.dirty_sectors
? BCH_DATA_parity
: 0;
}
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
if (!deleting) {
if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
"bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
iter->pos.inode, iter->pos.offset, u.gen,
u.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
u.stripe = s.k->p.offset;
u.stripe_redundancy = s.v->nr_redundant;
} else {
u.stripe = 0;
u.stripe_redundancy = 0;
}
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
err:
bch2_trans_iter_put(trans, iter);
return ret;
@ -1750,51 +1745,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(old).v : NULL;
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
struct bkey_s_c_stripe old_s = { NULL };
struct bkey_s_c_stripe new_s = { NULL };
struct bch_replicas_padded r;
unsigned i;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
old_s = bkey_s_c_to_stripe(old);
if (new.k->type == KEY_TYPE_stripe)
new_s = bkey_s_c_to_stripe(new);
/*
* If the pointers aren't changing, we don't need to do anything:
*/
if (new_s && old_s &&
!memcmp(old_s->ptrs, new_s->ptrs,
new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
if (new_s.k && old_s.k &&
new_s.v->nr_blocks == old_s.v->nr_blocks &&
new_s.v->nr_redundant == old_s.v->nr_redundant &&
!memcmp(old_s.v->ptrs, new_s.v->ptrs,
new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
if (new_s) {
unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
s64 sectors = le16_to_cpu(new_s->sectors);
if (new_s.k) {
s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
for (i = 0; i < new_s->nr_blocks; i++) {
bool parity = i >= nr_data;
ret = bch2_trans_mark_stripe_alloc_ref(trans,
&new_s->ptrs[i], sectors, parity);
for (i = 0; i < new_s.v->nr_blocks; i++) {
ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
i, false);
if (ret)
return ret;
}
}
if (old_s) {
unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
if (old_s.k) {
s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
bch2_bkey_to_replicas(&r.e, old);
update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
for (i = 0; i < old_s->nr_blocks; i++) {
bool parity = i >= nr_data;
ret = bch2_trans_mark_stripe_alloc_ref(trans,
&old_s->ptrs[i], sectors, parity);
for (i = 0; i < old_s.v->nr_blocks; i++) {
ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
i, true);
if (ret)
return ret;
}
@ -2065,21 +2059,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
struct bch_extent_ptr ptr = {
.dev = ca->dev_idx,
.offset = bucket_to_sector(ca, b),
};
int ret = 0;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
if (u.data_type && u.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@ -2112,10 +2101,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
u.data_type = type;
u.dirty_sectors = sectors;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@ -2422,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage[0]);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
unsigned i;
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -ENOMEM;
}
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
}
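
With this commit a device's usage counters follow the same scheme as filesystem usage: ca->usage_base holds accumulated totals, ca->usage[] holds one set of percpu deltas per journal buffer, and bch2_dev_usage_read() above folds the deltas into a copy of the base under c->usage_lock's seqcount while bch2_fs_usage_acc_to_base() flushes them. A small standalone model of that read pattern (plain arrays stand in for the percpu buffers; the names are illustrative, not bcachefs API):

#include <stdint.h>
#include <string.h>

#define JOURNAL_BUF_NR	2
#define NR_COUNTERS	8

struct dev_usage_model {
	unsigned seq;					/* even = stable, odd = writer active */
	uint64_t base[NR_COUNTERS];			/* accumulated totals */
	uint64_t delta[JOURNAL_BUF_NR][NR_COUNTERS];	/* stands in for percpu bufs */
};

static void usage_read(const struct dev_usage_model *u, uint64_t ret[NR_COUNTERS])
{
	unsigned seq;

	do {
		seq = u->seq;				/* read_seqcount_begin() */
		memcpy(ret, u->base, sizeof(u->base));
		for (unsigned i = 0; i < JOURNAL_BUF_NR; i++)
			for (unsigned j = 0; j < NR_COUNTERS; j++)
				ret[j] += u->delta[i][j];
	} while ((seq & 1) || seq != u->seq);		/* read_seqcount_retry() */
}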

View File

@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
static inline u8 bucket_gc_gen(struct bucket *g)
{
struct bucket *g = bucket(ca, b);
return g->mark.gen - g->oldest_gen;
}
@ -169,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@ -214,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
READ_ONCE(c->replicas.nr);
}
static inline unsigned dev_usage_u64s(void)
{
return sizeof(struct bch_dev_usage) / sizeof(u64);
}
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);

View File

@ -37,11 +37,12 @@ struct bucket {
const struct bucket_mark mark;
};
u16 io_time[2];
u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
u8 ec_redundancy;
u8 stripe_redundancy;
u32 stripe;
};
struct bucket_array {

View File

@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
unsigned long now = atomic_long_add_return(sectors, &clock->now);
unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
now = atomic_long_read(&clock->now);
now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
atomic_long_set(&clock->now, 0);
atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();

View File

@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
atomic_long_t now;
atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;

View File

@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
if (!bkey_cmp(k.k->p, POS_MIN))
return "stripe at pos 0";
if (k.k->p.inode)
return "invalid stripe key";
@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
char buf2[200];
bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
bch_err_ratelimited(c,
"stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
i, j, v->csum_type,
want.lo, got.lo);
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
want.lo, got.lo, buf2);
clear_bit(i, buf->valid);
break;
}
@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
static void ec_block_endio(struct bio *bio)
{
struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
struct bch_stripe *v = &ec_bio->buf->key.v;
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
if (ptr_stale(ca, ptr)) {
bch_err_ratelimited(ca->fs,
"error %s stripe: stale pointer after io",
bio_data_dir(bio) == READ ? "reading from" : "writing to");
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
bio_put(&ec_bio->bio);
percpu_ref_put(&ca->io_ref);
closure_put(cl);
@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
//pr_info("deleting stripe %zu", idx);
return bch2_btree_delete_range(c, BTREE_ID_EC,
POS(0, idx),
POS(0, idx + 1),
@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
*dst = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
.redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
}
@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
//pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
ec = ob->ec;
mutex_lock(&ec->lock);
@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c,
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
s64 ret = -1;
if (may_create_new_stripe(c))
return -1;
spin_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c,
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
bch2_stripes_heap_del(c, m, stripe_idx);
spin_unlock(&c->ec_stripes_heap_lock);
return stripe_idx;
ret = stripe_idx;
break;
}
}
spin_unlock(&c->ec_stripes_heap_lock);
return -1;
return ret;
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,

View File

@ -704,14 +704,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
if (p.ptr.cached)
continue;
if (p.has_ec) {
struct stripe *s =
genradix_ptr(&c->stripes[0], p.ec.idx);
WARN_ON(!s);
if (s)
replicas += s->nr_redundant;
}
if (p.has_ec)
replicas += p.ec.redundancy;
replicas++;
@ -734,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
if (p.has_ec) {
struct stripe *s =
genradix_ptr(&c->stripes[0], p.ec.idx);
if (p.has_ec)
durability += p.ec.redundancy;
if (WARN_ON(!s))
goto out;
durability += s->nr_redundant;
}
out:
return durability;
}

View File

@ -1121,6 +1121,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
j->entry_u64s_reserved +=
2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);

View File

@ -5,6 +5,7 @@
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
@ -426,6 +427,69 @@ fsck_err:
return ret;
}
static int journal_entry_validate_clock(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
c, "invalid journal entry clock: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
c, "invalid journal entry clock: bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
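/*
 * Hedged sketch of the layout journal_entry_validate_clock() above is
 * checking, reconstructed from the fields it references (rw, the size check,
 * and the 64-bit time written by bch2_journal_super_entries_add_common()).
 * Field order and padding here are assumptions, not copied from
 * bcachefs_format.h.
 */
#include <stdint.h>

struct example_jset_entry_clock {
	uint8_t		header[8];	/* struct jset_entry header (u64s, btree_id, level, type, pad[3]) */
	uint8_t		rw;		/* 0 = READ clock, 1 = WRITE clock */
	uint8_t		pad[7];
	uint64_t	time;		/* io clock hand, little endian on disk */
};

/* the validator requires exactly one of these (3 u64s) and rw <= 1 */
_Static_assert(sizeof(struct example_jset_entry_clock) == 24, "3 u64s total");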
static int journal_entry_validate_dev_usage(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
unsigned dev;
int ret = 0;
if (journal_entry_err_on(bytes < expected,
c, "invalid journal entry dev usage: bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
c, "invalid journal entry dev usage: bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
c, "invalid journal entry dev usage: bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
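/*
 * Hedged sketch of the dev usage entry that journal_entry_validate_dev_usage()
 * above is checking, reconstructed from the fields used here and in journal
 * replay (dev, pad, buckets_ec, buckets_unavailable,
 * d[i].{buckets,sectors,fragmented}).  Exact layout is an assumption, not a
 * copy of the real struct.
 */
#include <stdint.h>

struct example_dev_usage_type {
	uint64_t	buckets;
	uint64_t	sectors;
	uint64_t	fragmented;
};

struct example_jset_entry_dev_usage {
	uint8_t		header[8];	/* struct jset_entry header */
	uint32_t	dev;		/* member device index, must exist */
	uint32_t	pad;		/* must be zero */
	uint64_t	buckets_ec;
	uint64_t	buckets_unavailable;
	struct example_dev_usage_type d[];	/* one slot per BCH_DATA_* type */
};

/*
 * "expected" above is the fixed part plus 7 per-type slots (BCH_DATA_NR at
 * the time of this commit); larger entries are accepted so new data types can
 * be appended without breaking older readers.
 */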
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
@ -937,6 +1001,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
bch2_replicas_entry_sort(&replicas.e);
/*
* If we're mounting in degraded mode - if we didn't read all
* the devices - this is wrong:
@ -1032,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
rcu_read_lock();
retry:
devs = target_rw_devs(c, BCH_DATA_journal, target);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_journal]);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@ -1073,6 +1143,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
if (replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
goto retry;
}
done:
rcu_read_unlock();
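/*
 * Hedged sketch (illustrative names only) of the write-target policy added
 * above: journal writes now prefer devices in metadata_target, falling back
 * to foreground_target, and finally retry with no target at all when the
 * restricted set cannot provide enough replicas -- the same shape as the
 * retry: label and the "if (replicas < replicas_want && target)" block.
 */
static int example_alloc_with_fallback(unsigned metadata_target,
				       unsigned foreground_target,
				       unsigned replicas_want,
				       /* hypothetical allocator: adds replicas from a target, 0 == any rw device */
				       unsigned (*alloc_from)(unsigned target, unsigned have, unsigned want))
{
	unsigned target = metadata_target ? metadata_target : foreground_target;
	unsigned replicas = 0;

	/* first pass: restricted to the configured target, if any */
	replicas = alloc_from(target, replicas, replicas_want);

	/* second pass: drop the restriction, like "target = 0; goto retry" */
	if (replicas < replicas_want && target)
		replicas = alloc_from(0, replicas, replicas_want);

	return replicas >= replicas_want ? 0 : -1;
}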
@ -1278,6 +1354,9 @@ static void do_journal_write(struct closure *cl)
bio->bi_private = ca;
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
if (!JSET_NO_FLUSH(w->data))
bio->bi_opf |= REQ_FUA;
if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
@ -1348,8 +1427,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_journal_super_entries_add_common(c, end,
le64_to_cpu(jset->seq));
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@ -1358,10 +1437,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);

View File

@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = p.ptr.dev;
if (p.has_ec) {
struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
data_opts->nr_replicas += m->nr_redundant;
}
if (p.has_ec)
data_opts->nr_replicas += p.ec.redundancy;
return DATA_REWRITE;
}
@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
WARN_ON(m.stripe && !g->ec_redundancy);
WARN_ON(m.stripe && !g->stripe_redundancy);
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
.replicas = 1 + g->ec_redundancy,
.replicas = 1 + g->stripe_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
@ -301,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last, wait;
u64 last, wait;
set_freezable();
@ -309,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
last = atomic_long_read(&clock->now);
last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {

View File

@ -136,6 +136,11 @@ enum opt_type {
OPT_STR(bch2_str_hash_types), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or disk group for metadata writes") \
x(foreground_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \

View File

@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
unsigned long io_start;
u64 io_start;
long throttle;
set_freezable();
io_start = atomic_long_read(&clock->now);
io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
if (atomic_long_read(&clock->now) + clock->max_slop <
if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now);
io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);

View File

@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
unsigned long throttled_until_iotime;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;

View File

@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
sizeof(struct jset_entry_dev_usage_type);
unsigned i;
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
for (i = 0; i < nr_types; i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
}
break;
}
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
@ -847,6 +868,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
case BCH_JSET_ENTRY_clock: {
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
}
}
return ret;
@ -861,9 +888,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
@ -876,9 +900,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
@ -942,13 +963,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
"superblock read clock %u doesn't match journal %u after clean shutdown",
clean->read_clock, j->read_clock);
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock write clock %u doesn't match journal %u after clean shutdown",
clean->write_clock, j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;

View File

@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
static void replicas_entry_sort(struct bch_replicas_entry *e)
void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
break;
}
replicas_entry_sort(e);
bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
replicas_entry_sort(e);
bch2_replicas_entry_sort(e);
}
static struct bch_replicas_cpu
@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
replicas_entry_sort(search);
bch2_replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
@ -681,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
for_each_replicas_entry(sb_r, e) {
dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
replicas_entry_sort(dst);
bch2_replicas_entry_sort(dst);
}
return 0;
@ -718,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
dst->nr_devs = e->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, e->devs, e->nr_devs);
replicas_entry_sort(dst);
bch2_replicas_entry_sort(dst);
}
return 0;

View File

@ -5,6 +5,7 @@
#include "eytzinger.h"
#include "replicas_types.h"
void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);

View File

@ -963,31 +963,28 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
static void
entry_init_u64s(struct jset_entry *entry, unsigned u64s)
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
memset(entry, 0, u64s * sizeof(u64));
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
*end = vstruct_next(*end);
return entry;
}
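/*
 * Hedged worked example of the u64s math in jset_entry_init() above: the
 * entry's u64s field counts only the payload, not the 8-byte jset_entry
 * header, so a 24-byte clock entry (8 header + 16 payload) rounds up to
 * 3 u64s and is stored as u64s = 2.
 */
#include <assert.h>
#include <stdint.h>

#define EXAMPLE_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	size_t size = 24;	/* e.g. a clock entry */
	unsigned u64s = EXAMPLE_DIV_ROUND_UP(size, sizeof(uint64_t));

	assert(u64s == 3);
	assert(u64s - 1 == 2);	/* value stored in entry->u64s */
	return 0;
}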
static void
entry_init_size(struct jset_entry *entry, size_t size)
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
entry_init_u64s(entry, u64s);
}
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry *entry,
u64 journal_seq)
{
unsigned i;
struct bch_dev *ca;
unsigned i, dev;
percpu_down_write(&c->mark_lock);
@ -1000,58 +997,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
struct jset_entry_data_usage, entry);
entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
}
entry = vstruct_next(entry);
for_each_member_device(ca, c, dev) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
container_of(jset_entry_init(end, b),
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
}
}
percpu_up_write(&c->mark_lock);
return entry;
for (i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
clock->entry.type = BCH_JSET_ENTRY_clock;
clock->rw = i;
clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
}
}
void bch2_fs_mark_clean(struct bch_fs *c)
@ -1080,15 +1096,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
entry = bch2_journal_super_entries_add_common(c, entry, 0);
bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));

View File

@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry *, u64);
void bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);

View File

@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, nr = 0, u64s =
(sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) /
sizeof(u64);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
nr++;
rcu_read_unlock();
bch2_journal_entry_res_resize(&c->journal,
&c->dev_usage_journal_res, u64s * nr);
}
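/*
 * Hedged arithmetic behind the reservation above, using the sizes assumed in
 * the earlier dev-usage sketch (32 fixed bytes plus 24 bytes per data type,
 * BCH_DATA_NR == 7 at this point): 32 + 7 * 24 = 200 bytes = 25 u64s per
 * device, multiplied by the number of member devices.  Real sizes come from
 * bcachefs_format.h; this is only a worked example.
 */
#include <stdint.h>

static unsigned example_dev_usage_reserve_u64s(unsigned nr_devs)
{
	unsigned fixed = 32, per_type = 24, nr_types = 7;
	unsigned bytes_per_dev = fixed + per_type * nr_types;	/* 200 */

	return (bytes_per_dev / sizeof(uint64_t)) * nr_devs;	/* 25 * nr_devs */
}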
/* Filesystem RO/RW: */
/*
@ -174,9 +190,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@ -399,9 +412,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
@ -779,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
bch2_dev_usage_journal_reserve(c);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@ -1521,6 +1533,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@ -1530,19 +1544,6 @@ err:
return ret;
}
static void dev_usage_clear(struct bch_dev *ca)
{
struct bucket_array *buckets;
percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
up_read(&ca->bucket_lock);
}
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
@ -1600,8 +1601,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
dev_usage_clear(ca);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@ -1655,6 +1654,8 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
bch2_dev_usage_journal_reserve(c);
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)

View File

@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
return bucket_last_io(c, bucket(ca, b), rw);
return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
return bucket_gc_gen(ca, b);
return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)