Update bcachefs sources to da7fefde29 bcachefs: shim for userspace raid library

Kent Overstreet 2018-11-23 03:04:34 -05:00
parent c416528eaa
commit bca8b084ad
41 changed files with 3295 additions and 1018 deletions
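
The diff below adds the erasure-coding plumbing; the userspace raid shim named in the title is not in the hunks shown here. As a rough sketch only, assuming the shim exposes the kernel's lib/raid6 interface (raid6_call and friends) — an assumption, not something visible in this diff — parity generation for a stripe would look something like this; demo_stripe_gen_parity is a made-up name for illustration:

#include <linux/raid/pq.h>

/*
 * Illustration only (not part of this commit): generate P/Q parity for a
 * stripe. raid6 expects the data block pointers followed by P and Q, so the
 * disks argument is nr_data + 2.
 */
static void demo_stripe_gen_parity(void **blocks, int nr_data, size_t bytes)
{
	raid6_call.gen_syndrome(nr_data + 2, bytes, blocks);
}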

View File

@ -1 +1 @@
a9f14c773fb122a4b283fc7b79d9f98703a18890
da7fefde294e3c56359ee498a62a77182a4733cd

View File

@ -6,6 +6,8 @@
#include <linux/kobject.h>
#include <linux/types.h>
#define BIO_MAX_PAGES 256
typedef unsigned fmode_t;
struct bio;

View File

@ -9,6 +9,7 @@
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "journal_io.h"
@ -82,7 +83,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
case BCH_ALLOC: {
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
/* allow for unknown fields */
if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
return "incorrect value size";
break;
}
@ -235,6 +237,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
struct bucket *g;
struct bkey_i_alloc *a;
int ret;
u8 *d;
percpu_down_read_preempt_disable(&c->usage_lock);
@ -258,32 +261,50 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
bch2_btree_iter_set_pos(iter, a->k.p);
return bch2_btree_insert_at(c, NULL, journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
ret = bch2_btree_insert_at(c, NULL, journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (!ret && ca->buckets_written)
set_bit(b, ca->buckets_written);
return ret;
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
struct bch_dev *ca;
struct btree_iter iter;
int ret;
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
if (k->k.p.inode >= c->sb.nr_devices ||
!c->devs[k->k.p.inode])
return 0;
ca = bch_dev_bkey_exists(c, pos.inode);
ca = bch_dev_bkey_exists(c, k->k.p.inode);
if (pos.offset >= ca->mi.nbuckets)
if (k->k.p.offset >= ca->mi.nbuckets)
return 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
BTREE_ITER_INTENT);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
/* check buckets_written with btree node locked: */
ret = test_bit(k->k.p.offset, ca->buckets_written)
? 0
: bch2_btree_insert_at(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY,
BTREE_INSERT_ENTRY(&iter, k));
err:
bch2_btree_iter_unlock(&iter);
return ret;
}
@ -909,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
pr_debug("free_inc now empty");
do {
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
goto stop;
}
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@ -1112,6 +1127,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
}
mutex_unlock(&c->btree_reserve_cache_lock);
while (1) {
struct open_bucket *ob;
spin_lock(&c->freelist_lock);
if (!ca->open_buckets_partial_nr) {
spin_unlock(&c->freelist_lock);
break;
}
ob = c->open_buckets +
ca->open_buckets_partial[--ca->open_buckets_partial_nr];
ob->on_partial_list = false;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
}
bch2_ec_stop_dev(c, ca);
/*
* Wake up threads that were blocked on allocation, so they can notice
* the device can no longer be removed and the capacity has changed:
@ -1254,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
bool invalidating_data = false;
int ret = 0;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return -1;
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
@ -1264,51 +1294,47 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
struct btree_iter iter;
struct bucket_array *buckets;
struct bucket_mark m;
struct bkey_s_c k;
for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
if (k.k->type != BCH_ALLOC)
down_read(&ca->bucket_lock);
percpu_down_read_preempt_disable(&c->usage_lock);
buckets = bucket_array(ca);
for (bu = buckets->first_bucket;
bu < buckets->nbuckets; bu++) {
m = READ_ONCE(buckets->b[bu].mark);
if (!m.gen_valid ||
!is_available_bucket(m) ||
m.cached_sectors)
continue;
bu = k.k->p.offset;
m = READ_ONCE(bucket(ca, bu)->mark);
if (!is_available_bucket(m) || m.cached_sectors)
continue;
percpu_down_read_preempt_disable(&c->usage_lock);
bch2_mark_alloc_bucket(c, ca, bu, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
percpu_up_read_preempt_enable(&c->usage_lock);
gc_pos_alloc(c, NULL), 0);
fifo_push(&ca->free_inc, bu);
if (fifo_full(&ca->free_inc))
discard_invalidated_buckets(c, ca);
if (fifo_full(&ca->free[RESERVE_BTREE]))
break;
}
bch2_btree_iter_unlock(&iter);
percpu_up_read_preempt_enable(&c->usage_lock);
up_read(&ca->bucket_lock);
}
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
if (!fifo_full(&ca->free[RESERVE_BTREE])) {
percpu_ref_put(&ca->io_ref);
goto not_enough;
}
return 0;
not_enough:
pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc, we'll be using it again below: */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
pr_debug("scanning for reclaimable buckets");
pr_debug("not enough empty buckets; scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
find_reclaimable_buckets(c, ca);

View File

@ -16,7 +16,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
}
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{

View File

@ -61,6 +61,7 @@
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "io.h"
#include <linux/math64.h>
@ -94,6 +95,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (ob->ec) {
bch2_ec_bucket_written(c, ob);
return;
}
percpu_down_read_preempt_disable(&c->usage_lock);
spin_lock(&ob->lock);
@ -113,6 +119,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
closure_wake_up(&c->open_buckets_wait);
}
void bch2_open_bucket_write_error(struct bch_fs *c,
struct open_buckets *obs,
unsigned dev)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->ptr.dev == dev &&
ob->ec)
bch2_ec_bucket_cancel(c, ob);
}
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
struct open_bucket *ob;
@ -128,15 +147,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
}
static void open_bucket_free_unused(struct bch_fs *c,
struct write_point *wp,
struct open_bucket *ob)
struct open_bucket *ob,
bool may_realloc)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ca->open_buckets_partial_nr >=
ARRAY_SIZE(ca->open_buckets_partial));
if (wp->type == BCH_DATA_USER) {
if (ca->open_buckets_partial_nr <
ARRAY_SIZE(ca->open_buckets_partial) &&
may_realloc) {
spin_lock(&c->freelist_lock);
ob->on_partial_list = true;
ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
@ -284,18 +305,18 @@ out:
return ob;
}
static int __dev_alloc_cmp(struct write_point *wp,
unsigned l, unsigned r)
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
unsigned l, unsigned r)
{
return ((wp->next_alloc[l] > wp->next_alloc[r]) -
(wp->next_alloc[l] < wp->next_alloc[r]));
return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
(stripe->next_alloc[l] < stripe->next_alloc[r]));
}
#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
struct write_point *wp,
struct bch_devs_mask *devs)
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs)
{
struct dev_alloc_list ret = { .nr = 0 };
struct bch_dev *ca;
@ -304,14 +325,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
for_each_member_device_rcu(ca, c, i, devs)
ret.devs[ret.nr++] = i;
bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
return ret;
}
void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = wp->next_alloc + ca->dev_idx;
u64 *v = stripe->next_alloc + ca->dev_idx;
u64 free_space = dev_buckets_free(c, ca);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
@ -323,26 +344,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
else
*v = U64_MAX;
for (v = wp->next_alloc;
v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
for (v = stripe->next_alloc;
v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
*v = *v < scale ? 0 : *v - scale;
}
#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0)
#define BUCKET_ALLOC_USE_DURABILITY (1 << 1)
static int bch2_bucket_alloc_set(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl)
{
struct dev_alloc_list devs_sorted =
bch2_wp_alloc_list(c, wp, devs_may_alloc);
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
struct bch_dev *ca;
bool alloc_failure = false;
unsigned i;
unsigned i, durability;
BUG_ON(*nr_effective >= nr_replicas);
@ -353,13 +378,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
if (!ca)
continue;
if (!ca->mi.durability &&
(*have_cache ||
wp->type != BCH_DATA_USER))
if (!ca->mi.durability && *have_cache)
continue;
ob = bch2_bucket_alloc(c, ca, reserve,
wp->type == BCH_DATA_USER, cl);
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
enum bucket_alloc_ret ret = -PTR_ERR(ob);
@ -374,13 +397,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
continue;
}
durability = (flags & BUCKET_ALLOC_USE_DURABILITY)
? ca->mi.durability : 1;
__clear_bit(ca->dev_idx, devs_may_alloc->d);
*nr_effective += ca->mi.durability;
*have_cache |= !ca->mi.durability;
*nr_effective += durability;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
bch2_wp_rescale(c, ca, wp);
bch2_dev_stripe_increment(c, ca, stripe);
if (*nr_effective >= nr_replicas)
return 0;
@ -389,15 +415,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
return alloc_failure ? -ENOSPC : -EROFS;
}
/* Allocate from stripes: */
/*
* XXX: use a higher watermark for allocating open buckets here:
*/
static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct bch_devs_mask devs;
struct open_bucket *ob;
unsigned i, nr_have = 0, nr_data =
min_t(unsigned, h->nr_active_devs,
EC_STRIPE_MAX) - h->redundancy;
bool have_cache = true;
int ret = 0;
BUG_ON(h->blocks.nr > nr_data);
BUG_ON(h->parity.nr > h->redundancy);
devs = h->devs;
open_bucket_for_each(c, &h->parity, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
open_bucket_for_each(c, &h->blocks, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
percpu_down_read_preempt_disable(&c->usage_lock);
rcu_read_lock();
if (h->parity.nr < h->redundancy) {
nr_have = h->parity.nr;
ret = bch2_bucket_alloc_set(c, &h->parity,
&h->parity_stripe,
&devs,
h->redundancy,
&nr_have,
&have_cache,
RESERVE_NONE,
0,
NULL);
if (ret)
goto err;
}
if (h->blocks.nr < nr_data) {
nr_have = h->blocks.nr;
ret = bch2_bucket_alloc_set(c, &h->blocks,
&h->block_stripe,
&devs,
nr_data,
&nr_have,
&have_cache,
RESERVE_NONE,
0,
NULL);
if (ret)
goto err;
}
rcu_read_unlock();
percpu_up_read_preempt_enable(&c->usage_lock);
return bch2_ec_stripe_new_alloc(c, h);
err:
rcu_read_unlock();
percpu_up_read_preempt_enable(&c->usage_lock);
return -1;
}
/*
* if we can't allocate a new stripe because there are already too many
* partially filled stripes, force allocating from an existing stripe even when
* it's to a device we don't want:
*/
static void bucket_alloc_from_stripe(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
u16 target,
unsigned erasure_code,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache)
{
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
struct open_bucket *ob;
struct bch_dev *ca;
unsigned i, ec_idx;
if (!erasure_code)
return;
if (nr_replicas < 2)
return;
if (ec_open_bucket(c, ptrs))
return;
h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
if (!h)
return;
if (!h->s && ec_stripe_alloc(c, h))
goto out_put_head;
rcu_read_lock();
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
rcu_read_unlock();
for (i = 0; i < devs_sorted.nr; i++)
open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
if (ob->ptr.dev == devs_sorted.devs[i] &&
!test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
ob->ec_idx = ec_idx;
ob->ec = h->s;
__clear_bit(ob->ptr.dev, devs_may_alloc->d);
*nr_effective += ca->mi.durability;
*have_cache |= !ca->mi.durability;
ob_push(c, ptrs, ob);
atomic_inc(&h->s->pin);
out_put_head:
bch2_ec_stripe_head_put(h);
}
/* Sector allocator */
static int get_buckets_from_writepoint(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache)
static void get_buckets_from_writepoint(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
bool need_ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@ -409,7 +570,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
if (*nr_effective < nr_replicas &&
test_bit(ob->ptr.dev, devs_may_alloc->d) &&
(ca->mi.durability ||
(wp->type == BCH_DATA_USER && !*have_cache))) {
(wp->type == BCH_DATA_USER && !*have_cache)) &&
(ob->ec || !need_ec)) {
__clear_bit(ob->ptr.dev, devs_may_alloc->d);
*nr_effective += ca->mi.durability;
*have_cache |= !ca->mi.durability;
@ -420,8 +582,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
}
}
wp->ptrs = ptrs_skip;
return *nr_effective < nr_replicas ? -ENOSPC : 0;
}
static int open_bucket_add_buckets(struct bch_fs *c,
@ -429,22 +589,25 @@ static int open_bucket_add_buckets(struct bch_fs *c,
struct write_point *wp,
struct bch_devs_list *devs_have,
u16 target,
unsigned erasure_code,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
enum alloc_reserve reserve,
struct closure *cl)
struct closure *_cl)
{
struct bch_devs_mask devs;
const struct bch_devs_mask *t;
struct open_bucket *ob;
unsigned i;
struct closure *cl = NULL;
unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY;
int ret;
percpu_down_read_preempt_disable(&c->usage_lock);
rcu_read_lock();
if (wp->type == BCH_DATA_USER)
flags |= BUCKET_MAY_ALLOC_PARTIAL;
devs = c->rw_devs[wp->type];
rcu_read_lock();
devs = target_rw_devs(c, wp->type, target);
rcu_read_unlock();
/* Don't allocate from devices we already have pointers to: */
for (i = 0; i < devs_have->nr; i++)
@ -453,50 +616,83 @@ static int open_bucket_add_buckets(struct bch_fs *c,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
t = bch2_target_to_mask(c, target);
if (t)
bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
if (erasure_code) {
get_buckets_from_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, true);
if (*nr_effective >= nr_replicas)
return 0;
ret = get_buckets_from_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective, have_cache);
if (!ret)
goto out;
bucket_alloc_from_stripe(c, ptrs, wp, &devs,
target, erasure_code,
nr_replicas, nr_effective,
have_cache);
if (*nr_effective >= nr_replicas)
return 0;
}
get_buckets_from_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, false);
if (*nr_effective >= nr_replicas)
return 0;
percpu_down_read_preempt_disable(&c->usage_lock);
rcu_read_lock();
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, NULL);
if (!ret || ret == -EROFS || !cl)
goto out;
reserve, flags, cl);
if (ret && ret != -EROFS && !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
nr_replicas, nr_effective, have_cache,
reserve, cl);
out:
rcu_read_unlock();
percpu_up_read_preempt_enable(&c->usage_lock);
return ret;
}
void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
struct open_buckets *obs,
enum bch_data_type data_type)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob, *ob2;
unsigned i, j;
open_bucket_for_each(c, obs, ob, i) {
bool drop = !ca || ob->ptr.dev == ca->dev_idx;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
drop |= ob2->ptr.dev == ca->dev_idx;
open_bucket_for_each(c, &ob->ec->parity, ob2, j)
drop |= ob2->ptr.dev == ca->dev_idx;
mutex_unlock(&ob->ec->lock);
}
if (drop)
bch2_open_bucket_put(c, ob);
else
ob_push(c, &ptrs, ob);
}
*obs = ptrs;
}
void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob;
unsigned i;
mutex_lock(&wp->lock);
open_bucket_for_each(c, &wp->ptrs, ob, i)
if (!ca || ob->ptr.dev == ca->dev_idx)
open_bucket_free_unused(c, wp, ob);
else
ob_push(c, &ptrs, ob);
wp->ptrs = ptrs;
bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type);
mutex_unlock(&wp->lock);
}
@ -629,6 +825,7 @@ out:
*/
struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
@ -648,26 +845,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
write_points_nr = c->write_points_nr;
wp = writepoint_find(c, write_point.v);
/* metadata may not allocate on cache devices: */
if (wp->type != BCH_DATA_USER)
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve, cl);
} else {
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve, NULL);
if (!ret)
goto alloc_done;
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0,
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve, cl);
}
alloc_done:
BUG_ON(!ret && nr_effective < nr_replicas);
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
if (ret == -EROFS &&
nr_effective >= nr_replicas_required)
ret = 0;
@ -677,7 +885,7 @@ alloc_done:
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
open_bucket_free_unused(c, wp, ob);
open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
wp->ptrs = ptrs;
@ -696,7 +904,8 @@ err:
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
ob_push(c, &ptrs, ob);
else
open_bucket_free_unused(c, wp, ob);
open_bucket_free_unused(c, ob,
wp->type == BCH_DATA_USER);
wp->ptrs = ptrs;
mutex_unlock(&wp->lock);
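
The write-point changes above replace the per-write-point next_alloc array with a shared struct dev_stripe_state (bch2_dev_alloc_list / bch2_dev_stripe_increment). A simplified sketch of the weighting idea, with made-up demo_* names: each device accumulates cost inversely proportional to its free space and the lowest-cost device is picked next, so emptier devices are chosen more often while writes still rotate; the real code additionally rescales all counters downward so they never overflow:

struct demo_stripe_state { u64 next_alloc[BCH_SB_MEMBERS_MAX]; };

static unsigned demo_pick_dev(struct demo_stripe_state *s,
			      const u64 *buckets_free, unsigned nr_devs)
{
	unsigned i, best = 0;

	/* prefer the device with the lowest accumulated cost: */
	for (i = 1; i < nr_devs; i++)
		if (s->next_alloc[i] < s->next_alloc[best])
			best = i;

	/* charge the chosen device inversely to its free space: */
	s->next_alloc[best] += buckets_free[best]
		? div64_u64(1ULL << 48, buckets_free[best])
		: U64_MAX;
	return best;
}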

View File

@ -16,11 +16,11 @@ struct dev_alloc_list {
u8 devs[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
struct write_point *,
struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
struct write_point *);
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
struct dev_stripe_state *,
struct bch_devs_mask *);
void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
struct dev_stripe_state *);
long bch2_bucket_alloc_new_fs(struct bch_dev *);
@ -42,6 +42,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
(_i)++)
static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
struct open_buckets *obs)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->ec)
return ob;
return NULL;
}
void bch2_open_bucket_write_error(struct bch_fs *,
struct open_buckets *, unsigned);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
@ -75,7 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned,
unsigned, unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
@ -87,6 +103,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
struct open_buckets *, enum bch_data_type);
void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
struct write_point *);

View File

@ -7,6 +7,8 @@
#include "clock_types.h"
#include "fifo.h"
struct ec_bucket_buf;
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
@ -55,8 +57,10 @@ struct open_bucket {
u8 freelist;
bool valid;
bool on_partial_list;
u8 ec_idx;
unsigned sectors_free;
struct bch_extent_ptr ptr;
struct ec_stripe_new *ec;
};
#define OPEN_BUCKET_LIST_MAX 15
@ -66,18 +70,23 @@ struct open_buckets {
u8 v[OPEN_BUCKET_LIST_MAX];
};
struct dev_stripe_state {
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
struct write_point {
struct hlist_node node;
struct mutex lock;
u64 last_used;
unsigned long write_point;
enum bch_data_type type;
bool is_ec;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
struct open_buckets ptrs;
u64 next_alloc[BCH_SB_MEMBERS_MAX];
struct dev_stripe_state stripe;
};
struct write_point_specifier {

View File

@ -201,7 +201,7 @@
#include <linux/dynamic_fault.h>
#define bch2_fs_init_fault(name) \
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
dynamic_fault("bcachefs:meta:read:" name)
@ -270,7 +270,10 @@ do { \
BCH_DEBUG_PARAM(test_alloc_startup, \
"Force allocator startup to use the slowpath where it" \
"can't find enough free buckets without invalidating" \
"cached data")
"cached data") \
BCH_DEBUG_PARAM(force_reconstruct_read, \
"Force reads to use the reconstruct path, when reading" \
"from erasure coded extents")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@ -308,6 +311,7 @@ enum bch_time_stats {
#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
@ -330,13 +334,16 @@ enum gc_phase {
GC_PHASE_START,
GC_PHASE_SB,
#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
DEFINE_BCH_BTREE_IDS()
#undef DEF_BTREE_ID
GC_PHASE_BTREE_EC,
GC_PHASE_BTREE_EXTENTS,
GC_PHASE_BTREE_INODES,
GC_PHASE_BTREE_DIRENTS,
GC_PHASE_BTREE_XATTRS,
GC_PHASE_BTREE_ALLOC,
GC_PHASE_BTREE_QUOTAS,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
struct gc_pos {
@ -381,14 +388,14 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets;
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_dirty;
unsigned long *buckets_written;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage_percpu;
struct bch_dev_usage usage_cached;
struct bch_dev_usage __percpu *usage[2];
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@ -466,7 +473,6 @@ enum {
/* errors: */
BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
/* misc: */
BCH_FS_BDEV_MOUNTED,
@ -602,8 +608,8 @@ struct bch_fs {
atomic64_t sectors_available;
struct bch_fs_usage __percpu *usage_percpu;
struct bch_fs_usage usage_cached;
struct bch_fs_usage __percpu *usage[2];
struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
@ -644,9 +650,6 @@ struct bch_fs {
*
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
*
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
* currently running, and gc marks are currently valid
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
*/
@ -681,6 +684,21 @@ struct bch_fs {
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* ERASURE CODING */
struct list_head ec_new_stripe_list;
struct mutex ec_new_stripe_lock;
GENRADIX(struct ec_stripe) ec_stripes;
struct mutex ec_stripes_lock;
ec_stripes_heap ec_stripes_heap;
spinlock_t ec_stripes_heap_lock;
struct bio_set ec_bioset;
struct work_struct ec_stripe_delete_work;
struct llist_head ec_stripe_delete_list;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;

View File

@ -233,6 +233,9 @@ struct bkey_packed {
} __attribute__((packed, aligned(8)));
#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
#define BKEY_U64s_MAX U8_MAX
#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
#define KEY_PACKED_BITS_START 24
#define KEY_FORMAT_LOCAL_BTREE 0
@ -460,8 +463,9 @@ enum bch_compression_type {
x(ptr, 0) \
x(crc32, 1) \
x(crc64, 2) \
x(crc128, 3)
#define BCH_EXTENT_ENTRY_MAX 4
x(crc128, 3) \
x(stripe_ptr, 4)
#define BCH_EXTENT_ENTRY_MAX 5
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@ -552,7 +556,7 @@ struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:1,
cached:1,
erasure_coded:1,
unused:1,
reservation:1,
offset:44, /* 8 petabytes */
dev:8,
@ -562,23 +566,35 @@ struct bch_extent_ptr {
dev:8,
offset:44,
reservation:1,
erasure_coded:1,
unused:1,
cached:1,
type:1;
#endif
} __attribute__((packed, aligned(8)));
struct bch_extent_reservation {
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
unused:23,
block:8,
idx:51;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:51,
block:8,
type:5;
#endif
};
struct bch_extent_reservation {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:22,
replicas:4,
generation:32;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 generation:32,
replicas:4,
unused:23,
type:5;
unused:22,
type:6;
#endif
};
@ -701,7 +717,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
BCH_INODE_FIELD(bi_data_replicas, 8) \
BCH_INODE_FIELD(bi_promote_target, 16) \
BCH_INODE_FIELD(bi_foreground_target, 16) \
BCH_INODE_FIELD(bi_background_target, 16)
BCH_INODE_FIELD(bi_background_target, 16) \
BCH_INODE_FIELD(bi_erasure_code, 16)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
@ -711,7 +728,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
BCH_INODE_FIELD(bi_data_replicas) \
BCH_INODE_FIELD(bi_promote_target) \
BCH_INODE_FIELD(bi_foreground_target) \
BCH_INODE_FIELD(bi_background_target)
BCH_INODE_FIELD(bi_background_target) \
BCH_INODE_FIELD(bi_erasure_code)
enum {
/*
@ -871,6 +889,27 @@ struct bch_quota {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(quota, BCH_QUOTA);
/* Erasure coding */
enum {
BCH_STRIPE = 128,
};
struct bch_stripe {
struct bch_val v;
__le16 sectors;
__u8 algorithm;
__u8 nr_blocks;
__u8 nr_redundant;
__u8 csum_granularity_bits;
__u8 csum_type;
__u8 pad;
struct bch_extent_ptr ptrs[0];
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(stripe, BCH_STRIPE);
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@ -1060,7 +1099,7 @@ struct bch_sb_field_quota {
struct bch_disk_group {
__u8 label[BCH_SB_LABEL_SIZE];
__le64 flags[2];
};
} __attribute__((packed, aligned(8)));
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
@ -1069,7 +1108,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
struct bch_disk_group entries[0];
};
} __attribute__((packed, aligned(8)));
/*
* On clean shutdown, store btree roots and current journal sequence number in
@ -1235,12 +1274,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[2], 0, 4);
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
BCH_FEATURE_GZIP = 1,
BCH_FEATURE_ZSTD = 2,
BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
BCH_FEATURE_EC = 4,
BCH_FEATURE_NR,
};
@ -1407,7 +1449,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
DEF_BTREE_ID(XATTRS, 3, "xattrs") \
DEF_BTREE_ID(ALLOC, 4, "alloc") \
DEF_BTREE_ID(QUOTAS, 5, "quotas")
DEF_BTREE_ID(QUOTAS, 5, "quotas") \
DEF_BTREE_ID(EC, 6, "erasure_coding")
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
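
struct bch_stripe above is a variable-length value: nr_blocks pointers follow the fixed header, and per-block checksums (governed by csum_granularity_bits and csum_type) presumably follow the pointer array, though that layout is not shown in this hunk. A hedged sketch of how the value size would work out under that assumption, with a made-up helper name and a placeholder checksum width:

/*
 * Illustration only, based on assumptions about the on-disk layout: one
 * checksum per (1 << csum_granularity_bits) sectors per block, stored after
 * the pointer array.
 */
static unsigned demo_stripe_val_bytes(const struct bch_stripe *s)
{
	unsigned csums_per_block =
		DIV_ROUND_UP(le16_to_cpu(s->sectors),
			     1U << s->csum_granularity_bits);
	unsigned csum_bytes = 4;	/* e.g. crc32c; real width depends on csum_type */

	return sizeof(*s) +
	       s->nr_blocks * sizeof(struct bch_extent_ptr) +
	       s->nr_blocks * csums_per_block * csum_bytes;
}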

View File

@ -579,6 +579,8 @@ BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
BKEY_VAL_ACCESSORS(stripe, BCH_STRIPE);
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

View File

@ -4,6 +4,7 @@
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
@ -17,6 +18,7 @@ const struct bkey_ops bch2_bkey_ops[] = {
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
[BKEY_TYPE_EC] = bch2_bkey_ec_ops,
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};

View File

@ -14,6 +14,7 @@
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@ -113,6 +114,7 @@ static bool bkey_type_needs_gc(enum bkey_type type)
switch (type) {
case BKEY_TYPE_BTREE:
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_EC:
return true;
default:
return false;
@ -153,6 +155,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c,
}
}
break;
case BKEY_TYPE_EC:
switch (k.k->type) {
case BCH_STRIPE: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
ptr_gen_recalc_oldest(c, ptr, &max_stale);
}
}
default:
break;
}
@ -214,6 +227,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type,
}
}
break;
case BKEY_TYPE_EC:
switch (k.k->type) {
case BCH_STRIPE: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++) {
ret = ptr_gen_check(c, type, ptr);
if (ret)
return ret;
}
}
}
break;
default:
break;
}
@ -229,8 +257,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
{
struct gc_pos pos = { 0 };
unsigned flags =
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD|
BCH_BUCKET_MARK_GC|
(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
int ret = 0;
@ -359,15 +386,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
return 0;
}
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
return (int) btree_id_to_gc_phase(l) -
(int) btree_id_to_gc_phase(r);
}
static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
bool initial)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
for (i = 0; i < BTREE_ID_NR; i++) {
enum bkey_type type = bkey_type(0, i);
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
int ret = bch2_gc_btree(c, i, initial);
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
enum bkey_type type = bkey_type(0, id);
int ret = bch2_gc_btree(c, id, initial);
if (ret)
return ret;
@ -441,9 +480,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
if (c)
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@ -453,7 +489,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
if (c) {
percpu_up_read_preempt_enable(&c->usage_lock);
spin_unlock(&c->journal.lock);
} else {
preempt_enable();
}
@ -468,9 +503,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
mutex_unlock(&c->sb_lock);
}
@ -478,7 +511,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@ -490,13 +522,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
true, 0,
pos, &stats, 0,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
pos, NULL, 0,
BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
@ -517,8 +544,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
@ -526,8 +552,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
}
spin_unlock(&c->freelist_lock);
@ -541,8 +566,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
}
spin_unlock(&ob->lock);
}
@ -550,121 +574,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
percpu_up_read_preempt_enable(&c->usage_lock);
}
static void bch2_gc_start(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_member_device(ca, c, i) {
kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
}
free_percpu(c->usage[1]);
c->usage[1] = NULL;
}
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_array *buckets;
struct bucket_mark new;
unsigned i;
size_t b;
int cpu;
for_each_member_device(ca, c, i) {
struct bucket_array *src = __bucket_array(ca, 1);
memcpy(__bucket_array(ca, 0), src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * src->nbuckets);
};
for_each_member_device(ca, c, i) {
struct bch_dev_usage *p;
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
preempt_disable();
*this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
preempt_enable();
}
{
struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
struct bch_fs_usage *p;
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(c->usage[0], cpu);
memset(p, 0, offsetof(typeof(*p), online_reserved));
}
preempt_disable();
memcpy(this_cpu_ptr(c->usage[0]),
&src,
offsetof(typeof(*p), online_reserved));
preempt_enable();
}
}
static void bch2_gc_done(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
unsigned i;
int cpu;
#define copy_field(_f, _msg, ...) \
if (dst._f != src._f) { \
pr_info(_msg ": got %llu, should be %llu, fixing" \
, ##__VA_ARGS__, dst._f, src._f); \
dst._f = src._f; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
pr_info("dev %u bucket %zu has wrong " #_f \
": got %u, should be %u, fixing", \
i, b, dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
percpu_down_write(&c->usage_lock);
/*
* Indicates to buckets code that gc is now in progress - done under
* usage_lock to avoid racing with bch2_mark_key():
*/
__gc_pos_set(c, gc_phase(GC_PHASE_START));
if (initial) {
bch2_gc_done_nocheck(c);
goto out;
}
/* Save a copy of the existing bucket stats while we recompute them: */
for_each_member_device(ca, c, i) {
ca->usage_cached = __bch2_dev_usage_read(ca);
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
if (initial) {
memcpy(dst, src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * dst->nbuckets);
}
for (b = 0; b < src->nbuckets; b++) {
copy_bucket_field(gen);
copy_bucket_field(data_type);
copy_bucket_field(owned_by_allocator);
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
}
};
for_each_member_device(ca, c, i) {
struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
struct bch_dev_usage *p;
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
copy_dev_field(buckets_alloc, "buckets_alloc");
copy_dev_field(buckets_ec, "buckets_ec");
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(sectors[b],
"sectors[%s]", bch2_data_types[b]);
copy_dev_field(sectors_fragmented,
"sectors_fragmented");
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
preempt_disable();
p = this_cpu_ptr(ca->usage[0]);
*p = dst;
preempt_enable();
}
{
struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
struct bch_fs_usage *p;
unsigned r, b;
for (r = 0; r < BCH_REPLICAS_MAX; r++) {
for (b = 0; b < BCH_DATA_NR; b++)
copy_fs_field(replicas[r].data[b],
"replicas[%i].data[%s]",
r, bch2_data_types[b]);
copy_fs_field(replicas[r].ec_data,
"replicas[%i].ec_data", r);
copy_fs_field(replicas[r].persistent_reserved,
"replicas[%i].persistent_reserved", r);
}
for (b = 0; b < BCH_DATA_NR; b++)
copy_fs_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(c->usage[0], cpu);
memset(p, 0, offsetof(typeof(*p), online_reserved));
}
preempt_disable();
p = this_cpu_ptr(c->usage[0]);
memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
preempt_enable();
}
out:
percpu_up_write(&c->usage_lock);
#undef copy_field
#undef copy_fs_field
#undef copy_dev_field
#undef copy_bucket_field
}
static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
BUG_ON(c->usage[1]);
c->usage[1] = alloc_percpu(struct bch_fs_usage);
if (!c->usage[1])
return -ENOMEM;
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets[1]) {
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
}
c->usage_cached = __bch2_fs_usage_read(c);
for_each_possible_cpu(cpu) {
struct bch_fs_usage *p =
per_cpu_ptr(c->usage_percpu, cpu);
percpu_down_write(&c->usage_lock);
memset(p->replicas, 0, sizeof(p->replicas));
memset(p->buckets, 0, sizeof(p->buckets));
}
for_each_member_device(ca, c, i) {
struct bucket_array *dst = __bucket_array(ca, 1);
struct bucket_array *src = __bucket_array(ca, 0);
size_t b;
dst->first_bucket = src->first_bucket;
dst->nbuckets = src->nbuckets;
for (b = 0; b < src->nbuckets; b++)
dst->b[b]._mark.gen = src->b[b].mark.gen;
};
percpu_up_write(&c->usage_lock);
/* Clear bucket marks: */
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
bucket_cmpxchg(buckets->b + b, new, ({
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
ca->oldest_gens[b] = new.gen;
}
up_read(&ca->bucket_lock);
}
return 0;
}
/**
* bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
* bch2_gc - walk _all_ references to buckets, and recompute them:
*
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
* gc_will_visit_root()
*
* - also, references move around in the course of index updates and
* various other crap: everything needs to agree on the ordering
* references are allowed to move around in - e.g., we're allowed to
* start with a reference owned by an open_bucket (the allocator) and
* move it to the btree, but not the reverse.
*
* This is necessary to ensure that gc doesn't miss references that
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
void bch2_gc(struct bch_fs *c)
int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
unsigned i;
unsigned i, iter = 0;
int ret;
/*
* Walk _all_ references to buckets, and recompute them:
*
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
* gc_will_visit_root()
*
* - also, references move around in the course of index updates and
* various other crap: everything needs to agree on the ordering
* references are allowed to move around in - e.g., we're allowed to
* start with a reference owned by an open_bucket (the allocator) and
* move it to the btree, but not the reverse.
*
* This is necessary to ensure that gc doesn't miss references that
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
trace_gc_start(c);
/*
* Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
* gc_lock if sectors_available goes to 0:
*/
bch2_recalc_sectors_available(c);
down_write(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
again:
ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_gc_start(c);
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, NULL, false);
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
ret = bch2_gc_btrees(c, journal, initial);
if (ret)
goto out;
}
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
/* Indicates that gc is no longer in progress: */
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
c->gc_count++;
out:
if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
/*
* XXX: make sure gens we fixed got saved
*/
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
goto again;
}
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
}
if (!ret)
bch2_gc_done(c, initial);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_START));
bch2_gc_free(c);
up_write(&c->gc_lock);
if (!ret && initial)
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
trace_gc_end(c);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
@ -680,6 +893,7 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
return ret;
}
/* Btree coalescing */
@ -995,9 +1209,6 @@ void bch2_coalesce(struct bch_fs *c)
{
enum btree_id id;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
down_read(&c->gc_lock);
trace_gc_coalesce_start(c);
@ -1009,7 +1220,6 @@ void bch2_coalesce(struct bch_fs *c)
if (ret) {
if (ret != -ESHUTDOWN)
bch_err(c, "btree coalescing failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
return;
}
}
@ -1024,6 +1234,7 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
set_freezable();
@ -1057,7 +1268,9 @@ static int bch2_gc_thread(void *arg)
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
bch2_gc(c);
ret = bch2_gc(c, NULL, false);
if (ret)
bch_err(c, "btree gc failed: %i", ret);
debug_check_no_locks_held();
}
@ -1098,30 +1311,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
unsigned iter = 0;
int ret = 0;
down_write(&c->gc_lock);
again:
bch2_gc_start(c);
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, journal, true);
if (ret)
goto err;
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
goto err;
}
bch_info(c, "Fixed gens, restarting initial mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
goto again;
}
int ret = bch2_gc(c, journal, true);
/*
* Skip past versions that might have possibly been used (as nonces),
@ -1130,9 +1320,5 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
err:
up_write(&c->gc_lock);
return ret;
}
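
The gc rework above keeps two copies of bucket marks and usage counters (index 0 live, index 1 recomputed by gc) and reconciles them in bch2_gc_done() via the copy_field()/copy_bucket_field() macros. The same pattern, distilled into a standalone, hypothetical helper purely for illustration:

/*
 * Recompute counters into a second copy during gc, then reconcile with the
 * live copy, logging and fixing any drift.
 */
static void demo_gc_reconcile(u64 *live, const u64 *gc, unsigned nr,
			      const char *what)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		if (live[i] != gc[i]) {
			pr_info("%s[%u]: got %llu, should be %llu, fixing",
				what, i, live[i], gc[i]);
			live[i] = gc[i];
		}
}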

View File

@ -6,7 +6,7 @@
enum bkey_type;
void bch2_coalesce(struct bch_fs *);
void bch2_gc(struct bch_fs *);
int bch2_gc(struct bch_fs *, struct list_head *, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
int bch2_initial_gc(struct bch_fs *, struct list_head *);
@ -54,11 +54,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
return 0;
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
{
switch (id) {
#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
DEFINE_BCH_BTREE_IDS()
#undef DEF_BTREE_ID
default:
BUG();
}
}
static inline struct gc_pos gc_pos_btree(enum btree_id id,
struct bpos pos, unsigned level)
{
return (struct gc_pos) {
.phase = GC_PHASE_BTREE_EXTENTS + id,
.phase = btree_id_to_gc_phase(id),
.pos = pos,
.level = level,
};
@ -93,14 +104,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
ret = gc_pos_cmp(c->gc_pos, pos) < 0;
ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;

View File

@ -817,7 +817,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
*/
iter->level = depth_want;
iter->l[iter->level].b = NULL;
return 0;
return 1;
}
lock_type = __btree_lock_want(iter, iter->level);
@ -1044,6 +1044,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
? btree_iter_down(iter)
: btree_iter_lock_root(iter, depth_want);
if (unlikely(ret)) {
if (ret == 1)
return 0;
iter->level = depth_want;
iter->l[iter->level].b = BTREE_ITER_NOT_END;
return ret;

View File

@ -159,7 +159,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
{
struct bch_fs *c = as->c;
struct pending_btree_node_free *d;
unsigned replicas;
/*
* btree_update lock is only needed here to avoid racing with
@ -177,15 +176,6 @@ found:
BUG_ON(d->index_update_done);
d->index_update_done = true;
/*
* Btree nodes are accounted as freed in bch_alloc_stats when they're
* freed from the index:
*/
replicas = bch2_extent_nr_dirty_ptrs(k);
if (replicas)
stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
c->opts.btree_node_size * replicas;
/*
* We're dropping @k from the btree, but it's still live until the
* index update is persistent so we need to keep a reference around for
@ -207,15 +197,16 @@ found:
* bch2_mark_key() compares the current gc pos to the pos we're
* moving this reference from, hence one comparison here:
*/
if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct bch_fs_usage tmp = { 0 };
if (gc_pos_cmp(c->gc_pos, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct gc_pos pos = { 0 };
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
false, 0, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
&tmp, 0, 0);
false, 0, pos,
NULL, 0, BCH_BUCKET_MARK_GC);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@ -286,19 +277,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
{
struct bch_fs_usage stats = { 0 };
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&pending->key),
false, 0,
gc_phase(GC_PHASE_PENDING_DELETE),
&stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
NULL, 0, 0);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -339,7 +324,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
@ -637,12 +622,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
int ret;
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
if (!ret) {
if (ret == -EAGAIN) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
if (ret < 0)
goto err;
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:

View File

@ -343,19 +343,40 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
trans_for_each_entry(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
return ret;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
while ((ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
struct btree_iter *iter = trans->entries[0].iter;
struct closure cl;
bch2_btree_iter_unlock(iter);
closure_init_stack(&cl);
while ((ret = bch2_journal_open_seq_async(&c->journal,
trans->journal_res.seq,
&cl)) == -EAGAIN)
closure_sync(&cl);
if (ret)
return ret;
if (!bch2_btree_iter_relock(iter)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
}
if (ret)
return ret;
}
multi_lock_write(c, trans);

View File

@ -68,6 +68,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "ec.h"
#include "error.h"
#include "movinggc.h"
@ -83,8 +84,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
static void bch2_fs_stats_verify(struct bch_fs *c)
{
struct bch_fs_usage stats =
__bch2_fs_usage_read(c);
struct bch_fs_usage stats = __bch2_fs_usage_read(c, 0);
unsigned i, j;
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@ -207,43 +207,24 @@ do { \
_acc; \
})
#define bch2_usage_read_cached(_c, _cached, _uncached) \
({ \
typeof(_cached) _ret; \
unsigned _seq; \
\
do { \
_seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
_ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
? bch2_usage_read_raw(_uncached) \
: (_cached); \
} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
\
_ret; \
})
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
{
return bch2_usage_read_raw(ca->usage_percpu);
return bch2_usage_read_raw(ca->usage[gc]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
return bch2_usage_read_raw(ca->usage[0]);
}
struct bch_fs_usage
__bch2_fs_usage_read(struct bch_fs *c)
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
{
return bch2_usage_read_raw(c->usage_percpu);
return bch2_usage_read_raw(c->usage[gc]);
}
struct bch_fs_usage
bch2_fs_usage_read(struct bch_fs *c)
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
{
return bch2_usage_read_cached(c,
c->usage_cached,
c->usage_percpu);
return bch2_usage_read_raw(c->usage[0]);
}
struct fs_usage_sum {
@ -269,6 +250,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
sum.data += stats.replicas[i].data[BCH_DATA_BTREE];
sum.data += stats.replicas[i].data[BCH_DATA_USER];
sum.data += stats.replicas[i].ec_data;
sum.cached += stats.replicas[i].data[BCH_DATA_CACHED];
sum.reserved += stats.replicas[i].persistent_reserved;
}
@ -324,13 +306,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
: m.data_type;
}
static bool bucket_became_unavailable(struct bch_fs *c,
struct bucket_mark old,
static bool bucket_became_unavailable(struct bucket_mark old,
struct bucket_mark new)
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
(!c || c->gc_pos.phase == GC_PHASE_DONE);
!is_available_bucket(new);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@ -360,12 +340,14 @@ void bch2_fs_usage_apply(struct bch_fs *c,
percpu_down_read_preempt_disable(&c->usage_lock);
/* online_reserved not subject to gc: */
this_cpu_ptr(c->usage_percpu)->online_reserved +=
this_cpu_ptr(c->usage[0])->online_reserved +=
stats->online_reserved;
stats->online_reserved = 0;
if (!gc_will_visit(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
if (gc_visited(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
bch2_fs_stats_verify(c);
percpu_up_read_preempt_enable(&c->usage_lock);
@ -374,8 +356,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *stats,
struct bucket_mark old, struct bucket_mark new)
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
{
struct bch_dev_usage *dev_usage;
@ -387,16 +370,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
dev_usage = this_cpu_ptr(ca->usage[gc]);
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->buckets[bucket_type(old)]--;
dev_usage->buckets[bucket_type(new)]++;
if (bucket_type(old) != bucket_type(new)) {
if (bucket_type(old)) {
fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
dev_usage->buckets[bucket_type(old)]--;
} else {
fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
dev_usage->buckets[bucket_type(new)]++;
}
}
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_ec +=
(int) new.stripe - (int) old.stripe;
dev_usage->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
@ -417,21 +406,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bch2_dev_usage_update(c, ca, stats, _old, new); \
bch2_dev_usage_update(c, ca, stats, _old, new, gc); \
_old; \
})
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old,
bool gc)
{
struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
struct bucket *g;
struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
BUG_ON(!is_available_bucket(new));
@ -442,38 +428,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++;
}));
/*
* This isn't actually correct yet, since fs usage is still
* uncompressed sectors:
*/
stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
percpu_rwsem_assert_held(&c->usage_lock);
__bch2_invalidate_bucket(c, ca, b, old, false);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
{
struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
struct bucket *g;
struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
c->gc_pos.phase == GC_PHASE_DONE);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
}
#define checked_add(a, b) \
@ -483,35 +480,47 @@ do { \
BUG_ON((a) != _res); \
} while (0)
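
checked_add() above guards against truncation: bucket sector counts are narrow fields (u16 in struct bucket_mark), so the sum must still fit after the store. A standalone sketch of the same pattern, assuming a u16 counter (the name checked_add_u16 is illustrative, not from the tree):

#include <assert.h>
#include <stdint.h>

static inline void checked_add_u16(uint16_t *a, unsigned b)
{
	unsigned res = *a + b;

	/* in the kernel this is BUG_ON((a) != _res) */
	assert(res == (uint16_t) res);
	*a = (uint16_t) res;
}
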
static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
fs_usage->replicas[0].data[type] += sectors;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
struct bch_fs_usage *stats;
struct bucket *g;
struct bucket_mark old, new;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
if (likely(c)) {
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
stats = this_cpu_ptr(c->usage_percpu);
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
stats->replicas[0].data[type] += sectors;
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
true);
} else {
struct bucket *g;
struct bucket_mark old, new;
rcu_read_lock();
g = bucket(ca, b);
@ -522,9 +531,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
rcu_read_unlock();
}
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
}
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@ -569,23 +575,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_BUCKET(ca, &p.ptr);
size_t b = PTR_BUCKET_NR(ca, &p.ptr);
struct bucket *g = __bucket(ca, b, gc);
u64 v;
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}));
return;
}
v = atomic64_read(&g->_mark.v);
do {
new.v.counter = old.v.counter = v;
@ -627,17 +625,59 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
BUG_ON(!gc && bucket_became_unavailable(old, new));
}
static void bch2_mark_stripe_ptr(struct bch_fs *c,
struct bch_extent_stripe_ptr p,
s64 sectors, unsigned flags,
s64 *adjusted_disk_sectors,
unsigned *redundancy)
{
struct ec_stripe *m;
unsigned old, new, nr_data;
int blocks_nonempty_delta;
s64 parity_sectors;
m = genradix_ptr(&c->ec_stripes, p.idx);
if (WARN_ON(!m))
return;
if (WARN_ON(!m->alive))
return;
nr_data = m->nr_blocks - m->nr_redundant;
parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
if (sectors < 0)
parity_sectors = -parity_sectors;
*adjusted_disk_sectors += parity_sectors;
*redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
new = atomic_add_return(sectors, &m->block_sectors[p.block]);
old = new - sectors;
blocks_nonempty_delta = (int) !!new - (int) !!old;
if (!blocks_nonempty_delta)
return;
atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
bch2_stripes_heap_update(c, m, p.idx);
}
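
The parity adjustment above charges a data write its share of the stripe's parity, DIV_ROUND_UP(sectors * nr_redundant, nr_data). A worked example with a hypothetical 4+2 stripe (illustrative only, not part of the patch):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned nr_blocks = 6, nr_redundant = 2;	/* a 4+2 stripe */
	unsigned nr_data = nr_blocks - nr_redundant;
	long sectors = 8;				/* data sectors written */
	long parity = DIV_ROUND_UP(sectors * nr_redundant, nr_data);

	/* 8 data sectors + 4 parity sectors = 12 adjusted disk sectors */
	printf("adjusted disk sectors: %ld\n", sectors + parity);
	return 0;
}
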
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, enum bch_data_type data_type,
struct gc_pos pos,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
BUG_ON(!sectors);
@ -649,28 +689,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
struct extent_ptr_decoded p;
s64 cached_sectors = 0;
s64 dirty_sectors = 0;
s64 ec_sectors = 0;
unsigned replicas = 0;
unsigned ec_redundancy = 0;
unsigned i;
extent_for_each_ptr_decode(e, p, entry) {
s64 disk_sectors = ptr_disk_sectors(e, p, sectors);
s64 adjusted_disk_sectors = disk_sectors;
bch2_mark_pointer(c, e, p, disk_sectors, data_type,
stats, journal_seq, flags);
stats, journal_seq, flags, gc);
if (!p.ptr.cached)
for (i = 0; i < p.ec_nr; i++)
bch2_mark_stripe_ptr(c, p.ec[i],
disk_sectors, flags,
&adjusted_disk_sectors,
&ec_redundancy);
if (!p.ptr.cached)
replicas++;
if (p.ptr.cached)
cached_sectors += disk_sectors;
cached_sectors += adjusted_disk_sectors;
else if (!p.ec_nr)
dirty_sectors += adjusted_disk_sectors;
else
dirty_sectors += disk_sectors;
ec_sectors += adjusted_disk_sectors;
}
replicas = clamp_t(unsigned, replicas,
1, ARRAY_SIZE(stats->replicas));
ec_redundancy = clamp_t(unsigned, ec_redundancy,
1, ARRAY_SIZE(stats->replicas));
stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors;
stats->replicas[replicas - 1].data[data_type] += dirty_sectors;
stats->replicas[ec_redundancy - 1].ec_data += ec_sectors;
break;
}
case BCH_RESERVATION: {
@ -686,6 +741,105 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
}
}
static void bucket_set_stripe(struct bch_fs *c,
const struct bch_stripe *v,
bool enabled,
struct bch_fs_usage *fs_usage,
u64 journal_seq,
bool gc)
{
unsigned i;
for (i = 0; i < v->nr_blocks; i++) {
const struct bch_extent_ptr *ptr = v->ptrs + i;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new, old;
BUG_ON(ptr_stale(ca, ptr));
old = bucket_cmpxchg(g, new, ({
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
BUG_ON(old.stripe == enabled);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
}
}
static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
bool inserting,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags,
bool gc)
{
switch (k.k->type) {
case BCH_STRIPE: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
size_t idx = s.k->p.offset;
struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
unsigned i;
BUG_ON(!m);
BUG_ON(m->alive == inserting);
BUG_ON(atomic_read(&m->blocks_nonempty));
for (i = 0; i < EC_STRIPE_MAX; i++)
BUG_ON(atomic_read(&m->block_sectors[i]));
if (inserting) {
m->sectors = le16_to_cpu(s.v->sectors);
m->algorithm = s.v->algorithm;
m->nr_blocks = s.v->nr_blocks;
m->nr_redundant = s.v->nr_redundant;
}
if (inserting)
bch2_stripes_heap_insert(c, m, idx);
else
bch2_stripes_heap_del(c, m, idx);
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
break;
}
}
}
static void __bch2_mark_key(struct bch_fs *c,
enum bkey_type type, struct bkey_s_c k,
bool inserting, s64 sectors,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags,
bool gc)
{
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EC:
bch2_mark_stripe(c, k, inserting,
stats, journal_seq, flags, gc);
break;
default:
break;
}
}
void bch2_mark_key(struct bch_fs *c,
enum bkey_type type, struct bkey_s_c k,
bool inserting, s64 sectors,
@ -693,57 +847,23 @@ void bch2_mark_key(struct bch_fs *c,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
/*
* synchronization w.r.t. GC:
*
* Normally, bucket sector counts/marks are updated on the fly, as
* references are added/removed from the btree, the lists of buckets the
* allocator owns, other metadata buckets, etc.
*
* When GC is in progress and going to mark this reference, we do _not_
* mark this reference here, to avoid double counting - GC will count it
* when it gets to it.
*
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
* compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
* current position will be less than @pos; if GC's current position is
* greater than @pos GC has either already walked this position, or
* isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
* - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
percpu_down_read_preempt_disable(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
if (!stats)
stats = this_cpu_ptr(c->usage_percpu);
if (!(flags & BCH_BUCKET_MARK_GC)) {
if (!stats)
stats = this_cpu_ptr(c->usage[0]);
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
pos, stats, journal_seq, flags);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
pos, stats, journal_seq, flags);
break;
default:
break;
__bch2_mark_key(c, type, k, inserting, sectors,
stats, journal_seq, flags, false);
}
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos)) {
__bch2_mark_key(c, type, k, inserting, sectors,
this_cpu_ptr(c->usage[1]),
journal_seq, flags, true);
}
percpu_up_read_preempt_enable(&c->usage_lock);
}
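
bch2_mark_key() now dispatches the same update to up to two sets of counters: usage[0] is always the live copy, and usage[1] additionally receives the update when GC has already visited this position, or when the caller passes BCH_BUCKET_MARK_GC. A minimal sketch of that pattern reduced to plain counters (illustrative only, not bcachefs code):

#include <stdbool.h>

struct counters {
	long live;	/* plays the role of usage[0] */
	long gc;	/* plays the role of usage[1] */
};

static void mark(struct counters *c, long delta, bool gc_only, bool gc_visited)
{
	if (!gc_only)
		c->live += delta;

	/* keep GC's recount consistent with updates that race with it */
	if (gc_only || gc_visited)
		c->gc += delta;
}
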
@ -819,28 +939,20 @@ void bch2_mark_update(struct btree_insert *trans,
/* Disk reservations: */
static u64 __recalc_sectors_available(struct bch_fs *c)
static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
}
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_down_write(&c->usage_lock);
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
percpu_up_write(&c->usage_lock);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read_preempt_disable(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
bch2_fs_stats_verify(c);
@ -860,7 +972,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
int ret;
percpu_down_read_preempt_disable(&c->usage_lock);
stats = this_cpu_ptr(c->usage_percpu);
stats = this_cpu_ptr(c->usage[0]);
if (sectors <= stats->available_cache)
goto out;
@ -908,7 +1020,7 @@ recalculate:
}
percpu_down_write(&c->usage_lock);
sectors_available = __recalc_sectors_available(c);
sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
@ -949,6 +1061,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_dirty = NULL;
unsigned long *buckets_written = NULL;
u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
@ -962,7 +1075,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve);
bool resize = ca->buckets != NULL,
bool resize = ca->buckets[0] != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
unsigned i;
@ -980,6 +1093,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
!(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
!init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
@ -1014,13 +1130,17 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(buckets_dirty,
ca->buckets_dirty,
BITS_TO_LONGS(n) * sizeof(unsigned long));
memcpy(buckets_written,
ca->buckets_written,
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets, buckets);
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_dirty, buckets_dirty);
swap(ca->buckets_written, buckets_written);
if (resize)
percpu_up_write(&c->usage_lock);
@ -1060,6 +1180,8 @@ err:
free_fifo(&free[i]);
kvpfree(buckets_dirty,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(buckets_written,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(oldest_gens,
nbuckets * sizeof(u8));
if (buckets)
@ -1077,19 +1199,21 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
kvpfree(ca->buckets_written,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets, 1),
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);
free_percpu(ca->usage[0]);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
return -ENOMEM;
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);


@ -28,23 +28,34 @@
_old; \
})
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
bool gc)
{
return rcu_dereference_check(ca->buckets,
return rcu_dereference_check(ca->buckets[gc],
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
return __bucket_array(ca, false);
}
static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
{
struct bucket_array *buckets = __bucket_array(ca, gc);
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
return buckets->b + b;
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
return __bucket(ca, b, false);
}
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
@ -128,7 +139,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Device usage: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
@ -167,7 +178,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
@ -184,6 +195,7 @@ static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
!mark.dirty_sectors &&
!mark.stripe &&
!mark.nouse);
}
@ -205,17 +217,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
#define BCH_BUCKET_MARK_GC (1 << 1)
void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
void bch2_recalc_sectors_available(struct bch_fs *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,


@ -18,7 +18,8 @@ struct bucket_mark {
gen_valid:1,
owned_by_allocator:1,
nouse:1,
journal_seq_valid:1;
journal_seq_valid:1,
stripe:1;
u16 dirty_sectors;
u16 cached_sectors;
@ -52,6 +53,7 @@ struct bucket_array {
struct bch_dev_usage {
u64 buckets[BCH_DATA_NR];
u64 buckets_alloc;
u64 buckets_ec;
u64 buckets_unavailable;
/* _compressed_ sectors: */
@ -61,15 +63,18 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[BCH_DATA_NR];
u64 ec_data;
u64 persistent_reserved;
} replicas[BCH_REPLICAS_MAX];
u64 buckets[BCH_DATA_NR];
/* fields starting here aren't touched by gc: */
u64 online_reserved;
u64 available_cache;
};
/*


@ -601,11 +601,13 @@ have_compressed:
goto out;
}
ret = mempool_init_kmalloc_pool(
&c->decompress_workspace,
1, decompress_workspace_size);
if (ret)
goto out;
if (!mempool_initialized(&c->decompress_workspace)) {
ret = mempool_init_kmalloc_pool(
&c->decompress_workspace,
1, decompress_workspace_size);
if (ret)
goto out;
}
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;


@ -54,6 +54,19 @@ static inline struct target target_decode(unsigned target)
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
enum bch_data_type data_type,
u16 target)
{
struct bch_devs_mask devs = c->rw_devs[data_type];
const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
if (t)
bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
return devs;
}
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);

libbcachefs/ec.c: new file, 1283 lines (diff suppressed because it is too large)

libbcachefs/ec.h: new file, 108 lines

@ -0,0 +1,108 @@
#ifndef _BCACHEFS_EC_H
#define _BCACHEFS_EC_H
#include "ec_types.h"
#include "keylist_types.h"
const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
#define bch2_bkey_ec_ops (struct bkey_ops) { \
.key_invalid = bch2_ec_key_invalid, \
.val_to_text = bch2_ec_key_to_text, \
}
struct bch_read_bio;
struct ec_stripe_buf {
/* might not be buffering the entire stripe: */
unsigned offset;
unsigned size;
unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
void *data[EC_STRIPE_MAX];
union {
struct bkey_i_stripe key;
u64 pad[255];
};
};
struct ec_stripe_head;
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
struct mutex lock;
struct list_head list;
/* counts in-flight writes; the stripe is created when pin == 0 */
atomic_t pin;
int err;
unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
struct open_buckets blocks;
struct open_buckets parity;
struct keylist keys;
u64 inline_keys[BKEY_U64s * 8];
struct ec_stripe_buf stripe;
};
struct ec_stripe_head {
struct list_head list;
struct mutex lock;
struct list_head stripes;
unsigned target;
unsigned algo;
unsigned redundancy;
struct bch_devs_mask devs;
unsigned nr_active_devs;
unsigned blocksize;
struct dev_stripe_state block_stripe;
struct dev_stripe_state parity_stripe;
struct open_buckets blocks;
struct open_buckets parity;
struct ec_stripe_new *s;
};
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
struct bpos, unsigned);
void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct ec_stripe_head *);
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
unsigned, unsigned);
void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t);
void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t);
void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t);
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
int bch2_fs_ec_start(struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
#endif /* _BCACHEFS_EC_H */

libbcachefs/ec_types.h: new file, 30 lines

@ -0,0 +1,30 @@
#ifndef _BCACHEFS_EC_TYPES_H
#define _BCACHEFS_EC_TYPES_H
#include <linux/llist.h>
#define EC_STRIPE_MAX 16
struct ec_stripe {
size_t heap_idx;
u16 sectors;
u8 algorithm;
u8 nr_blocks;
u8 nr_redundant;
u8 alive;
atomic_t pin;
atomic_t blocks_nonempty;
atomic_t block_sectors[EC_STRIPE_MAX];
};
struct ec_stripe_heap_entry {
size_t idx;
unsigned blocks_nonempty;
};
typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
#endif /* _BCACHEFS_EC_TYPES_H */
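
As bch2_mark_stripe_ptr() earlier in this patch maintains it, blocks_nonempty equals the number of blocks whose block_sectors count is nonzero. A standalone restatement of that invariant (illustrative helper, not from the tree):

static unsigned count_nonempty_blocks(const unsigned block_sectors[],
				      unsigned nr_blocks)
{
	unsigned i, nonempty = 0;

	for (i = 0; i < nr_blocks; i++)
		nonempty += block_sectors[i] != 0;

	/* equals the blocks_nonempty counter the marking code maintains */
	return nonempty;
}
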


@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
struct extent_ptr_decoded p)
{
unsigned i, durability = 0;
struct bch_dev *ca;
if (ptr->cached)
if (p.ptr.cached)
return 0;
ca = bch_dev_bkey_exists(c, ptr->dev);
ca = bch_dev_bkey_exists(c, p.ptr.dev);
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
return 0;
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
return ca->mi.durability;
for (i = 0; i < p.ec_nr; i++) {
struct ec_stripe *s =
genradix_ptr(&c->ec_stripes, p.idx);
if (WARN_ON(!s))
continue;
durability = max_t(unsigned, durability, s->nr_redundant);
}
return durability;
}
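
The durability rule above: a cached pointer contributes nothing; otherwise a pointer contributes its device's durability (unless the device is failed), raised to the redundancy of any stripe it participates in. A standalone restatement, simplified to a single stripe (illustrative only):

#include <stdbool.h>

static unsigned ptr_durability(bool cached, bool dev_failed,
			       unsigned dev_durability,
			       unsigned stripe_redundancy)
{
	unsigned d = 0;

	if (cached)
		return 0;

	if (!dev_failed)
		d = dev_durability;

	/* a pointer in a stripe can also be rebuilt from parity: */
	if (stripe_redundancy > d)
		d = stripe_redundancy;

	return d;
}
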
unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned durability = 0;
extent_for_each_ptr(e, ptr)
durability += bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr_decode(e, p, entry)
durability += bch2_extent_ptr_durability(c, p);
return durability;
}
@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
return false;
}
static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
union bch_extent_entry *entry)
{
union bch_extent_entry *i = e.v->start;
if (i == entry)
return NULL;
while (extent_entry_next(i) != entry)
i = extent_entry_next(i);
return i;
}
union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
struct bch_extent_ptr *ptr)
{
union bch_extent_entry *dst;
union bch_extent_entry *src;
union bch_extent_entry *dst, *src, *prev;
bool drop_crc = true;
EBUG_ON(ptr < &e.v->start->ptr ||
ptr >= &extent_entry_last(e)->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
src = to_entry(ptr + 1);
src = extent_entry_next(to_entry(ptr));
if (src != extent_entry_last(e) &&
extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) {
dst = to_entry(ptr);
} else {
extent_for_each_entry(e, dst) {
if (dst == to_entry(ptr))
break;
!extent_entry_is_crc(src))
drop_crc = false;
if (extent_entry_next(dst) == to_entry(ptr) &&
extent_entry_is_crc(dst))
break;
dst = to_entry(ptr);
while ((prev = extent_entry_prev(e, dst))) {
if (extent_entry_is_ptr(prev))
break;
if (extent_entry_is_crc(prev)) {
if (drop_crc)
dst = prev;
break;
}
dst = prev;
}
memmove_u64s_down(dst, src,
@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc128.csum.lo = (__force __le64)
swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
}
}
break;
@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
const struct bch_extent_stripe_ptr *ec;
struct bch_dev *ca;
bool first = true;
@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
pr_buf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
ptr->cached ? " cached" : "",
ca && ptr_stale(ca, ptr)
? " stale" : "");
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
crc.csum_type,
crc.compression_type);
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
ptr->cached ? " cached" : "",
ca && ptr_stale(ca, ptr)
? " stale" : "");
pr_buf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
default:
pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
f = &failed->devs[failed->nr++];
f->dev = p->ptr.dev;
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else if (p->idx != f->idx) {
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else {
@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
{
struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
if (likely(!p1.idx && !p2.idx)) {
struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
/* Pick at random, biased in favor of the faster device: */
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
return bch2_rand_range(l1 + l2) > l1;
}
if (force_reconstruct_read(c))
return p1.idx > p2.idx;
return p1.idx < p2.idx;
}
static int extent_pick_read_device(struct bch_fs *c,
@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c,
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
if (f && f->nr_failed >= f->nr_retries)
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
if (!p.idx &&
!bch2_dev_is_readable(ca))
p.idx++;
if (force_reconstruct_read(c) &&
!p.idx && p.ec_nr)
p.idx++;
if (p.idx >= p.ec_nr + 1)
continue;
if (ret && !ptr_better(c, p, *pick))
@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry))
return "has crc field";
if (!extent_entry_is_ptr(entry))
return "has non ptr field";
}
extent_for_each_ptr(e, ptr) {
@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
case BCH_EXTENT_ENTRY_crc128:
entry->crc128.offset += e.k->size - len;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
}
if (extent_entry_is_crc(entry))
@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
reason = extent_ptr_invalid(c, e, &entry->ptr,
size_ondisk, false);
if (reason)
return reason;
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
if (crc.offset + e.k->size >
@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
else if (nonce != crc.offset + crc.nonce)
return "incorrect nonce";
}
} else {
ptr = entry_to_ptr(entry);
reason = extent_ptr_invalid(c, e, &entry->ptr,
size_ondisk, false);
if (reason)
return reason;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
}
}
@ -1744,6 +1815,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
{
struct bch_extent_crc_unpacked crc;
union bch_extent_entry *pos;
unsigned i;
extent_for_each_crc(extent_i_to_s(e), crc, pos)
if (!bch2_crc_unpacked_cmp(crc, p->crc))
@ -1754,6 +1826,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
found:
p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
__extent_entry_insert(e, pos, to_entry(&p->ptr));
for (i = 0; i < p->ec_nr; i++) {
p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
__extent_entry_insert(e, pos, to_entry(&p->ec[i]));
}
}
/*
@ -1808,26 +1885,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
unsigned target,
unsigned nr_desired_replicas)
{
struct bch_extent_ptr *ptr;
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
if (target && extra > 0)
extent_for_each_ptr(e, ptr) {
int n = bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr_decode(e, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra &&
!bch2_dev_in_target(c, ptr->dev, target)) {
ptr->cached = true;
!bch2_dev_in_target(c, p.ptr.dev, target)) {
entry->ptr.cached = true;
extra -= n;
}
}
if (extra > 0)
extent_for_each_ptr(e, ptr) {
int n = bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr_decode(e, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra) {
ptr->cached = true;
entry->ptr.cached = true;
extra -= n;
}
}
@ -1903,7 +1981,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
if ((extent_entry_type(en_l) !=
extent_entry_type(en_r)) ||
extent_entry_is_crc(en_l))
!extent_entry_is_ptr(en_l))
return BCH_MERGE_NOMERGE;
lp = &en_l->ptr;


@ -95,8 +95,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
unsigned bch2_extent_ptr_durability(struct bch_fs *,
const struct bch_extent_ptr *);
unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@ -361,20 +359,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
/* Iterate over pointers, with crcs: */
static inline struct extent_ptr_decoded
__extent_ptr_decoded_init(const struct bkey *k)
{
return (struct extent_ptr_decoded) {
.crc = bch2_extent_crc_unpack(k, NULL),
};
}
#define EXTENT_ITERATE_EC (1 << 0)
#define __extent_ptr_next_decode(_e, _ptr, _entry) \
({ \
__label__ out; \
\
(_ptr).idx = 0; \
(_ptr).ec_nr = 0; \
\
extent_for_each_entry_from(_e, _entry, _entry) \
switch (extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
@ -386,14 +377,16 @@ __extent_ptr_decoded_init(const struct bkey *k)
(_ptr).crc = bch2_extent_crc_unpack((_e).k, \
entry_to_crc(_entry)); \
break; \
case BCH_EXTENT_ENTRY_stripe_ptr: \
(_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \
break; \
} \
\
out: \
_entry < extent_entry_last(_e); \
})
#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
for ((_ptr) = __extent_ptr_decoded_init((_e).k), \
for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL), \
(_entry) = (_e).v->start; \
__extent_ptr_next_decode(_e, _ptr, _entry); \
(_entry) = extent_entry_next(_entry))


@ -19,14 +19,18 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
unsigned idx;
unsigned ec_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec[4];
};
struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
u8 idx;
u8 nr_failed;
u8 nr_retries;
} devs[BCH_REPLICAS_MAX];


@ -454,12 +454,12 @@ struct bch_page_state {
union { struct {
/* existing data: */
unsigned sectors:PAGE_SECTOR_SHIFT + 1;
unsigned nr_replicas:4;
unsigned compressed:1;
/* Owns PAGE_SECTORS sized reservation: */
unsigned reserved:1;
unsigned reservation_replicas:4;
/* Uncompressed, fully allocated replicas: */
unsigned nr_replicas:4;
/* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
unsigned replicas_reserved:4;
/* Owns PAGE_SECTORS sized quota reservation: */
unsigned quota_reserved:1;
@ -506,7 +506,7 @@ static inline struct bch_page_state *page_state(struct page *page)
static inline unsigned page_res_sectors(struct bch_page_state s)
{
return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0;
return s.replicas_reserved * PAGE_SECTORS;
}
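
With the reworked page state, a page's disk reservation is simply replicas_reserved times a page's worth of sectors. Worked arithmetic, assuming 4 KiB pages and 512-byte sectors so PAGE_SECTORS is 8 (illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned page_sectors = 4096 / 512;	/* PAGE_SECTORS on 4 KiB pages */
	unsigned replicas_reserved = 2;

	/* 2 reserved replicas -> 16 sectors of on-disk reservation */
	printf("page reservation: %u sectors\n",
	       replicas_reserved * page_sectors);
	return 0;
}
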
static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
@ -524,8 +524,10 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i
{
struct bch_page_state s;
EBUG_ON(!PageLocked(page));
s = page_state_cmpxchg(page_state(page), s, {
s.reserved = 0;
s.replicas_reserved = 0;
s.quota_reserved = 0;
});
@ -535,62 +537,46 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i
static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
struct page *page, bool check_enospc)
{
struct bch_page_state *s = page_state(page), new, old;
struct bch_page_state *s = page_state(page), new;
/* XXX: this should not be open coded */
unsigned nr_replicas = inode->ei_inode.bi_data_replicas
? inode->ei_inode.bi_data_replicas - 1
: c->opts.data_replicas;
struct disk_reservation disk_res = bch2_disk_reservation_init(c,
nr_replicas);
struct disk_reservation disk_res;
struct quota_res quota_res = { 0 };
int ret = 0;
int ret;
/*
* XXX: this could likely be quite a bit simpler, page reservations
* _should_ only be manipulated with page locked:
*/
EBUG_ON(!PageLocked(page));
old = page_state_cmpxchg(s, new, {
if (new.reserved
? (new.reservation_replicas < disk_res.nr_replicas)
: (new.sectors < PAGE_SECTORS ||
new.nr_replicas < disk_res.nr_replicas ||
new.compressed)) {
int sectors = (disk_res.nr_replicas * PAGE_SECTORS -
page_res_sectors(new) -
disk_res.sectors);
if (s->replicas_reserved < nr_replicas) {
ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS,
nr_replicas - s->replicas_reserved,
!check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0);
if (unlikely(ret))
return ret;
if (sectors > 0) {
ret = bch2_disk_reservation_add(c, &disk_res, sectors,
!check_enospc
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (unlikely(ret))
goto err;
}
page_state_cmpxchg(s, new, ({
BUG_ON(new.replicas_reserved +
disk_res.nr_replicas != nr_replicas);
new.replicas_reserved += disk_res.nr_replicas;
}));
}
new.reserved = 1;
new.reservation_replicas = disk_res.nr_replicas;
}
if (!new.quota_reserved &&
new.sectors + new.dirty_sectors < PAGE_SECTORS) {
ret = bch2_quota_reservation_add(c, inode, &quota_res,
PAGE_SECTORS - quota_res.sectors,
check_enospc);
if (unlikely(ret))
goto err;
if (!s->quota_reserved &&
s->sectors + s->dirty_sectors < PAGE_SECTORS) {
ret = bch2_quota_reservation_add(c, inode, &quota_res,
PAGE_SECTORS,
check_enospc);
if (unlikely(ret))
return ret;
page_state_cmpxchg(s, new, ({
BUG_ON(new.quota_reserved);
new.quota_reserved = 1;
}
});
}));
}
quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS;
disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old);
err:
bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
return ret;
}
@ -600,6 +586,8 @@ static void bch2_clear_page_bits(struct page *page)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state s;
EBUG_ON(!PageLocked(page));
if (!PagePrivate(page))
return;
@ -710,6 +698,9 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
{
int ret;
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@ -856,10 +847,13 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
bool compressed = bch2_extent_is_compressed(k);
unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k);
unsigned nr_ptrs = !bch2_extent_is_compressed(k)
? bch2_extent_nr_dirty_ptrs(k)
: 0;
bio_for_each_segment(bv, bio, iter) {
/* brand new pages, don't need to be locked: */
struct bch_page_state *s = page_state(bv.bv_page);
/* sectors in @k from the start of this page: */
@ -867,14 +861,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
s->nr_replicas = !s->sectors
? nr_ptrs
: min_t(unsigned, s->nr_replicas, nr_ptrs);
s->nr_replicas = page_sectors == PAGE_SECTORS
? nr_ptrs : 0;
BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
s->sectors += page_sectors;
s->compressed |= compressed;
}
}
@ -1214,7 +1205,7 @@ static int __bch2_writepage(struct page *page,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_writepage_state *w = data;
struct bch_page_state new, old;
unsigned offset;
unsigned offset, nr_replicas_this_write;
loff_t i_size = i_size_read(&inode->v);
pgoff_t end_index = i_size >> PAGE_SHIFT;
@ -1240,19 +1231,31 @@ static int __bch2_writepage(struct page *page,
*/
zero_user_segment(page, offset, PAGE_SIZE);
do_io:
EBUG_ON(!PageLocked(page));
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
EBUG_ON(!new.reserved &&
(new.sectors != PAGE_SECTORS ||
new.compressed));
/*
* If we didn't get a reservation, we can only write out the
* number of (fully allocated) replicas that currently exist,
* and only if the entire page has been written:
*/
nr_replicas_this_write =
max_t(unsigned,
new.replicas_reserved,
(new.sectors == PAGE_SECTORS
? new.nr_replicas : 0));
if (new.reserved)
new.nr_replicas = new.reservation_replicas;
new.reserved = 0;
BUG_ON(!nr_replicas_this_write);
new.compressed |= w->opts.compression != 0;
new.nr_replicas = w->opts.compression
? 0
: nr_replicas_this_write;
new.replicas_reserved = 0;
new.sectors += new.dirty_sectors;
BUG_ON(new.sectors != PAGE_SECTORS);
new.dirty_sectors = 0;
});
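
The computation above restated as a standalone helper: writeback may rely on a reservation it holds, and on already existing replicas only when the whole page is allocated (the helper and its parameter names are illustrative):

static unsigned replicas_this_write(unsigned replicas_reserved,
				    unsigned sectors_allocated,
				    unsigned page_sectors,
				    unsigned nr_existing_replicas)
{
	unsigned n = replicas_reserved;

	/* without a reservation we can only rely on existing replicas,
	 * and only if the whole page is already allocated: */
	if (sectors_allocated == page_sectors && nr_existing_replicas > n)
		n = nr_existing_replicas;

	return n;
}
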
@ -1261,21 +1264,20 @@ do_io:
unlock_page(page);
if (w->io &&
(w->io->op.op.res.nr_replicas != new.nr_replicas ||
(w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
!bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
bch2_writepage_do_io(w);
if (!w->io)
bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas);
bch2_writepage_io_alloc(c, w, inode, page,
nr_replicas_this_write);
w->io->new_sectors += new.sectors - old.sectors;
BUG_ON(inode != w->io->op.inode);
BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
if (old.reserved)
w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS;
w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS;
w->io->op.new_i_size = i_size;
if (wbc->sync_mode == WB_SYNC_ALL)
@ -2547,10 +2549,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
&disk_res, &quota_res,
iter, &reservation.k_i,
0, true, true, NULL);
btree_iter_err:
bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
if (ret == -EINTR)
ret = 0;
if (ret)
@ -2612,6 +2613,8 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
static bool page_is_data(struct page *page)
{
EBUG_ON(!PageLocked(page));
/* XXX: should only have to check PageDirty */
return PagePrivate(page) &&
(page_state(page)->sectors ||


@ -15,6 +15,7 @@
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@ -302,6 +303,7 @@ static void __bch2_write_index(struct bch_write_op *op)
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n, *k;
unsigned dev;
int ret;
for (src = keys->keys; src != keys->top; src = n) {
@ -345,6 +347,10 @@ static void __bch2_write_index(struct bch_write_op *op)
}
}
out:
/* If a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
bch2_open_bucket_write_error(c, &op->open_buckets, dev);
bch2_open_buckets_put(c, &op->open_buckets);
return;
err:
@ -421,7 +427,8 @@ static void init_append_extent(struct bch_write_op *op,
static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
struct write_point *wp,
struct bio *src,
bool *page_alloc_failed)
bool *page_alloc_failed,
void *buf)
{
struct bch_write_bio *wbio;
struct bio *bio;
@ -431,11 +438,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
wbio = wbio_init(bio);
wbio->bounce = true;
wbio->put_bio = true;
/* copy WRITE_SYNC flag */
wbio->bio.bi_opf = src->bi_opf;
if (buf) {
bio->bi_iter.bi_size = output_available;
bch2_bio_map(bio, buf);
return bio;
}
wbio->bounce = true;
/*
* We can't use mempool for more than c->sb.encoded_extent_max
* worth of pages, but we'd like to allocate more if we can:
@ -600,14 +614,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
struct bio *src = &op->wbio.bio, *dst = src;
struct bvec_iter saved_iter;
struct bkey_i *key_to_write;
void *ec_buf;
unsigned key_to_write_offset = op->insert_keys.top_p -
op->insert_keys.keys_p;
unsigned total_output = 0;
bool bounce = false, page_alloc_failed = false;
unsigned total_output = 0, total_input = 0;
bool bounce = false;
bool page_alloc_failed = false;
int ret, more = 0;
BUG_ON(!bio_sectors(src));
ec_buf = bch2_writepoint_ec_buf(c, wp);
switch (bch2_write_prep_encoded_data(op, wp)) {
case PREP_ENCODED_OK:
break;
@ -617,16 +635,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
case PREP_ENCODED_CHECKSUM_ERR:
goto csum_err;
case PREP_ENCODED_DO_WRITE:
if (ec_buf) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bio_copy_data(dst, src);
bounce = true;
}
init_append_extent(op, wp, op->version, op->crc);
goto do_write;
}
if (op->compression_type ||
if (ec_buf ||
op->compression_type ||
(op->csum_type &&
!(op->flags & BCH_WRITE_PAGES_STABLE)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
!(op->flags & BCH_WRITE_PAGES_OWNED))) {
dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
@ -729,7 +757,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
total_output += dst_len;
total_output += dst_len;
total_input += src_len;
} while (dst->bi_iter.bi_size &&
src->bi_iter.bi_size &&
wp->sectors_free &&
@ -742,16 +771,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
dst->bi_iter = saved_iter;
if (!bounce && more) {
dst = bio_split(src, total_output >> 9,
if (dst == src && more) {
BUG_ON(total_output != total_input);
dst = bio_split(src, total_input >> 9,
GFP_NOIO, &c->bio_write);
wbio_init(dst)->put_bio = true;
wbio_init(dst)->put_bio = true;
/* copy WRITE_SYNC flag */
dst->bi_opf = src->bi_opf;
}
dst->bi_iter.bi_size = total_output;
/* Free unneeded pages after compressing: */
if (bounce)
if (to_wbio(dst)->bounce)
while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
&c->bio_bounce_pages);
@ -760,6 +793,10 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
bch2_ec_add_backpointer(c, wp,
bkey_start_pos(&key_to_write->k),
total_input >> 9);
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@ -774,10 +811,10 @@ csum_err:
"rewriting existing data (memory corruption?)");
ret = -EIO;
err:
if (bounce) {
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
if (to_wbio(dst)->put_bio)
bio_put(dst);
}
return ret;
}
@ -789,6 +826,8 @@ static void __bch2_write(struct closure *cl)
struct write_point *wp;
int ret;
again:
memset(&op->failed, 0, sizeof(op->failed));
do {
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
@ -803,6 +842,7 @@ again:
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code,
op->write_point,
&op->devs_have,
op->nr_replicas,
@ -882,8 +922,6 @@ void bch2_write(struct closure *cl)
op->start_time = local_clock();
memset(&op->failed, 0, sizeof(op->failed));
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(&op->wbio.bio)->put_bio = false;
@ -1557,8 +1595,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
if (!pick_ret)
goto hole;
if (pick_ret < 0)
goto no_device;
if (pick_ret < 0) {
__bcache_io_error(c, "no device to read from");
goto err;
}
if (pick_ret > 0)
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
@ -1683,31 +1723,46 @@ noclone:
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
if (!rbio->have_ioref)
goto no_device_postclone;
percpu_down_read_preempt_disable(&c->usage_lock);
bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
percpu_up_read_preempt_enable(&c->usage_lock);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
}
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (likely(!(flags & BCH_READ_IN_RETRY))) {
if (!(flags & BCH_READ_LAST_FRAGMENT)) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
if (!rbio->pick.idx) {
if (!rbio->have_ioref) {
__bcache_io_error(c, "no device to read from");
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
submit_bio(&rbio->bio);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (likely(!(flags & BCH_READ_IN_RETRY)))
submit_bio(&rbio->bio);
else
submit_bio_wait(&rbio->bio);
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
if (likely(!(flags & BCH_READ_IN_RETRY)))
bio_endio(&rbio->bio);
}
out:
if (likely(!(flags & BCH_READ_IN_RETRY))) {
return 0;
} else {
int ret;
submit_bio_wait(&rbio->bio);
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
@ -1722,22 +1777,12 @@ noclone:
return ret;
}
no_device_postclone:
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
bch2_rbio_free(rbio);
no_device:
__bcache_io_error(c, "no device to read from");
if (likely(!(flags & BCH_READ_IN_RETRY))) {
orig->bio.bi_status = BLK_STS_IOERR;
if (flags & BCH_READ_LAST_FRAGMENT)
bch2_rbio_done(orig);
return 0;
} else {
err:
if (flags & BCH_READ_IN_RETRY)
return READ_ERR;
}
orig->bio.bi_status = BLK_STS_IOERR;
goto out_read_done;
hole:
/*
@ -1749,7 +1794,7 @@ hole:
orig->hole = true;
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
if (flags & BCH_READ_LAST_FRAGMENT)
bch2_rbio_done(orig);
return 0;


@ -134,6 +134,8 @@ static enum {
c->opts.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
bkey_extent_init(&buf->key);
/*
* We have to set last_seq here, _before_ opening a new journal entry:
*
@ -334,15 +336,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret;
retry:
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret)
return ret;
if (journal_res_get_fast(j, res))
return 0;
spin_lock(&j->lock);
/*
@ -350,10 +351,9 @@ retry:
* that just did journal_entry_open() and call journal_entry_close()
* unnecessarily
*/
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret) {
if (journal_res_get_fast(j, res)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
/*
@ -376,7 +376,12 @@ retry:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
/*
* haven't finished writing out the previous entry, can't start
* another yet:
* signal to caller which sequence number we're trying to open:
*/
res->seq = journal_cur_seq(j) + 1;
spin_unlock(&j->lock);
trace_journal_entry_full(c);
goto blocked;
@ -388,6 +393,8 @@ retry:
/* We now have a new, closed journal buf - see if we can open it: */
ret = journal_entry_open(j);
if (!ret)
res->seq = journal_cur_seq(j);
spin_unlock(&j->lock);
if (ret < 0)
@ -407,7 +414,7 @@ retry:
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return 0;
return -EAGAIN;
}
/*
@ -421,14 +428,14 @@ blocked:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
int ret;
wait_event(j->wait,
(ret = __journal_res_get(j, res, u64s_min,
u64s_max)));
return ret < 0 ? ret : 0;
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
u64 bch2_journal_last_unwritten_seq(struct journal *j)
@ -452,28 +459,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
* btree root - every journal entry contains the roots of all the btrees, so it
* doesn't need to bother with getting a journal reservation
*/
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
{
int ret;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool need_reclaim = false;
retry:
spin_lock(&j->lock);
BUG_ON(seq > journal_cur_seq(j));
if (seq < journal_cur_seq(j) ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
if (journal_cur_seq(j) < seq) {
switch (journal_buf_switch(j, false)) {
case JOURNAL_ENTRY_ERROR:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
trace_journal_entry_full(c);
goto blocked;
case JOURNAL_ENTRY_CLOSED:
break;
case JOURNAL_UNLOCKED:
goto retry;
}
}
BUG_ON(journal_cur_seq(j) < seq);
if (!journal_entry_open(j)) {
need_reclaim = true;
goto blocked;
}
ret = journal_entry_open(j);
if (!ret)
closure_wait(&j->async_wait, parent);
spin_unlock(&j->lock);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return 0;
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return ret;
closure_wait(&j->async_wait, cl);
spin_unlock(&j->lock);
if (need_reclaim)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return -EAGAIN;
}
static int journal_seq_error(struct journal *j, u64 seq)
@ -593,11 +627,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
void bch2_journal_meta_async(struct journal *j, struct closure *parent)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
memset(&res, 0, sizeof(res));
bch2_journal_res_get(j, &res, u64s, u64s);
bch2_journal_res_get(j, &res, jset_u64s(0), 0);
bch2_journal_res_put(j, &res);
bch2_journal_flush_seq_async(j, res.seq, parent);
@ -606,12 +639,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
int ret;
memset(&res, 0, sizeof(res));
ret = bch2_journal_res_get(j, &res, u64s, u64s);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
@ -751,9 +783,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
new_fs
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
0);
if (c) {
spin_unlock(&c->journal.lock);
@ -861,10 +891,6 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
{
spin_lock(&j->lock);
bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx);
spin_unlock(&j->lock);
wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
}
@ -1000,8 +1026,6 @@ int bch2_fs_journal_init(struct journal *j)
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
bkey_extent_init(&j->key);
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);


@ -269,12 +269,10 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned, unsigned);
unsigned);
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
unsigned u64s_min,
unsigned u64s_max)
struct journal_res *res)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
@ -286,37 +284,37 @@ static inline int journal_res_get_fast(struct journal *j,
* Check if there is still room in the current journal
* entry:
*/
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
res->offset = old.cur_entry_offset;
res->u64s = min(u64s_max, j->cur_entry_u64s -
old.cur_entry_offset);
journal_state_inc(&new);
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
res->ref = true;
res->idx = new.idx;
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
res->ref = true;
res->idx = old.idx;
res->offset = old.cur_entry_offset;
res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
return 1;
}
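
journal_res_get_fast() claims journal space with a single cmpxchg on the packed reservation state, and with this patch the caller supplies an exact res->u64s rather than a min/max range. A reduced standalone sketch of the same claim loop over one atomic offset (illustrative only, C11 atomics):

#include <stdatomic.h>
#include <stdbool.h>

static bool res_get_fast(_Atomic unsigned *cur_offset,
			 unsigned entry_u64s, unsigned u64s,
			 unsigned *res_offset)
{
	unsigned old = atomic_load(cur_offset), new;

	do {
		new = old + u64s;
		if (new > entry_u64s)
			return false;	/* no room left in the open entry */
	} while (!atomic_compare_exchange_weak(cur_offset, &old, new));

	*res_offset = old;	/* caller writes its u64s at this offset */
	return true;
}
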
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned u64s, unsigned flags)
{
int ret;
EBUG_ON(res->ref);
EBUG_ON(u64s_max < u64s_min);
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
res->u64s = u64s;
if (journal_res_get_fast(j, res))
goto out;
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
ret = bch2_journal_res_get_slowpath(j, res, flags);
if (ret)
return ret;
out:

View File

@ -426,7 +426,7 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
static int journal_read_bucket(struct bch_dev *ca,
struct journal_read_buf *buf,
struct journal_list *jlist,
unsigned bucket, u64 *seq, bool *entries_found)
unsigned bucket)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@ -511,7 +511,6 @@ reread:
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
break;
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
@ -519,9 +518,6 @@ reread:
return ret;
}
if (le64_to_cpu(j->seq) > *seq)
*seq = le64_to_cpu(j->seq);
sectors = vstruct_sectors(j, c->block_bits);
next_block:
pr_debug("next");
@ -535,120 +531,51 @@ next_block:
static void bch2_journal_read_device(struct closure *cl)
{
#define read_bucket(b) \
({ \
bool entries_found = false; \
ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
&entries_found); \
if (ret) \
goto err; \
__set_bit(b, bitmap); \
entries_found; \
})
struct journal_device *ja =
container_of(cl, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
struct journal_read_buf buf = { NULL, 0 };
DECLARE_BITMAP(bitmap, ja->nr);
unsigned i, l, r;
u64 seq = 0;
u64 min_seq = U64_MAX;
unsigned i;
int ret;
if (!ja->nr)
goto out;
bitmap_zero(bitmap, ja->nr);
ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
if (ret)
goto err;
pr_debug("%u journal buckets", ja->nr);
/*
* If the device supports discard but not secure discard, we can't do
* the fancy fibonacci hash/binary search because the live journal
* entries might not form a contiguous range:
*/
for (i = 0; i < ja->nr; i++)
read_bucket(i);
goto search_done;
if (!blk_queue_nonrot(q))
goto linear_scan;
/*
* Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries
*/
for (i = 0; i < ja->nr; i++) {
l = (i * 2654435769U) % ja->nr;
ret = journal_read_bucket(ca, &buf, jlist, i);
if (ret)
goto err;
}
if (test_bit(l, bitmap))
break;
/* Find the journal bucket with the highest sequence number: */
for (i = 0; i < ja->nr; i++) {
if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
ja->cur_idx = i;
if (read_bucket(l))
goto bsearch;
min_seq = min(ja->bucket_seq[i], min_seq);
}
/*
* If that fails, check all the buckets we haven't checked
* already
*/
pr_debug("falling back to linear search");
linear_scan:
for (l = find_first_zero_bit(bitmap, ja->nr);
l < ja->nr;
l = find_next_zero_bit(bitmap, ja->nr, l + 1))
if (read_bucket(l))
goto bsearch;
/* no journal entries on this device? */
if (l == ja->nr)
goto out;
bsearch:
/* Binary search */
r = find_next_bit(bitmap, ja->nr, l + 1);
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
unsigned m = (l + r) >> 1;
u64 cur_seq = seq;
read_bucket(m);
if (cur_seq != seq)
l = m;
else
r = m;
}
search_done:
/*
* Find the journal bucket with the highest sequence number:
*
* If there's duplicate journal entries in multiple buckets (which
* definitely isn't supposed to happen, but...) - make sure to start
* cur_idx at the last of those buckets, so we don't deadlock trying to
* allocate
*/
seq = 0;
while (ja->bucket_seq[ja->cur_idx] > min_seq &&
ja->bucket_seq[ja->cur_idx] >
ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
ja->cur_idx++;
for (i = 0; i < ja->nr; i++)
if (ja->bucket_seq[i] >= seq &&
ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
/*
* When journal_next_bucket() goes to allocate for
* the first time, it'll use the bucket after
* ja->cur_idx
*/
ja->cur_idx = i;
seq = ja->bucket_seq[i];
}
ja->sectors_free = 0;
/*
* Set last_idx to indicate the entire journal is full and needs to be
@ -656,17 +583,6 @@ search_done:
* pinned when it first runs:
*/
ja->last_idx = (ja->cur_idx + 1) % ja->nr;
/*
* Read buckets in reverse order until we stop finding more journal
* entries:
*/
for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
i != ja->cur_idx;
i = (i + ja->nr - 1) % ja->nr)
if (!test_bit(i, bitmap) &&
!read_bucket(i))
break;
out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
@ -677,7 +593,6 @@ err:
jlist->ret = ret;
mutex_unlock(&jlist->lock);
goto out;
#undef read_bucket
}
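
The refactor above drops the golden-ratio hash, binary search and on-stack bitmap in favour of simply reading every journal bucket, recording the highest sequence number seen in each, and resuming writes after the newest one. A rough standalone illustration of that selection step, with invented names and fixed example data rather than the real on-disk logic:

	#include <stdio.h>

	/* Return the index of the bucket holding the newest journal entry. */
	static unsigned pick_cur_idx(const unsigned long long *bucket_seq, unsigned nr)
	{
		unsigned i, cur = 0;

		for (i = 0; i < nr; i++)
			if (bucket_seq[i] > bucket_seq[cur])
				cur = i;
		return cur;
	}

	int main(void)
	{
		unsigned long long seq[] = { 40, 41, 42, 37, 38, 39 };

		/* newest entry is in bucket 2, so writing resumes after it */
		printf("cur_idx = %u\n", pick_cur_idx(seq, 6));
		return 0;
	}
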
void bch2_journal_entries_free(struct list_head *list)
@ -865,7 +780,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
int ret = 0;
list_for_each_entry_safe(i, n, list, list) {
j->replay_journal_seq = le64_to_cpu(i->j.seq);
for_each_jset_key(k, _n, entry, &i->j) {
@ -875,7 +789,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
* allocation code handles replay for
* BTREE_ID_ALLOC keys:
*/
ret = bch2_alloc_replay_key(c, k->k.p);
ret = bch2_alloc_replay_key(c, k);
} else {
/*
* We might cause compressed extents to be
@ -886,9 +800,9 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
bch2_disk_reservation_init(c, 0);
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
}
if (ret) {
@ -932,32 +846,18 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
}
static unsigned journal_dev_buckets_available(struct journal *j,
struct bch_dev *ca)
struct journal_device *ja)
{
struct journal_device *ja = &ca->journal;
unsigned next = (ja->cur_idx + 1) % ja->nr;
unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
/*
* Hack to avoid a deadlock during journal replay:
* journal replay might require setting a new btree
* root, which requires writing another journal entry -
* thus, if the journal is full (and this happens when
* replaying the first journal bucket's entries) we're
* screwed.
*
* So don't let the journal fill up unless we're in
* replay:
*/
if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
available = max((int) available - 2, 0);
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
available = max((int) available - 1, 0);
if (available &&
journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
--available;
return available;
}
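
For the arithmetic in journal_dev_buckets_available(): the journal buckets form a ring, with cur_idx the bucket currently being written and last_idx the oldest bucket still pinned by unreclaimed entries, so the free-bucket count is the distance from the slot after cur_idx to last_idx, modulo nr. A tiny sketch with made-up numbers (not the kernel code):

	#include <stdio.h>

	static unsigned buckets_available(unsigned nr, unsigned cur_idx,
					  unsigned last_idx)
	{
		unsigned next = (cur_idx + 1) % nr;

		return (last_idx + nr - next) % nr;
	}

	int main(void)
	{
		/* 8 buckets, currently writing bucket 5, reclaim has advanced to bucket 2 */
		printf("%u buckets available\n", buckets_available(8, 5, 2));	/* prints 4 */
		return 0;
	}
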
@ -967,7 +867,6 @@ int bch2_journal_entry_sectors(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
unsigned sectors_available = UINT_MAX;
unsigned i, nr_online = 0, nr_devs = 0;
@ -977,38 +876,39 @@ int bch2_journal_entry_sectors(struct journal *j)
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_required = 0;
unsigned buckets_this_device, sectors_this_device;
if (!ja->nr)
continue;
sectors_available = min_t(unsigned, sectors_available,
ca->mi.bucket_size);
buckets_this_device = journal_dev_buckets_available(j, ja);
sectors_this_device = ja->sectors_free;
nr_online++;
/*
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, if we haven't started the write
* for the previous entry we have to make sure we have space for
* it too:
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
if (bch2_extent_has_device(e.c, ca->dev_idx)) {
if (j->prev_buf_sectors > ja->sectors_free)
buckets_required++;
if (j->prev_buf_sectors >= sectors_this_device) {
if (!buckets_this_device)
continue;
if (j->prev_buf_sectors + sectors_available >
ja->sectors_free)
buckets_required++;
} else {
if (j->prev_buf_sectors + sectors_available >
ca->mi.bucket_size)
buckets_required++;
buckets_required++;
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
if (journal_dev_buckets_available(j, ca) >= buckets_required)
nr_devs++;
nr_online++;
sectors_this_device -= j->prev_buf_sectors;
if (buckets_this_device)
sectors_this_device = ca->mi.bucket_size;
if (!sectors_this_device)
continue;
sectors_available = min(sectors_available,
sectors_this_device);
nr_devs++;
}
rcu_read_unlock();
@ -1021,6 +921,61 @@ int bch2_journal_entry_sectors(struct journal *j)
return sectors_available;
}
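
As a rough model of the per-device loop above: start from the space left in the device's current journal bucket, fall back to a fresh bucket when the still-unwritten previous entry wouldn't fit, subtract that previous entry, and let any remaining free bucket raise the contribution to a full bucket; the entry size is then the minimum over all devices. The sketch below uses invented types and a single device, so treat it as an illustration of the accounting rather than the real function:

	#include <stdio.h>

	struct fake_jdev {
		unsigned bucket_size;
		unsigned sectors_free;		/* left in the current bucket */
		unsigned buckets_available;	/* free buckets after this one */
	};

	static unsigned dev_sectors(const struct fake_jdev *d, unsigned prev_buf_sectors)
	{
		unsigned buckets = d->buckets_available;
		unsigned sectors = d->sectors_free;

		/* previous (unwritten) entry doesn't fit: it takes a fresh bucket */
		if (prev_buf_sectors >= sectors) {
			if (!buckets)
				return 0;
			buckets--;
			sectors = d->bucket_size;
		}

		sectors -= prev_buf_sectors;

		/* another free bucket means the new entry can get a whole bucket */
		if (buckets)
			sectors = d->bucket_size;

		return sectors;
	}

	int main(void)
	{
		struct fake_jdev d = { .bucket_size = 256, .sectors_free = 32,
				       .buckets_available = 3 };

		/* previous entry of 64 sectors doesn't fit in the current bucket */
		printf("%u sectors usable\n", dev_sectors(&d, 64));	/* prints 256 */
		return 0;
	}
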
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
struct dev_alloc_list *devs_sorted,
unsigned sectors,
unsigned *replicas,
unsigned replicas_want)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_i_extent *e = bkey_i_to_extent(&w->key);
struct journal_device *ja;
struct bch_dev *ca;
unsigned i;
if (*replicas >= replicas_want)
return;
for (i = 0; i < devs_sorted->nr; i++) {
ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
if (!ca)
continue;
ja = &ca->journal;
/*
* Check that we can use this device, and aren't already using
* it:
*/
if (!ca->mi.durability ||
ca->mi.state != BCH_MEMBER_STATE_RW ||
!ja->nr ||
bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) ||
sectors > ja->sectors_free)
continue;
bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
extent_ptr_append(e,
(struct bch_extent_ptr) {
.offset = bucket_to_sector(ca,
ja->buckets[ja->cur_idx]) +
ca->mi.bucket_size -
ja->sectors_free,
.dev = ca->dev_idx,
});
ja->sectors_free -= sectors;
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
*replicas += ca->mi.durability;
if (*replicas >= replicas_want)
break;
}
}
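
The helper above walks candidate devices in stripe order and appends a pointer per usable device until enough durability has been accumulated. A hedged, self-contained approximation of that selection loop (fake device array, no extent keys, invented names):

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_dev {
		unsigned durability;
		unsigned sectors_free;
		bool	 already_used;
	};

	static unsigned pick_replicas(struct fake_dev *devs, unsigned nr,
				      unsigned sectors, unsigned replicas_want)
	{
		unsigned i, replicas = 0;

		for (i = 0; i < nr && replicas < replicas_want; i++) {
			struct fake_dev *d = &devs[i];

			/* skip devices we can't use or are already writing to */
			if (!d->durability || d->already_used ||
			    sectors > d->sectors_free)
				continue;

			d->already_used = true;
			d->sectors_free -= sectors;
			replicas += d->durability;
		}
		return replicas;
	}

	int main(void)
	{
		struct fake_dev devs[] = {
			{ .durability = 1, .sectors_free = 128 },
			{ .durability = 0, .sectors_free = 512 },	/* skipped */
			{ .durability = 1, .sectors_free = 512 },
		};

		printf("got %u replicas\n", pick_replicas(devs, 3, 64, 2));	/* prints 2 */
		return 0;
	}
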
/**
* journal_next_bucket - move on to the next journal bucket if possible
*/
@ -1028,100 +983,49 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned i, replicas, replicas_want =
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
e = bkey_i_to_s_extent(&j->key);
/*
* Drop any pointers to devices that have been removed, are no longer
* empty, or filled up their current journal bucket:
*
* Note that a device may have had a small amount of free space (perhaps
* one sector) that wasn't enough for the smallest possible journal
* entry - that's why we drop pointers to devices <= current free space,
* i.e. whichever device was limiting the current journal entry size.
*/
bch2_extent_drop_ptrs(e, ptr, ({
ca = bch_dev_bkey_exists(c, ptr->dev);
ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors;
}));
extent_for_each_ptr(e, ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors);
ca->journal.sectors_free -= sectors;
}
replicas = bch2_extent_nr_ptrs(e.c);
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, &j->wp,
&c->rw_devs[BCH_DATA_JOURNAL]);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_JOURNAL]);
spin_lock(&j->lock);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
if (replicas >= replicas_want)
goto done;
for (i = 0; i < devs_sorted.nr; i++) {
ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
if (!ca)
continue;
if (!ca->mi.durability)
continue;
ja = &ca->journal;
if (!ja->nr)
continue;
if (replicas >= replicas_want)
break;
/*
* Check that we can use this device, and aren't already using
* it:
*/
if (bch2_extent_has_device(e.c, ca->dev_idx) ||
!journal_dev_buckets_available(j, ca) ||
sectors > ca->mi.bucket_size)
continue;
j->wp.next_alloc[ca->dev_idx] += U32_MAX;
bch2_wp_rescale(c, ca, &j->wp);
ja->sectors_free = ca->mi.bucket_size - sectors;
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
extent_ptr_append(bkey_i_to_extent(&j->key),
(struct bch_extent_ptr) {
.offset = bucket_to_sector(ca,
ja->buckets[ja->cur_idx]),
.dev = ca->dev_idx,
});
replicas += ca->mi.durability;
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
journal_dev_buckets_available(j, ja)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
}
}
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
done:
if (replicas >= replicas_want)
j->prev_buf_sectors = 0;
spin_unlock(&j->lock);
rcu_read_unlock();
j->prev_buf_sectors = 0;
bkey_copy(&w->key, &j->key);
spin_unlock(&j->lock);
if (replicas < c->opts.metadata_replicas_required)
return -EROFS;
BUG_ON(!replicas);
return 0;
return replicas >= replicas_want ? 0 : -EROFS;
}
static void journal_write_compact(struct jset *jset)
@ -1376,9 +1280,6 @@ void bch2_journal_write(struct closure *cl)
}
no_io:
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
ptr->offset += sectors;
bch2_bucket_seq_cleanup(c);
continue_at(cl, journal_write_done, system_highpri_wq);

View File

@ -125,7 +125,8 @@ void bch2_journal_reclaim_fast(struct journal *j)
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;

View File

@ -184,7 +184,6 @@ struct journal {
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
BKEY_PADDED(key);
struct write_point wp;
spinlock_t err_lock;

View File

@ -278,11 +278,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
case Opt_background_compression:
ret = bch2_check_set_has_compressed_data(c, v);
break;
case Opt_erasure_code:
if (v &&
!(c->sb.features & (1ULL << BCH_FEATURE_EC))) {
mutex_lock(&c->sb_lock);
c->disk_sb.sb->features[0] |=
cpu_to_le64(1ULL << BCH_FEATURE_EC);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
break;
}
return ret;
}
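
The new Opt_erasure_code case follows a check-and-set pattern: enabling an option that depends on an on-disk feature first sets the corresponding feature bit in the superblock and writes it out. A simplified illustration of just that pattern, with an invented feature constant and no locking or superblock I/O:

	#include <stdint.h>
	#include <stdio.h>

	#define FAKE_FEATURE_EC 4

	static uint64_t sb_features;

	static void maybe_set_feature(int enable, unsigned feature_bit)
	{
		if (enable && !(sb_features & (1ULL << feature_bit))) {
			sb_features |= 1ULL << feature_bit;
			/* the real code writes the superblock at this point */
		}
	}

	int main(void)
	{
		maybe_set_feature(1, FAKE_FEATURE_EC);
		printf("features: 0x%llx\n", (unsigned long long) sb_features);
		return 0;
	}
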
int bch2_opts_check_may_set(struct bch_fs *c)
{
unsigned i;
int ret;
for (i = 0; i < bch2_opts_nr; i++) {
ret = bch2_opt_check_may_set(c, i,
bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
return 0;
}
int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
{
char *opt, *name, *val;

View File

@ -110,6 +110,9 @@ enum opt_type {
BCH_OPT(promote_target, u16, OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0) \
BCH_OPT(erasure_code, u16, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false) \
BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, false) \
@ -266,6 +269,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
const struct bch_option *, u64, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
/* inode opts: */
@ -277,7 +281,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *);
BCH_INODE_OPT(data_replicas, 8) \
BCH_INODE_OPT(promote_target, 16) \
BCH_INODE_OPT(foreground_target, 16) \
BCH_INODE_OPT(background_target, 16)
BCH_INODE_OPT(background_target, 16) \
BCH_INODE_OPT(erasure_code, 16)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;

View File

@ -6,6 +6,7 @@
#include "btree_update_interior.h"
#include "btree_io.h"
#include "dirent.h"
#include "ec.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
@ -212,6 +213,11 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
err = "cannot allocate memory";
ret = bch2_fs_ec_start(c);
if (ret)
goto err;
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);

View File

@ -79,9 +79,33 @@ static void extent_to_replicas(struct bkey_s_c k,
r->nr_required = 1;
extent_for_each_ptr_decode(e, p, entry)
if (!p.ptr.cached)
r->devs[r->nr_devs++] = p.ptr.dev;
extent_for_each_ptr_decode(e, p, entry) {
if (p.ptr.cached)
continue;
if (p.ec_nr) {
r->nr_devs = 0;
break;
}
r->devs[r->nr_devs++] = p.ptr.dev;
}
}
}
static void stripe_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *r)
{
if (k.k->type == BCH_STRIPE) {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
r->devs[r->nr_devs++] = ptr->dev;
}
}
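
To make the new accounting concrete: every block of a stripe is a device the data lives on, but an erasure coded stripe with nr_blocks total blocks and nr_redundant parity blocks only needs nr_blocks - nr_redundant of them to reconstruct the data, which is what nr_required records. A minimal sketch with invented types (not the bcachefs structures):

	#include <stdio.h>

	struct fake_stripe {
		unsigned nr_blocks;
		unsigned nr_redundant;
		unsigned devs[8];
	};

	struct fake_replicas_entry {
		unsigned nr_required;
		unsigned nr_devs;
		unsigned devs[8];
	};

	static void fake_stripe_to_replicas(const struct fake_stripe *s,
					    struct fake_replicas_entry *r)
	{
		unsigned i;

		r->nr_required = s->nr_blocks - s->nr_redundant;
		r->nr_devs = 0;
		for (i = 0; i < s->nr_blocks; i++)
			r->devs[r->nr_devs++] = s->devs[i];
	}

	int main(void)
	{
		struct fake_stripe s = { 6, 2, { 0, 1, 2, 3, 4, 5 } };
		struct fake_replicas_entry r;

		fake_stripe_to_replicas(&s, &r);
		printf("%u devs, %u required\n", r.nr_devs, r.nr_required);	/* 6, 4 */
		return 0;
	}
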
@ -100,6 +124,10 @@ static void bkey_to_replicas(enum bkey_type type,
e->data_type = BCH_DATA_USER;
extent_to_replicas(k, e);
break;
case BKEY_TYPE_EC:
e->data_type = BCH_DATA_USER;
stripe_to_replicas(k, e);
break;
default:
break;
}

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io.h"
#include "journal.h"

View File

@ -19,6 +19,7 @@
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
@ -395,6 +396,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
bch2_fs_btree_cache_exit(c);
@ -403,7 +405,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
percpu_free_rwsem(&c->usage_lock);
free_percpu(c->usage_percpu);
free_percpu(c->usage[0]);
mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@ -576,6 +578,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
INIT_LIST_HEAD(&c->ec_new_stripe_list);
mutex_init(&c->ec_new_stripe_lock);
mutex_init(&c->ec_stripes_lock);
spin_lock_init(&c->ec_stripes_heap_lock);
seqcount_init(&c->gc_pos_lock);
c->copy_gc_enabled = 1;
@ -631,7 +638,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
!(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
percpu_init_rwsem(&c->usage_lock) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
@ -644,6 +651,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_fs_ec_init(c) ||
bch2_fs_fsio_init(c))
goto err;
@ -715,6 +723,10 @@ const char *bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
ret = bch2_opts_check_may_set(c);
if (ret)
goto err;
err = "dynamic fault";
if (bch2_fs_init_fault("fs_start"))
goto err;
@ -1054,8 +1066,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
return ret;
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
bch2_dev_sysfs_online(c, ca);
@ -1340,7 +1351,7 @@ static void dev_usage_clear(struct bch_dev *ca)
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
@ -1401,8 +1412,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
@ -1461,8 +1471,7 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(c, ca, 0);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

View File

@ -18,6 +18,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
#include "journal.h"
#include "keylist.h"
@ -187,6 +188,8 @@ sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_work);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
rw_attribute(pd_controllers_update_seconds);
read_attribute(meta_replicas_have);
@ -241,6 +244,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
pr_buf(&out, "\t%s:\t\t%llu\n",
bch2_data_types[type],
stats.replicas[replicas].data[type]);
pr_buf(&out, "\terasure coded:\t%llu\n",
stats.replicas[replicas].ec_data);
pr_buf(&out, "\treserved:\t%llu\n",
stats.replicas[replicas].persistent_reserved);
}
@ -309,6 +314,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
compressed_sectors_uncompressed << 9);
}
static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
{
char *out = buf, *end = buf + PAGE_SIZE;
struct ec_stripe_head *h;
struct ec_stripe_new *s;
mutex_lock(&c->ec_new_stripe_lock);
list_for_each_entry(h, &c->ec_new_stripe_list, list) {
out += scnprintf(out, end - out,
"target %u algo %u redundancy %u:\n",
h->target, h->algo, h->redundancy);
if (h->s)
out += scnprintf(out, end - out,
"\tpending: blocks %u allocated %u\n",
h->s->blocks.nr,
bitmap_weight(h->s->blocks_allocated,
h->s->blocks.nr));
mutex_lock(&h->lock);
list_for_each_entry(s, &h->stripes, list)
out += scnprintf(out, end - out,
"\tin flight: blocks %u allocated %u pin %u\n",
s->blocks.nr,
bitmap_weight(s->blocks_allocated,
s->blocks.nr),
atomic_read(&s->pin));
mutex_unlock(&h->lock);
}
mutex_unlock(&c->ec_new_stripe_lock);
return out - buf;
}
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -368,6 +408,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_compression_stats)
return bch2_compression_stats(c, buf);
if (attr == &sysfs_new_stripes)
return bch2_new_stripes(c, buf);
#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
@ -434,7 +477,7 @@ STORE(__bch2_fs)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
bch2_gc(c);
bch2_gc(c, NULL, false);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@ -536,6 +579,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
&sysfs_new_stripes,
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -764,6 +809,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
" meta: %llu\n"
" user: %llu\n"
" cached: %llu\n"
" erasure coded: %llu\n"
" available: %lli\n"
"sectors:\n"
" sb: %llu\n"
@ -787,6 +833,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
stats.buckets[BCH_DATA_BTREE],
stats.buckets[BCH_DATA_USER],
stats.buckets[BCH_DATA_CACHED],
stats.buckets_ec,
ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
stats.sectors[BCH_DATA_SB],
stats.sectors[BCH_DATA_JOURNAL],