Update bcachefs sources to f7ccf51390 bcachefs: durability

This commit is contained in:
Kent Overstreet 2018-03-13 03:22:35 -04:00
parent 2bb8cdfc75
commit 35d3f92ad5
13 changed files with 236 additions and 120 deletions

View File

@ -1 +1 @@
da224776eb43b7a47d8c7cd9314d5d1a8e97aabe
f7ccf513908be42581e41b48b8b078a441a6a804

View File

@ -1227,24 +1227,35 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
{
enum bucket_alloc_ret ret = NO_DEVICES;
struct dev_alloc_list devs_sorted;
unsigned i;
struct bch_dev *ca;
unsigned i, nr_ptrs_effective = 0;
bool have_cache_dev = false;
BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
if (wp->nr_ptrs >= nr_replicas)
for (i = wp->first_ptr; i < wp->nr_ptrs; i++) {
ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
nr_ptrs_effective += ca->mi.durability;
have_cache_dev |= !ca->mi.durability;
}
if (nr_ptrs_effective >= nr_replicas)
return ALLOC_SUCCESS;
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, wp, devs);
for (i = 0; i < devs_sorted.nr; i++) {
struct bch_dev *ca =
rcu_dereference(c->devs[devs_sorted.devs[i]]);
int ob;
ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
if (!ca)
continue;
if (have_cache_dev && !ca->mi.durability)
continue;
ob = bch2_bucket_alloc(c, ca, reserve,
wp->type == BCH_DATA_USER, cl);
if (ob < 0) {
@ -1256,13 +1267,17 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
BUG_ON(ob <= 0 || ob > U8_MAX);
BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
bch2_wp_rescale(c, ca, wp);
nr_ptrs_effective += ca->mi.durability;
have_cache_dev |= !ca->mi.durability;
__clear_bit(ca->dev_idx, devs->d);
if (wp->nr_ptrs == nr_replicas) {
if (nr_ptrs_effective >= nr_replicas) {
ret = ALLOC_SUCCESS;
break;
}
@ -1318,37 +1333,46 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
/* Sector allocator */
/*
 * Drop the open-bucket pointer at index @i from write point @wp.
 *
 * For user-data write points the bucket is parked on the device's
 * open_buckets_partial list (so remaining free space can be reused later)
 * and waiters on open buckets / the freelist are woken; otherwise the
 * bucket's reference is simply released.
 *
 * NOTE(review): assumes the caller holds whatever lock protects wp->ptrs
 * (wp->lock elsewhere in this patch) — confirm against callers.
 */
static void writepoint_drop_ptr(struct bch_fs *c,
struct write_point *wp,
unsigned i)
{
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
/* partial list is a fixed-size array; overflowing it would be a bug */
BUG_ON(ca->open_buckets_partial_nr >=
ARRAY_SIZE(ca->open_buckets_partial));
if (wp->type == BCH_DATA_USER) {
spin_lock(&c->freelist_lock);
ob->on_partial_list = true;
/* store the bucket's index, not the pointer itself */
ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
ob - c->open_buckets;
spin_unlock(&c->freelist_lock);
closure_wake_up(&c->open_buckets_wait);
closure_wake_up(&c->freelist_wait);
} else {
bch2_open_bucket_put(c, ob);
}
array_remove_item(wp->ptrs, wp->nr_ptrs, i);
/* removing an entry below first_ptr shifts the usable region down by one */
if (i < wp->first_ptr)
wp->first_ptr--;
}
static void writepoint_drop_ptrs(struct bch_fs *c,
struct write_point *wp,
u16 target, bool in_target,
unsigned nr_ptrs_dislike)
u16 target, bool in_target)
{
int i;
if (!nr_ptrs_dislike)
return;
for (i = wp->first_ptr - 1; i >= 0; --i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike &&
dev_in_target(ca, target) == in_target) {
BUG_ON(ca->open_buckets_partial_nr >=
ARRAY_SIZE(ca->open_buckets_partial));
spin_lock(&c->freelist_lock);
ob->on_partial_list = true;
ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
ob - c->open_buckets;
spin_unlock(&c->freelist_lock);
closure_wake_up(&c->open_buckets_wait);
closure_wake_up(&c->freelist_wait);
array_remove_item(wp->ptrs, wp->nr_ptrs, i);
--nr_ptrs_dislike;
}
if (dev_in_target(ca, target) == in_target)
writepoint_drop_ptr(c, wp, i);
}
}
@ -1358,7 +1382,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
struct open_bucket *ob;
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
writepoint_for_each_ptr_all(wp, ob, i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr));
@ -1378,14 +1402,11 @@ static int open_bucket_add_buckets(struct bch_fs *c,
struct open_bucket *ob;
unsigned i;
if (wp->nr_ptrs >= nr_replicas)
return 0;
/* Don't allocate from devices we already have pointers to: */
for (i = 0; i < devs_have->nr; i++)
__clear_bit(devs_have->devs[i], devs.d);
writepoint_for_each_ptr(wp, ob, i)
writepoint_for_each_ptr_all(wp, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
if (target) {
@ -1487,76 +1508,123 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
{
struct write_point *wp;
struct open_bucket *ob;
unsigned i, nr_ptrs_dislike = 0, nr_ptrs_have = 0;
int ret;
struct bch_dev *ca;
unsigned nr_ptrs_have, nr_ptrs_effective;
int ret, i, cache_idx = -1;
BUG_ON(!nr_replicas || !nr_replicas_required);
wp = writepoint_find(c, write_point.v);
/* does ob have ptrs we don't need? */
wp->first_ptr = 0;
/* does writepoint have ptrs we can't use? */
writepoint_for_each_ptr(wp, ob, i)
if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev))
nr_ptrs_have++;
else if (!dev_in_target(c->devs[ob->ptr.dev], target))
nr_ptrs_dislike++;
if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
nr_ptrs_have = wp->first_ptr;
/* does writepoint have ptrs we don't want to use? */
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
ret = open_bucket_add_buckets(c, target, wp, devs_have,
nr_replicas + nr_ptrs_have + nr_ptrs_dislike,
reserve, cl);
nr_replicas, reserve, cl);
if (ret && ret != -EROFS)
goto err;
if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
goto alloc_done;
if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
ret = open_bucket_add_buckets(c, target, wp, devs_have,
nr_replicas, reserve, cl);
} else {
ret = open_bucket_add_buckets(c, target, wp, devs_have,
nr_replicas, reserve, NULL);
if (!ret)
goto alloc_done;
ret = open_bucket_add_buckets(c, target, wp, devs_have,
nr_replicas + nr_ptrs_have,
reserve, cl);
if (ret && ret != -EROFS)
wp->first_ptr = nr_ptrs_have;
ret = open_bucket_add_buckets(c, 0, wp, devs_have,
nr_replicas, reserve, cl);
}
if (ret)
goto err;
alloc_done:
if (wp->nr_ptrs - nr_ptrs_have -
((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0)
< nr_replicas_required) {
ret = -EROFS;
goto err;
/* check for more than one cache: */
for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) {
ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
if (ca->mi.durability)
continue;
/*
* if we ended up with more than one cache device, prefer the
* one in the target we want:
*/
if (cache_idx >= 0) {
if (!dev_in_target(ca, target)) {
writepoint_drop_ptr(c, wp, i);
} else {
writepoint_drop_ptr(c, wp, cache_idx);
cache_idx = i;
}
} else {
cache_idx = i;
}
}
if ((int) wp->nr_ptrs - nr_ptrs_dislike < nr_replicas)
nr_ptrs_dislike = clamp_t(int, wp->nr_ptrs - nr_replicas,
0, nr_ptrs_dislike);
/* we might have more effective replicas than required: */
nr_ptrs_effective = 0;
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
nr_ptrs_effective += ca->mi.durability;
}
if (nr_ptrs_effective > nr_replicas) {
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (ca->mi.durability &&
ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
!dev_idx_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
nr_ptrs_effective -= ca->mi.durability;
}
}
}
if (nr_ptrs_effective > nr_replicas) {
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (ca->mi.durability &&
ca->mi.durability <= nr_ptrs_effective - nr_replicas) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
nr_ptrs_effective -= ca->mi.durability;
}
}
}
/* Remove pointers we don't want to use: */
writepoint_drop_ptrs(c, wp, target, false, nr_ptrs_dislike);
if (target)
writepoint_drop_ptrs(c, wp, target, false);
/*
* Move pointers to devices we already have to end of open bucket
* pointer list - note that removing pointers we don't want to use might
* have changed nr_ptrs_have:
*/
if (nr_ptrs_have) {
i = nr_ptrs_have = 0;
while (i < wp->nr_ptrs - nr_ptrs_have)
if (bch2_dev_list_has_dev(*devs_have, wp->ptrs[i]->ptr.dev)) {
nr_ptrs_have++;
swap(wp->ptrs[i], wp->ptrs[wp->nr_ptrs - nr_ptrs_have]);
} else {
i++;
}
}
wp->nr_ptrs_can_use =
min_t(unsigned, nr_replicas, wp->nr_ptrs - nr_ptrs_have);
BUG_ON(wp->nr_ptrs_can_use < nr_replicas_required ||
wp->nr_ptrs_can_use > wp->nr_ptrs);
BUG_ON(wp->first_ptr >= wp->nr_ptrs);
BUG_ON(nr_ptrs_effective < nr_replicas_required);
wp->sectors_free = UINT_MAX;
for (i = 0; i < wp->nr_ptrs_can_use; i++)
wp->sectors_free = min(wp->sectors_free,
wp->ptrs[i]->sectors_free);
writepoint_for_each_ptr(wp, ob, i)
wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
@ -1575,19 +1643,21 @@ err:
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
struct bkey_i_extent *e, unsigned sectors)
{
struct open_bucket *ob;
unsigned i;
BUG_ON(sectors > wp->sectors_free);
wp->sectors_free -= sectors;
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
writepoint_for_each_ptr(wp, ob, i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
tmp.cached = bkey_extent_is_cached(&e->k);
tmp.cached = bkey_extent_is_cached(&e->k) ||
(!ca->mi.durability && wp->type == BCH_DATA_USER);
tmp.offset += ca->mi.bucket_size - ob->sectors_free;
extent_ptr_append(e, tmp);
@ -1704,8 +1774,8 @@ static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX);
mutex_lock(&wp->lock);
writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx),
true, wp->nr_ptrs);
wp->first_ptr = wp->nr_ptrs;
writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true);
mutex_unlock(&wp->lock);
}

View File

@ -33,6 +33,17 @@ enum bucket_alloc_ret {
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
/*
 * Iterate over a write point's open-bucket pointers starting at index
 * @_start: on each pass @_i is the index and @_ob is wp->ptrs[_i].
 */
#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)			\
	for ((_i) = (_start);						\
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
	     (_i)++)

/* Iterate over every pointer, including those before first_ptr: */
#define writepoint_for_each_ptr_all(_wp, _ob, _i)			\
	__writepoint_for_each_ptr(_wp, _ob, _i, 0)

/*
 * Iterate only over the usable pointers (index >= first_ptr).
 *
 * Fix: use (_wp)->first_ptr, not wp->first_ptr — the original captured the
 * caller's variable by name and only compiled because every caller happened
 * to call its write point "wp" (unhygienic macro argument).
 */
#define writepoint_for_each_ptr(_wp, _ob, _i)				\
	__writepoint_for_each_ptr(_wp, _ob, _i, (_wp)->first_ptr)
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
@ -55,11 +66,10 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
struct write_point *wp,
u8 *nr, u8 *refs)
{
struct open_bucket *ob;
unsigned i;
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
writepoint_for_each_ptr(wp, ob, i) {
atomic_inc(&ob->pin);
refs[(*nr)++] = ob - c->open_buckets;
}
@ -88,11 +98,6 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_unlock();
}
#define writepoint_for_each_ptr(_wp, _ob, _i) \
for ((_i) = 0; \
(_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
(_i)++)
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
return (struct write_point_specifier) { .v = v | 1 };

View File

@ -65,11 +65,8 @@ struct write_point {
enum bch_data_type type;
u8 nr_ptrs;
/*
* number of pointers in @ob we can't use, because we already had
* pointers to those devices:
*/
u8 nr_ptrs_can_use;
u8 first_ptr;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;

View File

@ -829,6 +829,7 @@ LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
#define BCH_TIER_MAX 4U

View File

@ -201,17 +201,31 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
/*
 * Return how many replicas' worth of durability a single extent pointer
 * provides: 0 for cached pointers and for pointers to failed devices,
 * otherwise the device's configured durability.
 */
unsigned bch2_extent_ptr_durability(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
struct bch_dev *ca;
/* cached copies don't count towards durability */
if (ptr->cached)
return 0;
ca = bch_dev_bkey_exists(c, ptr->dev);
/* data on a failed device may already be lost */
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
return 0;
return ca->mi.durability;
}
unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
unsigned nr_ptrs = 0;
unsigned durability = 0;
extent_for_each_ptr(e, ptr)
nr_ptrs += (!ptr->cached &&
bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
BCH_MEMBER_STATE_FAILED);
durability += bch2_extent_ptr_durability(c, ptr);
return nr_ptrs;
return durability;
}
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
@ -2008,21 +2022,29 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
unsigned target)
{
struct bch_extent_ptr *ptr;
unsigned nr_cached = 0, nr_good = bch2_extent_nr_good_ptrs(c, e.c);
int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
if (nr_good <= nr_desired_replicas)
if (extra <= 0)
return;
nr_cached = nr_good - nr_desired_replicas;
extent_for_each_ptr(e, ptr) {
int n = bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
if (n && n <= extra &&
!dev_in_target(c->devs[ptr->dev], target)) {
ptr->cached = true;
nr_cached--;
if (!nr_cached)
return;
extra -= n;
}
}
extent_for_each_ptr(e, ptr) {
int n = bch2_extent_ptr_durability(c, ptr);
if (n && n <= extra) {
ptr->cached = true;
extra -= n;
}
}
}
/*

View File

@ -52,9 +52,12 @@ bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
unsigned bch2_extent_ptr_durability(struct bch_fs *,
const struct bch_extent_ptr *);
unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
struct bch_extent_ptr, u64);

View File

@ -791,7 +791,7 @@ static void __bch2_write(struct closure *cl)
ret = bch2_write_extent(op, wp);
BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use >
BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr >
ARRAY_SIZE(op->open_buckets));
bch2_open_bucket_get(c, wp,
&op->open_buckets_nr,

View File

@ -23,7 +23,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
bch2_extent_drop_device(e, dev_idx);
nr_good = bch2_extent_nr_good_ptrs(c, e.c);
nr_good = bch2_extent_durability(c, e.c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;

View File

@ -688,7 +688,7 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
unsigned nr_good = bch2_extent_durability(c, e);
unsigned replicas = type == BKEY_TYPE_BTREE
? c->opts.metadata_replicas
: io_opts->data_replicas;

View File

@ -132,6 +132,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
};
}
@ -249,6 +252,17 @@ static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
}
}
/*
 * dev_in_target() by device index rather than by struct bch_dev pointer;
 * takes the RCU read lock around the c->devs[] dereference since device
 * pointers are RCU protected.
 */
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *);

View File

@ -27,6 +27,7 @@ struct bch_member_cpu {
u8 replacement;
u8 discard;
u8 data_allowed;
u8 durability;
u8 valid;
};

View File

@ -138,6 +138,7 @@ read_attribute(block_size);
read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(iostats);
read_attribute(read_priority_stats);
read_attribute(write_priority_stats);
@ -800,6 +801,7 @@ SHOW(bch2_dev)
sysfs_print(block_size, block_bytes(c));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
sysfs_print(durability, ca->mi.durability);
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_group) {
@ -930,6 +932,7 @@ struct attribute *bch2_dev_files[] = {
&sysfs_block_size,
&sysfs_first_bucket,
&sysfs_nbuckets,
&sysfs_durability,
/* settings: */
&sysfs_discard,