Mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-02-23 00:00:02 +03:00)
Update bcachefs sources to 9e7ae5219c bcachefs: Make write points more dynamic

parent 74cb922032
commit 22291ae84a
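The headline change in this sync is the write point rework visible in the allocator diff below: write points are no longer fixed fields hanging off bch_fs and bch_dev, but a small pool of struct write_point entries looked up by hashing an opaque write_point cookie, with the least recently used entry rebound when no match exists. The following is a minimal, userspace-only sketch of that lookup pattern, not the code from this commit: the real implementation uses hash_long(), hlist chains in write_points_hash[], per-write-point mutexes and sched_clock() timestamps, while the helper names here (wp_hash, logical_clock) and the direct-mapped table are illustrative simplifications.

#include <stdint.h>
#include <stdio.h>

#define WRITE_POINT_COUNT 32

struct write_point {
	unsigned long	write_point;	/* cookie this slot is currently bound to */
	uint64_t	last_used;	/* logical timestamp, used for LRU reuse */
};

static struct write_point  write_points[WRITE_POINT_COUNT];	  /* fixed pool */
static struct write_point *write_points_hash[WRITE_POINT_COUNT]; /* cookie hash -> pool slot */
static uint64_t		   logical_clock;

static unsigned wp_hash(unsigned long cookie)
{
	/* stand-in for the kernel's hash_long() */
	return (unsigned)(((uint64_t)cookie * 0x9e3779b97f4a7c15ULL) >> 32) % WRITE_POINT_COUNT;
}

static struct write_point *writepoint_find(unsigned long cookie)
{
	unsigned bucket = wp_hash(cookie);
	struct write_point *wp = write_points_hash[bucket];
	struct write_point *oldest = NULL;
	unsigned i;

	if (wp && wp->write_point == cookie)
		goto out;

	/* miss: rebind the least recently used write point to this cookie */
	for (i = 0; i < WRITE_POINT_COUNT; i++)
		if (!oldest || write_points[i].last_used < oldest->last_used)
			oldest = &write_points[i];

	wp = oldest;
	if (write_points_hash[wp_hash(wp->write_point)] == wp)
		write_points_hash[wp_hash(wp->write_point)] = NULL; /* drop old binding */

	wp->write_point = cookie;
	write_points_hash[bucket] = wp;
out:
	wp->last_used = ++logical_clock;
	return wp;
}

int main(void)
{
	struct write_point *a = writepoint_find(0x1234);
	struct write_point *b = writepoint_find(0x1234);
	struct write_point *c = writepoint_find(0x5678);

	printf("same cookie -> same write point: %s\n", a == b ? "yes" : "no");
	printf("different cookie -> different write point: %s\n", a != c ? "yes" : "no");
	return 0;
}

The LRU fallback is what lets any number of distinct write-point cookies share the fixed WRITE_POINT_COUNT pool, which is the sense in which the commit title calls write points "more dynamic".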
@@ -1 +1 @@
-661faf58dbcab87e512e64e7cb164905689e64c8
+192d759a491f50d92c89c2e842639d2307c815a5
@@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c,
 if (ret)
 die("error reserving space in new filesystem: %s", strerror(-ret));

-bch2_write_op_init(&op, c, res, c->write_points,
+bch2_write_op_init(&op, c, res, NULL, 0,
 POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
 closure_call(&op.cl, bch2_write, NULL, &cl);
 closure_sync(&cl);
@@ -70,7 +70,7 @@ extern int register_refined_jiffies(long clock_tick_rate);
 /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
 #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)

-static inline u64 local_clock(void)
+static inline u64 sched_clock(void)
 {
 struct timespec ts;

@@ -79,6 +79,11 @@ static inline u64 local_clock(void)
 return ((s64) ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
 }

+static inline u64 local_clock(void)
+{
+return sched_clock();
+}
+
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
@@ -87,7 +92,7 @@ extern unsigned long nsecs_to_jiffies(u64 n);

 static inline u64 get_jiffies_64(void)
 {
-return nsecs_to_jiffies64(local_clock());
+return nsecs_to_jiffies64(sched_clock());
 }

 #define jiffies_64 get_jiffies_64()
@@ -1,8 +1,6 @@
 #ifndef _LINUX_RCULIST_H
 #define _LINUX_RCULIST_H

-#ifdef __KERNEL__
-
 /*
 * RCU-protected list version
 */
@@ -671,5 +669,4 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
 pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
 &(pos)->member)), typeof(*(pos)), member))

-#endif /* __KERNEL__ */
 #endif
@@ -70,6 +70,7 @@
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/random.h>
+#include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/sched/task.h>
 #include <linux/sort.h>
@@ -1118,6 +1119,7 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
 {
 enum bucket_alloc_ret ret = NO_DEVICES;
 struct dev_alloc_list devs_sorted;
+u64 buckets_free;
 unsigned i;

 BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
@@ -1127,46 +1129,55 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,

 rcu_read_lock();
 devs_sorted = bch2_wp_alloc_list(c, wp, devs);
+spin_lock(&ob->lock);

 for (i = 0; i < devs_sorted.nr; i++) {
 struct bch_dev *ca =
 rcu_dereference(c->devs[devs_sorted.devs[i]]);
-long bucket;
+struct open_bucket_ptr ptr;

 if (!ca)
 continue;

-bucket = bch2_bucket_alloc(c, ca, reserve);
-if (bucket < 0) {
-ret = FREELIST_EMPTY;
-continue;
+if (wp->type == BCH_DATA_USER &&
+ca->open_buckets_partial_nr) {
+ptr = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+} else {
+long bucket = bch2_bucket_alloc(c, ca, reserve);
+if (bucket < 0) {
+ret = FREELIST_EMPTY;
+continue;
+}
+
+ptr = (struct open_bucket_ptr) {
+.ptr.gen = ca->buckets[bucket].mark.gen,
+.ptr.offset = bucket_to_sector(ca, bucket),
+.ptr.dev = ca->dev_idx,
+.sectors_free = ca->mi.bucket_size,
+};
 }

-wp->next_alloc[ca->dev_idx] +=
-div64_u64(U64_MAX, dev_buckets_free(ca) *
-ca->mi.bucket_size);
-bch2_wp_rescale(c, ca, wp);
-
-__clear_bit(ca->dev_idx, devs->d);

 /*
 * open_bucket_add_buckets expects new pointers at the head of
 * the list:
 */
-BUG_ON(ob->nr_ptrs >= BCH_REPLICAS_MAX);
+BUG_ON(ob->nr_ptrs >= ARRAY_SIZE(ob->ptrs));
 memmove(&ob->ptrs[1],
 &ob->ptrs[0],
 ob->nr_ptrs * sizeof(ob->ptrs[0]));
-memmove(&ob->ptr_offset[1],
-&ob->ptr_offset[0],
-ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
 ob->nr_ptrs++;
-ob->ptrs[0] = (struct bch_extent_ptr) {
-.gen = ca->buckets[bucket].mark.gen,
-.offset = bucket_to_sector(ca, bucket),
-.dev = ca->dev_idx,
-};
-ob->ptr_offset[0] = 0;
+ob->ptrs[0] = ptr;
+buckets_free = dev_buckets_free(ca);
+if (buckets_free)
+wp->next_alloc[ca->dev_idx] +=
+div64_u64(U64_MAX, buckets_free *
+ca->mi.bucket_size);
+else
+wp->next_alloc[ca->dev_idx] = U64_MAX;
+bch2_wp_rescale(c, ca, wp);
+
+__clear_bit(ca->dev_idx, devs->d);

 if (ob->nr_ptrs == nr_replicas) {
 ret = ALLOC_SUCCESS;
@@ -1175,6 +1186,7 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
 }

 EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
+spin_unlock(&ob->lock);
 rcu_read_unlock();
 return ret;
 }
@@ -1242,24 +1254,45 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,

 void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
-const struct bch_extent_ptr *ptr;
+const struct open_bucket_ptr *ptr;
 u8 new_ob;

 if (!atomic_dec_and_test(&ob->pin))
 return;

-spin_lock(&c->open_buckets_lock);
+down_read(&c->alloc_gc_lock);
+spin_lock(&ob->lock);

 open_bucket_for_each_ptr(ob, ptr) {
-struct bch_dev *ca = c->devs[ptr->dev];
+struct bch_dev *ca = c->devs[ptr->ptr.dev];

-bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
+if (ptr->sectors_free) {
+/*
+* This is a ptr to a bucket that still has free space,
+* but we don't want to use it
+*/
+BUG_ON(ca->open_buckets_partial_nr >=
+ARRAY_SIZE(ca->open_buckets_partial));
+
+spin_lock(&ca->freelist_lock);
+ca->open_buckets_partial[ca->open_buckets_partial_nr++]
+= *ptr;
+spin_unlock(&ca->freelist_lock);
+} else {
+bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false);
+}
 }

 ob->nr_ptrs = 0;

+spin_unlock(&ob->lock);
+up_read(&c->alloc_gc_lock);
+
 new_ob = ob->new_ob;
 ob->new_ob = 0;

-list_move(&ob->list, &c->open_buckets_free);
+spin_lock(&c->open_buckets_lock);
+ob->freelist = c->open_buckets_freelist;
+c->open_buckets_freelist = ob - c->open_buckets;
 c->open_buckets_nr_free++;
 spin_unlock(&c->open_buckets_lock);

@@ -1270,22 +1303,19 @@ void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 }

 static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
 unsigned nr_reserved,
 struct closure *cl)
 {
 struct open_bucket *ret;

 spin_lock(&c->open_buckets_lock);

 if (c->open_buckets_nr_free > nr_reserved) {
-BUG_ON(list_empty(&c->open_buckets_free));
-ret = list_first_entry(&c->open_buckets_free,
-struct open_bucket, list);
-list_move(&ret->list, &c->open_buckets_open);
-BUG_ON(ret->nr_ptrs);
+BUG_ON(!c->open_buckets_freelist);

+ret = c->open_buckets + c->open_buckets_freelist;
+c->open_buckets_freelist = ret->freelist;
 atomic_set(&ret->pin, 1); /* XXX */
-ret->has_full_ptrs = false;

 BUG_ON(ret->new_ob);
 BUG_ON(ret->nr_ptrs);
@@ -1307,148 +1337,259 @@ static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
 return ret;
 }

-static unsigned ob_ptr_sectors_free(struct bch_fs *c,
-struct open_bucket *ob,
-struct bch_extent_ptr *ptr)
-{
-struct bch_dev *ca = c->devs[ptr->dev];
-unsigned i = ptr - ob->ptrs;
-unsigned used = bucket_remainder(ca, ptr->offset) +
-ob->ptr_offset[i];
-
-BUG_ON(used > ca->mi.bucket_size);
-
-return ca->mi.bucket_size - used;
-}
-
 static unsigned open_bucket_sectors_free(struct bch_fs *c,
 struct open_bucket *ob,
 unsigned nr_replicas)
 {
-unsigned i, sectors_free = UINT_MAX;
+unsigned sectors_free = UINT_MAX;
+struct open_bucket_ptr *ptr;

-for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
-sectors_free = min(sectors_free,
-ob_ptr_sectors_free(c, ob, &ob->ptrs[i]));
+open_bucket_for_each_ptr(ob, ptr)
+sectors_free = min(sectors_free, ptr->sectors_free);

 return sectors_free != UINT_MAX ? sectors_free : 0;
 }

-static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
-struct open_bucket *new,
-struct open_bucket *old)
+static void open_bucket_move_ptrs(struct bch_fs *c,
+struct open_bucket *dst,
+struct open_bucket *src,
+struct bch_devs_mask *devs,
+unsigned nr_ptrs_dislike)
 {
 bool moved_ptr = false;
 int i;

-for (i = old->nr_ptrs - 1; i >= 0; --i)
-if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) {
-BUG_ON(new->nr_ptrs >= BCH_REPLICAS_MAX);
+down_read(&c->alloc_gc_lock);

-new->ptrs[new->nr_ptrs] = old->ptrs[i];
-new->ptr_offset[new->nr_ptrs] = old->ptr_offset[i];
-new->nr_ptrs++;
+if (dst < src) {
+spin_lock(&dst->lock);
+spin_lock_nested(&src->lock, 1);
+} else {
+spin_lock(&src->lock);
+spin_lock_nested(&dst->lock, 1);
+}

-old->nr_ptrs--;
-memmove(&old->ptrs[i],
-&old->ptrs[i + 1],
-(old->nr_ptrs - i) * sizeof(old->ptrs[0]));
-memmove(&old->ptr_offset[i],
-&old->ptr_offset[i + 1],
-(old->nr_ptrs - i) * sizeof(old->ptr_offset[0]));
+for (i = src->nr_ptrs - 1; i >= 0; --i) {
+if (!src->ptrs[i].sectors_free) {
+/*
+* Don't do anything: leave the ptr on the old
+* open_bucket for gc to find
+*/
+} else if (nr_ptrs_dislike &&
+!test_bit(src->ptrs[i].ptr.dev, devs->d)) {
+/*
+* We don't want this pointer; bch2_open_bucket_put()
+* will stick it on ca->open_buckets_partial to be
+* reused
+*/
+--nr_ptrs_dislike;
+} else {
+BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs));
+
+dst->ptrs[dst->nr_ptrs++] = src->ptrs[i];
+
+src->nr_ptrs--;
+memmove(&src->ptrs[i],
+&src->ptrs[i + 1],
+(src->nr_ptrs - i) * sizeof(src->ptrs[0]));
+
 moved_ptr = true;
 }
+}

 if (moved_ptr) {
-BUG_ON(old->new_ob);
+BUG_ON(src->new_ob);

-atomic_inc(&new->pin);
-old->new_ob = new - c->open_buckets;
+atomic_inc(&dst->pin);
+src->new_ob = dst - c->open_buckets;
 }

+spin_unlock(&dst->lock);
+spin_unlock(&src->lock);
+up_read(&c->alloc_gc_lock);
 }

 static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-const struct bch_extent_ptr *ptr;
+const struct open_bucket_ptr *ptr;

 open_bucket_for_each_ptr(ob, ptr) {
-struct bch_dev *ca = c->devs[ptr->dev];
+struct bch_dev *ca = c->devs[ptr->ptr.dev];

-BUG_ON(ptr_stale(ca, ptr));
+BUG_ON(ptr_stale(ca, &ptr->ptr));
 }
 #endif
 }

 /* Sector allocator */

-static struct open_bucket *lock_writepoint(struct bch_fs *c,
-struct write_point *wp)
-{
-struct open_bucket *ob;
-
-while ((ob = ACCESS_ONCE(wp->b))) {
-mutex_lock(&ob->lock);
-if (wp->b == ob)
-break;
-
-mutex_unlock(&ob->lock);
-}
-
-return ob;
-}
-
 static int open_bucket_add_buckets(struct bch_fs *c,
 struct write_point *wp,
+struct bch_devs_mask *_devs,
 struct open_bucket *ob,
 unsigned nr_replicas,
-unsigned nr_replicas_required,
 enum alloc_reserve reserve,
 struct closure *cl)
 {
 struct bch_devs_mask devs = c->rw_devs[wp->type];
-unsigned i;
-int ret;
+struct open_bucket_ptr *ptr;

 if (ob->nr_ptrs >= nr_replicas)
 return 0;

+if (_devs)
+bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX);
+
 /* Don't allocate from devices we already have pointers to: */
-for (i = 0; i < ob->nr_ptrs; i++)
-__clear_bit(ob->ptrs[i].dev, devs.d);
+open_bucket_for_each_ptr(ob, ptr)
+if (ptr->sectors_free)
+__clear_bit(ptr->ptr.dev, devs.d);

-if (wp->group)
-bitmap_and(devs.d, devs.d, wp->group->d, BCH_SB_MEMBERS_MAX);
+return bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
+reserve, &devs, cl);
+}

-ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
-reserve, &devs, cl);
+static struct write_point *__writepoint_find(struct hlist_head *head,
+unsigned long write_point)
+{
+struct write_point *wp;

-if (ret == -EROFS &&
-ob->nr_ptrs >= nr_replicas_required)
-ret = 0;
+hlist_for_each_entry_rcu(wp, head, node) {
+if (wp->write_point == write_point)
+continue;

-return ret;
+mutex_lock(&wp->lock);
+if (wp->write_point == write_point)
+return wp;
+mutex_unlock(&wp->lock);
+}
+
+return NULL;
+}
+
+static struct hlist_head *writepoint_hash(struct bch_fs *c,
+unsigned long write_point)
+{
+unsigned hash =
+hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+return &c->write_points_hash[hash];
+}
+
+static struct write_point *writepoint_find(struct bch_fs *c,
+enum bch_data_type data_type,
+unsigned long write_point)
+{
+struct write_point *wp, *oldest = NULL;
+struct hlist_head *head;
+
+switch (data_type) {
+case BCH_DATA_BTREE:
+wp = &c->btree_write_point;
+mutex_lock(&wp->lock);
+return wp;
+case BCH_DATA_USER:
+break;
+default:
+BUG();
+}
+
+head = writepoint_hash(c, write_point);
+wp = __writepoint_find(head, write_point);
+if (wp)
+goto out;
+
+mutex_lock(&c->write_points_hash_lock);
+wp = __writepoint_find(head, write_point);
+if (wp)
+goto out_unlock;
+
+for (wp = c->write_points;
+wp < c->write_points + ARRAY_SIZE(c->write_points);
+wp++)
+if (!oldest || time_before64(wp->last_used, oldest->last_used))
+oldest = wp;
+
+wp = oldest;
+BUG_ON(!wp);
+
+mutex_lock(&wp->lock);
+hlist_del_rcu(&wp->node);
+wp->write_point = write_point;
+hlist_add_head_rcu(&wp->node, head);
+out_unlock:
+mutex_unlock(&c->write_points_hash_lock);
+out:
+wp->last_used = sched_clock();
+return wp;
 }

 /*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
-struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c,
-struct write_point *wp,
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+enum bch_data_type data_type,
+struct bch_devs_mask *devs,
+unsigned long write_point,
 unsigned nr_replicas,
 unsigned nr_replicas_required,
 enum alloc_reserve reserve,
+unsigned flags,
 struct closure *cl)
 {
 struct open_bucket *ob;
-unsigned open_buckets_reserved = wp == &c->btree_write_point
+struct write_point *wp;
+struct open_bucket_ptr *ptr;
+unsigned open_buckets_reserved = data_type == BCH_DATA_BTREE
 ? 0 : BTREE_NODE_RESERVE;
+unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0;
 int ret;

 BUG_ON(!nr_replicas);
-retry:
-ob = lock_writepoint(c, wp);
+wp = writepoint_find(c, data_type, write_point);
+BUG_ON(wp->type != data_type);
+
+wp->last_used = sched_clock();
+
+ob = wp->ob;
+
+/* does ob have ptrs we don't need? */
+open_bucket_for_each_ptr(ob, ptr) {
+if (!ptr->sectors_free)
+nr_ptrs_empty++;
+else if (devs && !test_bit(ptr->ptr.dev, devs->d))
+nr_ptrs_dislike++;
+}
+
+ret = open_bucket_add_buckets(c, wp, devs, ob,
+nr_replicas + nr_ptrs_empty + nr_ptrs_dislike,
+reserve, cl);
+if (ret && ret != -EROFS)
+goto err;
+
+if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+goto alloc_done;
+
+/*
+* XXX:
+* Should this allocation be _forced_ to used the specified device (e.g.
+* internal migration), or should we fall back to allocating from all
+* devices?
+*/
+ret = open_bucket_add_buckets(c, wp, NULL, ob,
+nr_replicas + nr_ptrs_empty,
+reserve, cl);
+if (ret && ret != -EROFS)
+goto err;
+alloc_done:
+if (ob->nr_ptrs - nr_ptrs_empty -
+((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0)
+< nr_replicas_required) {
+ret = -EROFS;
+goto err;
+}

 /*
 * If ob->sectors_free == 0, one or more of the buckets ob points to is
@@ -1456,53 +1597,34 @@ retry:
 * still needs to find them; instead, we must allocate a new open bucket
 * and copy any pointers to non-full buckets into the new open bucket.
 */
-if (!ob || ob->has_full_ptrs) {
-struct open_bucket *new_ob;
+BUG_ON(ob->nr_ptrs - nr_ptrs_empty - nr_replicas > nr_ptrs_dislike);
+nr_ptrs_dislike = ob->nr_ptrs - nr_ptrs_empty - nr_replicas;

-new_ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
-if (IS_ERR(new_ob))
-return new_ob;
-mutex_lock(&new_ob->lock);
-
-/*
-* We point the write point at the open_bucket before doing the
-* allocation to avoid a race with shutdown:
-*/
-if (race_fault() ||
-cmpxchg(&wp->b, ob, new_ob) != ob) {
-/* We raced: */
-mutex_unlock(&new_ob->lock);
-bch2_open_bucket_put(c, new_ob);
-
-if (ob)
-mutex_unlock(&ob->lock);
-goto retry;
+if (nr_ptrs_empty || nr_ptrs_dislike) {
+ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
+if (IS_ERR(ob)) {
+ret = PTR_ERR(ob);
+goto err;
 }

-if (ob) {
-open_bucket_copy_unused_ptrs(c, new_ob, ob);
-mutex_unlock(&ob->lock);
-bch2_open_bucket_put(c, ob);
-}
-
-ob = new_ob;
+/* Remove pointers we don't want to use: */
+open_bucket_move_ptrs(c, ob, wp->ob, devs, nr_ptrs_dislike);
+bch2_open_bucket_put(c, wp->ob);
+wp->ob = ob;
 }

-ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
-nr_replicas_required,
-reserve, cl);
-if (ret) {
-mutex_unlock(&ob->lock);
-return ERR_PTR(ret);
-}
+BUG_ON(ob->nr_ptrs < nr_replicas_required);

-ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
+wp->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);

-BUG_ON(!ob->sectors_free);
+BUG_ON(!wp->sectors_free);
 verify_not_stale(c, ob);

-return ob;
+return wp;
+err:
+mutex_unlock(&wp->lock);
+return ERR_PTR(ret);
 }

 /*
@@ -1514,29 +1636,26 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
 unsigned sectors)
 {
 struct bch_extent_ptr tmp;
-bool has_data = false;
-unsigned i;
+struct open_bucket_ptr *ptr;

 /*
 * We're keeping any existing pointer k has, and appending new pointers:
 * __bch2_write() will only write to the pointers we add here:
 */

-BUG_ON(sectors > ob->sectors_free);
-/* didn't use all the ptrs: */
-if (nr_replicas < ob->nr_ptrs)
-has_data = true;
+for (ptr = ob->ptrs;
+ptr < ob->ptrs + min_t(u8, ob->nr_ptrs, nr_replicas); ptr++) {
+struct bch_dev *ca = c->devs[ptr->ptr.dev];
+
+EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev));

-for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
-EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
-
-tmp = ob->ptrs[i];
+tmp = ptr->ptr;
 tmp.cached = bkey_extent_is_cached(&e->k);
-tmp.offset += ob->ptr_offset[i];
+tmp.offset += ca->mi.bucket_size - ptr->sectors_free;
 extent_ptr_append(e, tmp);

-ob->ptr_offset[i] += sectors;
+BUG_ON(sectors > ptr->sectors_free);
+ptr->sectors_free -= sectors;
 }
 }
@@ -1544,25 +1663,27 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
-struct open_bucket *ob)
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
 {
-bool has_data = false;
-unsigned i;
+struct open_bucket *ob = wp->ob, *new_ob = NULL;
+struct open_bucket_ptr *ptr;
+bool empty = false;

-for (i = 0; i < ob->nr_ptrs; i++) {
-if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i]))
-ob->has_full_ptrs = true;
-else
-has_data = true;
+open_bucket_for_each_ptr(ob, ptr)
+empty |= !ptr->sectors_free;
+if (empty)
+new_ob = bch2_open_bucket_get(c, 0, NULL);

+if (!IS_ERR_OR_NULL(new_ob)) {
+/* writepoint's ref becomes our ref: */
+wp->ob = new_ob;
+open_bucket_move_ptrs(c, new_ob, ob, 0, 0);
+} else {
+atomic_inc(&ob->pin);
 }

-if (likely(has_data))
-atomic_inc(&ob->pin);
-else
-BUG_ON(xchg(&wp->b, NULL) != ob);
-
-mutex_unlock(&ob->lock);
+mutex_unlock(&wp->lock);
 }

 /*
@@ -1583,27 +1704,33 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
 * @cl - closure to wait for a bucket
 */
 struct open_bucket *bch2_alloc_sectors(struct bch_fs *c,
-struct write_point *wp,
+enum bch_data_type data_type,
+struct bch_devs_mask *devs,
+unsigned long write_point,
 struct bkey_i_extent *e,
 unsigned nr_replicas,
 unsigned nr_replicas_required,
 enum alloc_reserve reserve,
+unsigned flags,
 struct closure *cl)
 {
+struct write_point *wp;
 struct open_bucket *ob;

-ob = bch2_alloc_sectors_start(c, wp, nr_replicas,
-nr_replicas_required,
-reserve, cl);
-if (IS_ERR_OR_NULL(ob))
-return ob;
+wp = bch2_alloc_sectors_start(c, data_type, devs, write_point,
+nr_replicas, nr_replicas_required,
+reserve, flags, cl);
+if (IS_ERR_OR_NULL(wp))
+return ERR_CAST(wp);

-if (e->k.size > ob->sectors_free)
-bch2_key_resize(&e->k, ob->sectors_free);
+ob = wp->ob;
+if (e->k.size > wp->sectors_free)
+bch2_key_resize(&e->k, wp->sectors_free);
+
 bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);

-bch2_alloc_sectors_done(c, wp, ob);
+bch2_alloc_sectors_done(c, wp);

 return ob;
 }
@@ -1640,8 +1767,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 }

 c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
-c->promote_write_point.group = &fastest_tier->devs;
+c->fastest_devs = fastest_tier != slowest_tier ? &fastest_tier->devs : NULL;

 if (!fastest_tier)
 goto set_capacity;
@@ -1713,49 +1839,61 @@ set_capacity:
 closure_wake_up(&c->freelist_wait);
 }

+static bool open_bucket_has_device(struct open_bucket *ob,
+struct bch_dev *ca)
+{
+struct open_bucket_ptr *ptr;
+bool ret = false;
+
+spin_lock(&ob->lock);
+open_bucket_for_each_ptr(ob, ptr)
+ret |= ptr->ptr.dev == ca->dev_idx;
+spin_unlock(&ob->lock);
+
+return ret;
+}
+
 static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
 struct write_point *wp)
 {
 struct open_bucket *ob;
-struct bch_extent_ptr *ptr;
+struct closure cl;

-ob = lock_writepoint(c, wp);
-if (!ob)
+closure_init_stack(&cl);
+retry:
+mutex_lock(&wp->lock);
+if (!open_bucket_has_device(wp->ob, ca)) {
+mutex_unlock(&wp->lock);
 return;
+}

-for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
-if (ptr->dev == ca->dev_idx)
-goto found;
+ob = bch2_open_bucket_get(c, 0, &cl);
+if (IS_ERR(ob)) {
+mutex_unlock(&wp->lock);
+closure_sync(&cl);
+goto retry;

-mutex_unlock(&ob->lock);
-return;
-found:
-BUG_ON(xchg(&wp->b, NULL) != ob);
-mutex_unlock(&ob->lock);
+}

-/* Drop writepoint's ref: */
-bch2_open_bucket_put(c, ob);
+open_bucket_move_ptrs(c, ob, wp->ob, &ca->self, ob->nr_ptrs);
+bch2_open_bucket_put(c, wp->ob);
+wp->ob = ob;

+mutex_unlock(&wp->lock);
 }

 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
 {
-struct bch_extent_ptr *ptr;
 struct open_bucket *ob;
+bool ret = false;

 for (ob = c->open_buckets;
 ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
 ob++)
-if (atomic_read(&ob->pin)) {
-mutex_lock(&ob->lock);
-for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
-if (ptr->dev == ca->dev_idx) {
-mutex_unlock(&ob->lock);
-return true;
-}
-mutex_unlock(&ob->lock);
-}
+if (atomic_read(&ob->pin))
+ret |= open_bucket_has_device(ob, ca);

-return false;
+return ret;
 }

 /* device goes ro: */
@@ -1782,11 +1920,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 /* Next, close write points that point to this device... */
 for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
 bch2_stop_write_point(c, ca, &c->write_points[i]);

-bch2_stop_write_point(c, ca, &ca->copygc_write_point);
-bch2_stop_write_point(c, ca, &c->promote_write_point);
-bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
-bch2_stop_write_point(c, ca, &c->migration_write_point);
 bch2_stop_write_point(c, ca, &c->btree_write_point);

 mutex_lock(&c->btree_reserve_cache_lock);
@@ -1880,35 +2013,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca)

 void bch2_fs_allocator_init(struct bch_fs *c)
 {
-unsigned i;
+struct open_bucket *ob;
+struct write_point *wp;

-INIT_LIST_HEAD(&c->open_buckets_open);
-INIT_LIST_HEAD(&c->open_buckets_free);
+mutex_init(&c->write_points_hash_lock);
+init_rwsem(&c->alloc_gc_lock);
 spin_lock_init(&c->open_buckets_lock);
 bch2_prio_timer_init(c, READ);
 bch2_prio_timer_init(c, WRITE);

 /* open bucket 0 is a sentinal NULL: */
-mutex_init(&c->open_buckets[0].lock);
-INIT_LIST_HEAD(&c->open_buckets[0].list);
+spin_lock_init(&c->open_buckets[0].lock);

-for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
-mutex_init(&c->open_buckets[i].lock);
+for (ob = c->open_buckets + 1;
+ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+spin_lock_init(&ob->lock);
 c->open_buckets_nr_free++;
-list_add(&c->open_buckets[i].list, &c->open_buckets_free);
+ob->freelist = c->open_buckets_freelist;
+c->open_buckets_freelist = ob - c->open_buckets;
 }

-c->journal.wp.type = BCH_DATA_JOURNAL;
+mutex_init(&c->btree_write_point.lock);
 c->btree_write_point.type = BCH_DATA_BTREE;
+c->btree_write_point.ob = bch2_open_bucket_get(c, 0, NULL);
+BUG_ON(IS_ERR(c->btree_write_point.ob));

-for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
-c->tiers[i].wp.type = BCH_DATA_USER;
-for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-c->write_points[i].type = BCH_DATA_USER;
+for (wp = c->write_points;
+wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
+mutex_init(&wp->lock);
+wp->type = BCH_DATA_USER;
+wp->ob = bch2_open_bucket_get(c, 0, NULL);
+wp->last_used = sched_clock();
+
+wp->write_point = (unsigned long) wp;
+hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));

-c->promote_write_point.type = BCH_DATA_USER;
-c->migration_write_point.type = BCH_DATA_USER;
+BUG_ON(IS_ERR(wp->ob));
+}

 c->pd_controllers_update_seconds = 5;
 INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
@@ -28,20 +28,28 @@ long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);

 void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);

-struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *,
-struct write_point *,
-unsigned, unsigned,
-enum alloc_reserve,
-struct closure *);
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
+enum bch_data_type,
+struct bch_devs_mask *,
+unsigned long,
+unsigned, unsigned,
+enum alloc_reserve,
+unsigned,
+struct closure *);

 void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
 unsigned, struct open_bucket *, unsigned);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *,
-struct open_bucket *);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);

-struct open_bucket *bch2_alloc_sectors(struct bch_fs *, struct write_point *,
-struct bkey_i_extent *, unsigned, unsigned,
-enum alloc_reserve, struct closure *);
+struct open_bucket *bch2_alloc_sectors(struct bch_fs *,
+enum bch_data_type,
+struct bch_devs_mask *,
+unsigned long,
+struct bkey_i_extent *,
+unsigned, unsigned,
+enum alloc_reserve,
+unsigned,
+struct closure *);

 static inline void bch2_wake_allocator(struct bch_dev *ca)
 {
@@ -2,6 +2,7 @@
 #define _BCACHEFS_ALLOC_TYPES_H

 #include <linux/mutex.h>
+#include <linux/spinlock.h>

 #include "clock_types.h"

@@ -44,39 +45,34 @@ enum alloc_reserve {

 /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
 #define OPEN_BUCKETS_COUNT 256
+#define WRITE_POINT_COUNT 32

-#define WRITE_POINT_COUNT 16
+struct open_bucket_ptr {
+struct bch_extent_ptr ptr;
+unsigned sectors_free;
+};

 struct open_bucket {
-struct list_head list;
-struct mutex lock;
+spinlock_t lock;
 atomic_t pin;
-bool has_full_ptrs;
+u8 freelist;
 u8 new_ob;
+u8 nr_ptrs;

-/*
-* recalculated every time we allocate from this open_bucket based on
-* how many pointers we're actually going to use:
-*/
-unsigned sectors_free;
-unsigned nr_ptrs;
-struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
-unsigned ptr_offset[BCH_REPLICAS_MAX];
+struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2];
 };

 struct write_point {
-struct open_bucket *b;
+struct hlist_node node;
+struct mutex lock;
+u64 last_used;
+unsigned long write_point;
 enum bch_data_type type;

-/*
-* If not NULL, cache group for tiering, promotion and moving GC -
-* always allocates a single replica
-*
-* Otherwise do a normal replicated bucket allocation that could come
-* from any device in tier 0 (foreground write)
-*/
-struct bch_devs_mask *group;
+/* calculated based on how many pointers we're actually going to use: */
+unsigned sectors_free;

+struct open_bucket *ob;
 u64 next_alloc[BCH_SB_MEMBERS_MAX];
 };

@@ -392,6 +392,9 @@ struct bch_dev {
 unsigned nr_invalidated;
 bool alloc_thread_started;

+struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT];
+unsigned open_buckets_partial_nr;
+
 size_t fifo_last_bucket;

 /* Allocation stuff: */
@@ -426,8 +429,6 @@ struct bch_dev {

 struct bch_pd_controller moving_gc_pd;

-struct write_point copygc_write_point;
-
 struct journal_device journal;

 struct work_struct io_error_work;
@@ -472,7 +473,6 @@ struct bch_tier {
 struct bch_pd_controller pd;

 struct bch_devs_mask devs;
-struct write_point wp;
 };

 enum bch_fs_state {
@@ -546,40 +546,7 @@ struct bch_fs {
 struct btree_root btree_roots[BTREE_ID_NR];
 struct mutex btree_root_lock;

-bool btree_cache_table_init_done;
-struct rhashtable btree_cache_table;
-
-/*
-* We never free a struct btree, except on shutdown - we just put it on
-* the btree_cache_freed list and reuse it later. This simplifies the
-* code, and it doesn't cost us much memory as the memory usage is
-* dominated by buffers that hold the actual btree node data and those
-* can be freed - and the number of struct btrees allocated is
-* effectively bounded.
-*
-* btree_cache_freeable effectively is a small cache - we use it because
-* high order page allocations can be rather expensive, and it's quite
-* common to delete and allocate btree nodes in quick succession. It
-* should never grow past ~2-3 nodes in practice.
-*/
-struct mutex btree_cache_lock;
-struct list_head btree_cache;
-struct list_head btree_cache_freeable;
-struct list_head btree_cache_freed;
-
-/* Number of elements in btree_cache + btree_cache_freeable lists */
-unsigned btree_cache_used;
-unsigned btree_cache_reserve;
-struct shrinker btree_cache_shrink;
-
-/*
-* If we need to allocate memory for a new btree node and that
-* allocation fails, we can cannibalize another node in the btree cache
-* to satisfy the allocation - lock to guarantee only one thread does
-* this at a time:
-*/
-struct closure_waitlist mca_wait;
-struct task_struct *btree_cache_alloc_lock;
+struct btree_cache btree_cache;

 mempool_t btree_reserve_pool;

@@ -606,6 +573,7 @@ struct bch_fs {
 struct workqueue_struct *copygc_wq;

 /* ALLOCATION */
+struct rw_semaphore alloc_gc_lock;
 struct bch_pd_controller foreground_write_pd;
 struct delayed_work pd_controllers_update;
 unsigned pd_controllers_update_seconds;
@@ -622,6 +590,7 @@ struct bch_fs {
 struct bch_devs_mask rw_devs[BCH_DATA_NR];
 struct bch_tier tiers[BCH_TIER_MAX];
 /* NULL if we only have devices in one tier: */
+struct bch_devs_mask *fastest_devs;
 struct bch_tier *fastest_tier;

 u64 capacity; /* sectors */
@@ -654,17 +623,17 @@ struct bch_fs {
 struct io_clock io_clock[2];

 /* SECTOR ALLOCATOR */
-struct list_head open_buckets_open;
-struct list_head open_buckets_free;
-unsigned open_buckets_nr_free;
-struct closure_waitlist open_buckets_wait;
 spinlock_t open_buckets_lock;
+u8 open_buckets_freelist;
+u8 open_buckets_nr_free;
+struct closure_waitlist open_buckets_wait;
 struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];

 struct write_point btree_write_point;

 struct write_point write_points[WRITE_POINT_COUNT];
-struct write_point promote_write_point;
+struct hlist_head write_points_hash[WRITE_POINT_COUNT];
+struct mutex write_points_hash_lock;

 /*
 * This write point is used for migrating data off a device
@@ -31,13 +31,15 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
 reserve += min_t(unsigned, 1,
 c->btree_roots[i].b->level) * 8;

-c->btree_cache_reserve = reserve;
+c->btree_cache.reserve = reserve;
 }

-#define mca_can_free(c) \
-max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+return max_t(int, 0, bc->used - bc->reserve);
+}

-static void __mca_data_free(struct bch_fs *c, struct btree *b)
+static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
 EBUG_ON(btree_node_write_in_flight(b));

@@ -46,11 +48,13 @@ static void __mca_data_free(struct bch_fs *c, struct btree *b)
 bch2_btree_keys_free(b);
 }

-static void mca_data_free(struct bch_fs *c, struct btree *b)
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
-__mca_data_free(c, b);
-c->btree_cache_used--;
-list_move(&b->list, &c->btree_cache_freed);
+struct btree_cache *bc = &c->btree_cache;
+
+__btree_node_data_free(c, b);
+bc->used--;
+list_move(&b->list, &bc->freed);
 }

 static const struct rhashtable_params bch_btree_cache_params = {
@@ -59,8 +63,10 @@ static const struct rhashtable_params bch_btree_cache_params = {
 .key_len = sizeof(struct bch_extent_ptr),
 };

-static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
+struct btree_cache *bc = &c->btree_cache;
+
 b->data = kvpmalloc(btree_bytes(c), gfp);
 if (!b->data)
 goto err;
@@ -68,16 +74,16 @@ static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
 goto err;

-c->btree_cache_used++;
-list_move(&b->list, &c->btree_cache_freeable);
+bc->used++;
+list_move(&b->list, &bc->freeable);
 return;
 err:
 kvpfree(b->data, btree_bytes(c));
 b->data = NULL;
-list_move(&b->list, &c->btree_cache_freed);
+list_move(&b->list, &bc->freed);
 }

-static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
+static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 {
 struct btree *b = kzalloc(sizeof(struct btree), gfp);
 if (!b)
@@ -88,49 +94,48 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
 INIT_LIST_HEAD(&b->list);
 INIT_LIST_HEAD(&b->write_blocked);

-mca_data_alloc(c, b, gfp);
+btree_node_data_alloc(c, b, gfp);
 return b->data ? b : NULL;
 }

 /* Btree in memory cache - hash table */

-void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 {
-rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
-bch_btree_cache_params);
+rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

 /* Cause future lookups for this node to fail: */
 bkey_i_to_extent(&b->key)->v._data[0] = 0;
 }

-int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b)
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
 {
-return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
+return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
 bch_btree_cache_params);
 }

-int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b,
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
 unsigned level, enum btree_id id)
 {
 int ret;

 b->level = level;
 b->btree_id = id;

-mutex_lock(&c->btree_cache_lock);
-ret = __bch2_btree_node_hash_insert(c, b);
+mutex_lock(&bc->lock);
+ret = __bch2_btree_node_hash_insert(bc, b);
 if (!ret)
-list_add(&b->list, &c->btree_cache);
-mutex_unlock(&c->btree_cache_lock);
+list_add(&b->list, &bc->live);
+mutex_unlock(&bc->lock);

 return ret;
 }

 __flatten
-static inline struct btree *mca_find(struct bch_fs *c,
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
 const struct bkey_i *k)
 {
-return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
+return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
 bch_btree_cache_params);
 }

@@ -140,9 +145,10 @@ static inline struct btree *mca_find(struct bch_fs *c,
 */
 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 {
+struct btree_cache *bc = &c->btree_cache;
 int ret = 0;

-lockdep_assert_held(&c->btree_cache_lock);
+lockdep_assert_held(&bc->lock);

 if (!six_trylock_intent(&b->lock))
 return -ENOMEM;
@@ -201,11 +207,12 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 return __btree_node_reclaim(c, b, true);
 }

-static unsigned long bch2_mca_scan(struct shrinker *shrink,
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 struct shrink_control *sc)
 {
 struct bch_fs *c = container_of(shrink, struct bch_fs,
-btree_cache_shrink);
+btree_cache.shrink);
+struct btree_cache *bc = &c->btree_cache;
 struct btree *b, *t;
 unsigned long nr = sc->nr_to_scan;
 unsigned long can_free;
@@ -218,8 +225,8 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,

 /* Return -1 if we can't do anything right now */
 if (sc->gfp_mask & __GFP_IO)
-mutex_lock(&c->btree_cache_lock);
-else if (!mutex_trylock(&c->btree_cache_lock))
+mutex_lock(&bc->lock);
+else if (!mutex_trylock(&bc->lock))
 return -1;

 /*
@@ -230,11 +237,11 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,
 * IO can always make forward progress:
 */
 nr /= btree_pages(c);
-can_free = mca_can_free(c);
+can_free = btree_cache_can_free(bc);
 nr = min_t(unsigned long, nr, can_free);

 i = 0;
-list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+list_for_each_entry_safe(b, t, &bc->freeable, list) {
 touched++;

 if (freed >= nr)
@@ -242,34 +249,34 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,

 if (++i > 3 &&
 !btree_node_reclaim(c, b)) {
-mca_data_free(c, b);
+btree_node_data_free(c, b);
 six_unlock_write(&b->lock);
 six_unlock_intent(&b->lock);
 freed++;
 }
 }
 restart:
-list_for_each_entry_safe(b, t, &c->btree_cache, list) {
+list_for_each_entry_safe(b, t, &bc->live, list) {
 touched++;

 if (freed >= nr) {
 /* Save position */
-if (&t->list != &c->btree_cache)
-list_move_tail(&c->btree_cache, &t->list);
+if (&t->list != &bc->live)
+list_move_tail(&bc->live, &t->list);
 break;
 }

 if (!btree_node_accessed(b) &&
 !btree_node_reclaim(c, b)) {
-/* can't call bch2_btree_node_hash_remove under btree_cache_lock */
+/* can't call bch2_btree_node_hash_remove under lock */
 freed++;
-if (&t->list != &c->btree_cache)
-list_move_tail(&c->btree_cache, &t->list);
+if (&t->list != &bc->live)
+list_move_tail(&bc->live, &t->list);

-mca_data_free(c, b);
-mutex_unlock(&c->btree_cache_lock);
+btree_node_data_free(c, b);
+mutex_unlock(&bc->lock);

-bch2_btree_node_hash_remove(c, b);
+bch2_btree_node_hash_remove(bc, b);
 six_unlock_write(&b->lock);
 six_unlock_intent(&b->lock);

@@ -277,97 +284,97 @@ restart:
 goto out;

 if (sc->gfp_mask & __GFP_IO)
-mutex_lock(&c->btree_cache_lock);
-else if (!mutex_trylock(&c->btree_cache_lock))
+mutex_lock(&bc->lock);
+else if (!mutex_trylock(&bc->lock))
 goto out;
 goto restart;
 } else
 clear_btree_node_accessed(b);
 }

-mutex_unlock(&c->btree_cache_lock);
+mutex_unlock(&bc->lock);
 out:
 return (unsigned long) freed * btree_pages(c);
 }

-static unsigned long bch2_mca_count(struct shrinker *shrink,
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 struct shrink_control *sc)
 {
 struct bch_fs *c = container_of(shrink, struct bch_fs,
-btree_cache_shrink);
+btree_cache.shrink);
+struct btree_cache *bc = &c->btree_cache;

 if (btree_shrinker_disabled(c))
 return 0;

-return mca_can_free(c) * btree_pages(c);
+return btree_cache_can_free(bc) * btree_pages(c);
 }

-void bch2_fs_btree_exit(struct bch_fs *c)
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
+struct btree_cache *bc = &c->btree_cache;
 struct btree *b;
 unsigned i;

-if (c->btree_cache_shrink.list.next)
-unregister_shrinker(&c->btree_cache_shrink);
+if (bc->shrink.list.next)
+unregister_shrinker(&bc->shrink);

-mutex_lock(&c->btree_cache_lock);
+mutex_lock(&bc->lock);

 #ifdef CONFIG_BCACHEFS_DEBUG
 if (c->verify_data)
-list_move(&c->verify_data->list, &c->btree_cache);
+list_move(&c->verify_data->list, &bc->live);

 kvpfree(c->verify_ondisk, btree_bytes(c));
 #endif

 for (i = 0; i < BTREE_ID_NR; i++)
 if (c->btree_roots[i].b)
-list_add(&c->btree_roots[i].b->list, &c->btree_cache);
+list_add(&c->btree_roots[i].b->list, &bc->live);

-list_splice(&c->btree_cache_freeable,
-&c->btree_cache);
+list_splice(&bc->freeable, &bc->live);

-while (!list_empty(&c->btree_cache)) {
+while (!list_empty(&bc->live)) {
|
||||||
b = list_first_entry(&c->btree_cache, struct btree, list);
|
b = list_first_entry(&bc->live, struct btree, list);
|
||||||
|
|
||||||
if (btree_node_dirty(b))
|
if (btree_node_dirty(b))
|
||||||
bch2_btree_complete_write(c, b, btree_current_write(b));
|
bch2_btree_complete_write(c, b, btree_current_write(b));
|
||||||
clear_btree_node_dirty(b);
|
clear_btree_node_dirty(b);
|
||||||
|
|
||||||
mca_data_free(c, b);
|
btree_node_data_free(c, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
while (!list_empty(&c->btree_cache_freed)) {
|
while (!list_empty(&bc->freed)) {
|
||||||
b = list_first_entry(&c->btree_cache_freed,
|
b = list_first_entry(&bc->freed, struct btree, list);
|
||||||
struct btree, list);
|
|
||||||
list_del(&b->list);
|
list_del(&b->list);
|
||||||
kfree(b);
|
kfree(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_unlock(&c->btree_cache_lock);
|
mutex_unlock(&bc->lock);
|
||||||
|
|
||||||
if (c->btree_cache_table_init_done)
|
if (bc->table_init_done)
|
||||||
rhashtable_destroy(&c->btree_cache_table);
|
rhashtable_destroy(&bc->table);
|
||||||
}
|
}
|
||||||
|
|
||||||
int bch2_fs_btree_init(struct bch_fs *c)
|
int bch2_fs_btree_cache_init(struct bch_fs *c)
|
||||||
{
|
{
|
||||||
|
struct btree_cache *bc = &c->btree_cache;
|
||||||
unsigned i;
|
unsigned i;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
|
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
c->btree_cache_table_init_done = true;
|
bc->table_init_done = true;
|
||||||
|
|
||||||
bch2_recalc_btree_reserve(c);
|
bch2_recalc_btree_reserve(c);
|
||||||
|
|
||||||
for (i = 0; i < c->btree_cache_reserve; i++)
|
for (i = 0; i < bc->reserve; i++)
|
||||||
if (!mca_bucket_alloc(c, GFP_KERNEL))
|
if (!btree_node_mem_alloc(c, GFP_KERNEL))
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
list_splice_init(&c->btree_cache,
|
list_splice_init(&bc->live, &bc->freeable);
|
||||||
&c->btree_cache_freeable);
|
|
||||||
|
|
||||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||||
mutex_init(&c->verify_lock);
|
mutex_init(&c->verify_lock);
|
||||||
@ -376,42 +383,53 @@ int bch2_fs_btree_init(struct bch_fs *c)
|
|||||||
if (!c->verify_ondisk)
|
if (!c->verify_ondisk)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
|
c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
|
||||||
if (!c->verify_data)
|
if (!c->verify_data)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
list_del_init(&c->verify_data->list);
|
list_del_init(&c->verify_data->list);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
c->btree_cache_shrink.count_objects = bch2_mca_count;
|
bc->shrink.count_objects = bch2_btree_cache_count;
|
||||||
c->btree_cache_shrink.scan_objects = bch2_mca_scan;
|
bc->shrink.scan_objects = bch2_btree_cache_scan;
|
||||||
c->btree_cache_shrink.seeks = 4;
|
bc->shrink.seeks = 4;
|
||||||
c->btree_cache_shrink.batch = btree_pages(c) * 2;
|
bc->shrink.batch = btree_pages(c) * 2;
|
||||||
register_shrinker(&c->btree_cache_shrink);
|
register_shrinker(&bc->shrink);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
|
||||||
|
{
|
||||||
|
mutex_init(&bc->lock);
|
||||||
|
INIT_LIST_HEAD(&bc->live);
|
||||||
|
INIT_LIST_HEAD(&bc->freeable);
|
||||||
|
INIT_LIST_HEAD(&bc->freed);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We can only have one thread cannibalizing other cached btree nodes at a time,
|
* We can only have one thread cannibalizing other cached btree nodes at a time,
|
||||||
* or we'll deadlock. We use an open coded mutex to ensure that, which a
|
* or we'll deadlock. We use an open coded mutex to ensure that, which a
|
||||||
* cannibalize_bucket() will take. This means every time we unlock the root of
|
* cannibalize_bucket() will take. This means every time we unlock the root of
|
||||||
* the btree, we need to release this lock if we have it held.
|
* the btree, we need to release this lock if we have it held.
|
||||||
*/
|
*/
|
||||||
void bch2_btree_node_cannibalize_unlock(struct bch_fs *c)
|
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
|
||||||
{
|
{
|
||||||
if (c->btree_cache_alloc_lock == current) {
|
struct btree_cache *bc = &c->btree_cache;
|
||||||
|
|
||||||
|
if (bc->alloc_lock == current) {
|
||||||
trace_btree_node_cannibalize_unlock(c);
|
trace_btree_node_cannibalize_unlock(c);
|
||||||
c->btree_cache_alloc_lock = NULL;
|
bc->alloc_lock = NULL;
|
||||||
closure_wake_up(&c->mca_wait);
|
closure_wake_up(&bc->alloc_wait);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl)
|
int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
|
||||||
{
|
{
|
||||||
|
struct btree_cache *bc = &c->btree_cache;
|
||||||
struct task_struct *old;
|
struct task_struct *old;
|
||||||
|
|
||||||
old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
|
old = cmpxchg(&bc->alloc_lock, NULL, current);
|
||||||
if (old == NULL || old == current)
|
if (old == NULL || old == current)
|
||||||
goto success;
|
goto success;
|
||||||
|
|
||||||
@ -420,13 +438,13 @@ int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl)
|
|||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
closure_wait(&c->mca_wait, cl);
|
closure_wait(&bc->alloc_wait, cl);
|
||||||
|
|
||||||
/* Try again, after adding ourselves to waitlist */
|
/* Try again, after adding ourselves to waitlist */
|
||||||
old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
|
old = cmpxchg(&bc->alloc_lock, NULL, current);
|
||||||
if (old == NULL || old == current) {
|
if (old == NULL || old == current) {
|
||||||
/* We raced */
|
/* We raced */
|
||||||
closure_wake_up(&c->mca_wait);
|
closure_wake_up(&bc->alloc_wait);
|
||||||
goto success;
|
goto success;
|
||||||
}
|
}
|
||||||
|
|
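The hunks above keep the open-coded cannibalize lock but move its state into struct btree_cache. As a standalone illustration only (not code from this patch), the pattern behind bc->alloc_lock and bc->alloc_wait boils down to a single-owner lock built from cmpxchg() plus a closure waitlist; the helper name below is hypothetical:

static bool try_take_cannibalize_lock(struct btree_cache *bc, struct closure *cl)
{
	struct task_struct *old = cmpxchg(&bc->alloc_lock, NULL, current);

	if (old == NULL || old == current)
		return true;			/* acquired, or already held by us */

	closure_wait(&bc->alloc_wait, cl);	/* park on the waitlist */

	/* recheck after joining the waitlist so a racing unlock isn't lost: */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		closure_wake_up(&bc->alloc_wait);	/* we raced; wake the others */
		return true;
	}

	return false;		/* caller waits on cl and retries, as the real code does */
}

The real bch2_btree_cache_cannibalize_lock() additionally has the -ENOMEM path visible above for callers that cannot wait.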
@@ -438,16 +456,17 @@ success:
 	return 0;
 }
 
-static struct btree *mca_cannibalize(struct bch_fs *c)
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
 {
+	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 
-	list_for_each_entry_reverse(b, &c->btree_cache, list)
+	list_for_each_entry_reverse(b, &bc->live, list)
 		if (!btree_node_reclaim(c, b))
 			return b;
 
 	while (1) {
-		list_for_each_entry_reverse(b, &c->btree_cache, list)
+		list_for_each_entry_reverse(b, &bc->live, list)
 			if (!btree_node_write_and_reclaim(c, b))
 				return b;
 
@@ -462,16 +481,17 @@ static struct btree *mca_cannibalize(struct bch_fs *c)
 
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
 {
+	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 	u64 start_time = local_clock();
 
-	mutex_lock(&c->btree_cache_lock);
+	mutex_lock(&bc->lock);
 
 	/*
 	 * btree_free() doesn't free memory; it sticks the node on the end of
 	 * the list. Check if there's any freed nodes there:
 	 */
-	list_for_each_entry(b, &c->btree_cache_freeable, list)
+	list_for_each_entry(b, &bc->freeable, list)
 		if (!btree_node_reclaim(c, b))
 			goto out_unlock;
 
@@ -479,9 +499,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
 	 * We never free struct btree itself, just the memory that holds the on
 	 * disk node. Check the freed list before allocating a new one:
 	 */
-	list_for_each_entry(b, &c->btree_cache_freed, list)
+	list_for_each_entry(b, &bc->freed, list)
 		if (!btree_node_reclaim(c, b)) {
-			mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
+			btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
 			if (b->data)
 				goto out_unlock;
 
@@ -490,7 +510,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
 			goto err;
 		}
 
-	b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
+	b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
 	if (!b)
 		goto err;
 
@@ -501,7 +521,7 @@ out_unlock:
 	BUG_ON(btree_node_write_in_flight(b));
 
 	list_del_init(&b->list);
-	mutex_unlock(&c->btree_cache_lock);
+	mutex_unlock(&bc->lock);
out:
 	b->flags = 0;
 	b->written = 0;
@@ -517,18 +537,18 @@ out:
 	return b;
err:
 	/* Try to cannibalize another cached btree node: */
-	if (c->btree_cache_alloc_lock == current) {
-		b = mca_cannibalize(c);
+	if (bc->alloc_lock == current) {
+		b = btree_node_cannibalize(c);
 		list_del_init(&b->list);
-		mutex_unlock(&c->btree_cache_lock);
+		mutex_unlock(&bc->lock);
 
-		bch2_btree_node_hash_remove(c, b);
+		bch2_btree_node_hash_remove(bc, b);
 
 		trace_btree_node_cannibalize(c);
 		goto out;
 	}
 
-	mutex_unlock(&c->btree_cache_lock);
+	mutex_unlock(&bc->lock);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -539,6 +559,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
					  unsigned level,
					  enum six_lock_type lock_type)
 {
+	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 
 	/*
@@ -552,15 +573,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 		return b;
 
 	bkey_copy(&b->key, k);
-	if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
+	if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
 		/* raced with another fill: */
 
 		/* mark as unhashed... */
 		bkey_i_to_extent(&b->key)->v._data[0] = 0;
 
-		mutex_lock(&c->btree_cache_lock);
-		list_add(&b->list, &c->btree_cache_freeable);
-		mutex_unlock(&c->btree_cache_lock);
+		mutex_lock(&bc->lock);
+		list_add(&b->list, &bc->freeable);
+		mutex_unlock(&bc->lock);
 
 		six_unlock_write(&b->lock);
 		six_unlock_intent(&b->lock);
@@ -601,13 +622,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type)
 {
+	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 	struct bset_tree *t;
 
 	BUG_ON(level >= BTREE_MAX_DEPTH);
retry:
 	rcu_read_lock();
-	b = mca_find(c, k);
+	b = btree_cache_find(bc, k);
 	rcu_read_unlock();
 
 	if (unlikely(!b)) {
@@ -755,12 +777,13 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
 void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
			      unsigned level, enum btree_id btree_id)
 {
+	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 
 	BUG_ON(level >= BTREE_MAX_DEPTH);
 
 	rcu_read_lock();
-	b = mca_find(c, k);
+	b = btree_cache_find(bc, k);
 	rcu_read_unlock();
 
 	if (b)
@@ -771,15 +794,15 @@ void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
 		return;
 
 	bkey_copy(&b->key, k);
-	if (bch2_btree_node_hash_insert(c, b, level, btree_id)) {
+	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
 		/* raced with another fill: */
 
 		/* mark as unhashed... */
 		bkey_i_to_extent(&b->key)->v._data[0] = 0;
 
-		mutex_lock(&c->btree_cache_lock);
-		list_add(&b->list, &c->btree_cache_freeable);
-		mutex_unlock(&c->btree_cache_lock);
+		mutex_lock(&bc->lock);
+		list_add(&b->list, &bc->freeable);
+		mutex_unlock(&bc->lock);
 		goto out;
 	}
 
@@ -11,13 +11,13 @@ extern const char * const bch2_btree_ids[];
 
 void bch2_recalc_btree_reserve(struct bch_fs *);
 
-void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *);
-int __bch2_btree_node_hash_insert(struct bch_fs *, struct btree *);
-int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *,
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
				unsigned, enum btree_id);
 
-void bch2_btree_node_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
 
@@ -32,8 +32,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
 void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,
			      unsigned, enum btree_id);
 
-void bch2_fs_btree_exit(struct bch_fs *);
-int bch2_fs_btree_init(struct bch_fs *);
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
 
 #define PTR_HASH(_k)	(bkey_i_to_extent_c(_k)->v._data[0])
 
@@ -278,9 +278,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 {
 	struct bch_dev *ca;
 	struct open_bucket *ob;
+	const struct open_bucket_ptr *ptr;
 	size_t i, j, iter;
 	unsigned ci;
 
+	down_write(&c->alloc_gc_lock);
+
 	for_each_member_device(ca, c, ci) {
 		spin_lock(&ca->freelist_lock);
 
@@ -291,21 +294,26 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 			fifo_for_each_entry(i, &ca->free[j], iter)
				bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
 
+		for (ptr = ca->open_buckets_partial;
+		     ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr;
+		     ptr++)
+			bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
+
 		spin_unlock(&ca->freelist_lock);
 	}
 
 	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
-		const struct bch_extent_ptr *ptr;
-
-		mutex_lock(&ob->lock);
+		spin_lock(&ob->lock);
 		open_bucket_for_each_ptr(ob, ptr) {
-			ca = c->devs[ptr->dev];
-			bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true);
+			ca = c->devs[ptr->ptr.dev];
+			bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
 		}
-		mutex_unlock(&ob->lock);
+		spin_unlock(&ob->lock);
 	}
 
+	up_write(&c->alloc_gc_lock);
 }
 
 static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
@@ -1364,17 +1364,17 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 	closure_init_stack(&cl);
 
 	do {
-		ret = bch2_btree_node_cannibalize_lock(c, &cl);
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
 		closure_sync(&cl);
 	} while (ret);
 
 	b = bch2_btree_node_mem_alloc(c);
-	bch2_btree_node_cannibalize_unlock(c);
+	bch2_btree_cache_cannibalize_unlock(c);
 
 	BUG_ON(IS_ERR(b));
 
 	bkey_copy(&b->key, k);
-	BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
+	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
 
 	bch2_btree_node_read(c, b, true);
 	six_unlock_write(&b->lock);
@@ -1844,8 +1844,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
 	unsigned i;
 
 	rcu_read_lock();
-	tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
-				  &c->btree_cache_table);
+	tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+				  &c->btree_cache.table);
 
 	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(b, pos, tbl, i, hash)
@@ -769,7 +769,7 @@ retry_all:
 		closure_init_stack(&cl);
 
 		do {
-			ret = bch2_btree_node_cannibalize_lock(c, &cl);
+			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
 			closure_sync(&cl);
 		} while (ret);
 	}
@@ -817,7 +817,7 @@ retry:
 
 	ret = btree_iter_linked(iter) ? -EINTR : 0;
out:
-	bch2_btree_node_cannibalize_unlock(c);
+	bch2_btree_cache_cannibalize_unlock(c);
 	return ret;
io_error:
 	BUG_ON(ret != -EIO);
@@ -130,6 +130,42 @@ struct btree {
 #endif
 };
 
+struct btree_cache {
+	struct rhashtable	table;
+	bool			table_init_done;
+	/*
+	 * We never free a struct btree, except on shutdown - we just put it on
+	 * the btree_cache_freed list and reuse it later. This simplifies the
+	 * code, and it doesn't cost us much memory as the memory usage is
+	 * dominated by buffers that hold the actual btree node data and those
+	 * can be freed - and the number of struct btrees allocated is
+	 * effectively bounded.
+	 *
+	 * btree_cache_freeable effectively is a small cache - we use it because
+	 * high order page allocations can be rather expensive, and it's quite
+	 * common to delete and allocate btree nodes in quick succession. It
+	 * should never grow past ~2-3 nodes in practice.
+	 */
+	struct mutex		lock;
+	struct list_head	live;
+	struct list_head	freeable;
+	struct list_head	freed;
+
+	/* Number of elements in live + freeable lists */
+	unsigned		used;
+	unsigned		reserve;
+	struct shrinker		shrink;
+
+	/*
+	 * If we need to allocate memory for a new btree node and that
+	 * allocation fails, we can cannibalize another node in the btree cache
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
+	 */
+	struct task_struct	*alloc_lock;
+	struct closure_waitlist	alloc_wait;
+};
+
 #define BTREE_FLAG(flag)						\
 static inline bool btree_node_ ## flag(struct btree *b)		\
 {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
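One detail worth noting about the shrinker hunks earlier in this diff: because the shrinker is now a member of struct btree_cache, which is itself embedded in struct bch_fs, container_of() still recovers the filesystem in a single step by naming the nested member path. A small sketch (the helper names are hypothetical, added here only for illustration):

/* container_of() works through nested members: shrink lives at
 * offsetof(struct bch_fs, btree_cache.shrink), so subtracting that offset
 * from the shrinker pointer yields the enclosing bch_fs directly. */
static inline struct bch_fs *btree_cache_shrinker_to_fs(struct shrinker *shrink)
{
	return container_of(shrink, struct bch_fs, btree_cache.shrink);
}

/* Equivalently, the enclosing btree_cache alone can be recovered: */
static inline struct btree_cache *btree_cache_from_shrinker(struct shrinker *shrink)
{
	return container_of(shrink, struct btree_cache, shrink);
}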
@@ -237,11 +237,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
 
 	six_lock_write(&b->lock);
 
-	bch2_btree_node_hash_remove(c, b);
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
 
-	mutex_lock(&c->btree_cache_lock);
-	list_move(&b->list, &c->btree_cache_freeable);
-	mutex_unlock(&c->btree_cache_lock);
+	mutex_lock(&c->btree_cache.lock);
+	list_move(&b->list, &c->btree_cache.freeable);
+	mutex_unlock(&c->btree_cache.lock);
 
 	/*
 	 * By using six_unlock_write() directly instead of
@@ -339,11 +339,11 @@ retry:
 	bkey_extent_init(&tmp.k);
 	tmp.k.k.size = c->opts.btree_node_size,
 
-	ob = bch2_alloc_sectors(c, &c->btree_write_point,
+	ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0,
				bkey_i_to_extent(&tmp.k),
				res->nr_replicas,
				c->opts.metadata_replicas_required,
-				alloc_reserve, cl);
+				alloc_reserve, 0, cl);
 	if (IS_ERR(ob))
 		return ERR_CAST(ob);
 
@@ -374,7 +374,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 
 	b = as->reserve->b[--as->reserve->nr];
 
-	BUG_ON(bch2_btree_node_hash_insert(c, b, level, as->btree_id));
+	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
 
 	set_btree_node_accessed(b);
 	set_btree_node_dirty(b);
@@ -515,7 +515,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
 	 * Protects reaping from the btree node cache and using the btree node
 	 * open bucket reserve:
 	 */
-	ret = bch2_btree_node_cannibalize_lock(c, cl);
+	ret = bch2_btree_cache_cannibalize_lock(c, cl);
 	if (ret) {
 		bch2_disk_reservation_put(c, &disk_res);
 		return ERR_PTR(ret);
@@ -543,11 +543,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
 		reserve->b[reserve->nr++] = b;
 	}
 
-	bch2_btree_node_cannibalize_unlock(c);
+	bch2_btree_cache_cannibalize_unlock(c);
 	return reserve;
err_free:
 	bch2_btree_reserve_put(c, reserve);
-	bch2_btree_node_cannibalize_unlock(c);
+	bch2_btree_cache_cannibalize_unlock(c);
 	trace_btree_reserve_get_fail(c, nr_nodes, cl);
 	return ERR_PTR(ret);
 }
@@ -1015,9 +1015,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 {
 	/* Root nodes cannot be reaped */
-	mutex_lock(&c->btree_cache_lock);
+	mutex_lock(&c->btree_cache.lock);
 	list_del_init(&b->list);
-	mutex_unlock(&c->btree_cache_lock);
+	mutex_unlock(&c->btree_cache.lock);
 
 	mutex_lock(&c->btree_root_lock);
 	btree_node_root(c, b) = b;
@@ -1802,7 +1802,7 @@ retry:
	    PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
 		/* bch2_btree_reserve_get will unlock */
 		do {
-			ret = bch2_btree_node_cannibalize_lock(c, &cl);
+			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
 			closure_sync(&cl);
 		} while (ret == -EAGAIN);
 
@@ -1873,23 +1873,24 @@ retry:
 	if (parent) {
 		if (new_hash) {
 			bkey_copy(&new_hash->key, &new_key->k_i);
-			BUG_ON(bch2_btree_node_hash_insert(c, new_hash,
-							   b->level, b->btree_id));
+			ret = bch2_btree_node_hash_insert(&c->btree_cache,
					new_hash, b->level, b->btree_id);
+			BUG_ON(ret);
 		}
 
 		bch2_btree_insert_node(as, parent, &iter,
				       &keylist_single(&new_key->k_i));
 
 		if (new_hash) {
-			mutex_lock(&c->btree_cache_lock);
-			bch2_btree_node_hash_remove(c, new_hash);
+			mutex_lock(&c->btree_cache.lock);
+			bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
 
-			bch2_btree_node_hash_remove(c, b);
+			bch2_btree_node_hash_remove(&c->btree_cache, b);
 
 			bkey_copy(&b->key, &new_key->k_i);
-			ret = __bch2_btree_node_hash_insert(c, b);
+			ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
 			BUG_ON(ret);
-			mutex_unlock(&c->btree_cache_lock);
+			mutex_unlock(&c->btree_cache.lock);
 		} else {
 			bkey_copy(&b->key, &new_key->k_i);
 		}
@@ -1918,9 +1919,9 @@ retry:
 	bch2_btree_update_done(as);
out:
 	if (new_hash) {
-		mutex_lock(&c->btree_cache_lock);
-		list_move(&new_hash->list, &c->btree_cache_freeable);
-		mutex_unlock(&c->btree_cache_lock);
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&new_hash->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
 
 		six_unlock_write(&new_hash->lock);
 		six_unlock_intent(&new_hash->lock);
@@ -407,8 +407,11 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
 
 static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
 {
-	return sectors * crc_compressed_size(NULL, crc) /
-		crc_uncompressed_size(NULL, crc);
+	if (!sectors)
+		return 0;
+
+	return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
				    crc_uncompressed_size(NULL, crc)));
 }
 
 /*
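The __disk_sectors() change just above fixes an integer-truncation problem: with plain division, a small live range of a compressed extent could be accounted as occupying zero disk sectors. A worked example with made-up sizes (illustration only, not values from the patch):

/* Hypothetical extent: 128 sectors of data compressed down to 32 sectors,
 * of which only 3 sectors are still live after partial overwrites. */
unsigned sectors = 3, compressed = 32, uncompressed = 128;

unsigned before = sectors * compressed / uncompressed;		/* 96 / 128 truncates to 0 */
unsigned after  = max(1U, DIV_ROUND_UP(sectors * compressed,
				       uncompressed));		/* ceil(96 / 128) == 1 */

Rounding up and clamping to at least one sector keeps a still-referenced compressed extent from being accounted as taking no space at all.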
@@ -25,7 +25,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
 {
 	void *b;
 
-	BUG_ON(size > c->sb.encoded_extent_max);
+	BUG_ON(size > c->sb.encoded_extent_max << 9);
 
 	b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
 	if (b)
@@ -164,8 +164,8 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 		}
 		break;
 	case BCH_COMPRESSION_LZ4:
-		ret = LZ4_decompress_safe(src_data.b, dst_data,
-					  src_len, dst_len);
+		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
						  src_len, dst_len, dst_len);
 		if (ret != dst_len) {
 			ret = -EIO;
 			goto err;
@@ -269,7 +269,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
 	size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
 	int ret = -ENOMEM;
 
-	if (crc_uncompressed_size(NULL, &crc) < c->sb.encoded_extent_max)
+	if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
+	    crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
 		return -EIO;
 
 	dst_data = dst_len == dst_iter.bi_size
@@ -294,7 +295,7 @@ static int __bio_compress(struct bch_fs *c,
 {
 	struct bbuf src_data = { NULL }, dst_data = { NULL };
 	unsigned pad;
-	int ret;
+	int ret = 0;
 
 	dst_data = bio_map_or_bounce(c, dst, WRITE);
 	src_data = bio_map_or_bounce(c, src, READ);
@@ -307,23 +308,28 @@ static int __bio_compress(struct bch_fs *c,
 		void *workspace;
 		int len = src->bi_iter.bi_size;
 
-		ret = 0;
-
 		workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
 
-		while (len > block_bytes(c) &&
-		       (!(ret = LZ4_compress_destSize(
+		while (1) {
+			if (len <= block_bytes(c)) {
+				ret = 0;
+				break;
+			}
+
+			ret = LZ4_compress_destSize(
				src_data.b, dst_data.b,
				&len, dst->bi_iter.bi_size,
-				workspace)) ||
-			(len & (block_bytes(c) - 1)))) {
-			/*
-			 * On error, the compressed data was bigger than
-			 * dst_len - round down to nearest block and try again:
-			 */
+				workspace);
+			if (ret >= len) {
+				/* uncompressible: */
+				ret = 0;
+				break;
+			}
+
+			if (!(len & (block_bytes(c) - 1)))
+				break;
 			len = round_down(len, block_bytes(c));
 		}
 
 		mempool_free(workspace, &c->lz4_workspace_pool);
 
 		if (!ret)
@@ -331,6 +337,7 @@ static int __bio_compress(struct bch_fs *c,
 
 		*src_len = len;
 		*dst_len = ret;
+		ret = 0;
 		break;
 	}
 	case BCH_COMPRESSION_GZIP: {
@@ -446,20 +453,22 @@ int bch2_check_set_has_compressed_data(struct bch_fs *c,
				       unsigned compression_type)
 {
 	switch (compression_type) {
-	case BCH_COMPRESSION_NONE:
+	case BCH_COMPRESSION_OPT_NONE:
 		return 0;
-	case BCH_COMPRESSION_LZ4:
+	case BCH_COMPRESSION_OPT_LZ4:
 		if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
 			return 0;
 
 		bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
 		break;
-	case BCH_COMPRESSION_GZIP:
+	case BCH_COMPRESSION_OPT_GZIP:
 		if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
 			return 0;
 
 		bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
 		break;
+	default:
+		BUG();
 	}
 
 	return bch2_fs_compress_init(c);
@@ -511,19 +511,19 @@ static void extent_pick_read_device(struct bch_fs *c,
 		struct bch_dev *ca = c->devs[ptr->dev];
 
 		if (ptr->cached && ptr_stale(ca, ptr))
-			return;
+			continue;
 
 		if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-			return;
+			continue;
 
 		if (avoid && test_bit(ca->dev_idx, avoid->d))
-			return;
+			continue;
 
 		if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
-			return;
+			continue;
 
 		if (!percpu_ref_tryget(&ca->io_ref))
-			return;
+			continue;
 
 		if (pick->ca)
			percpu_ref_put(&pick->ca->io_ref);
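The extent_pick_read_device() hunk above is a behavioural fix rather than a rename: the function walks an extent's pointers looking for the best device to read from, and bailing out with return on the first stale, failed or avoided pointer also skipped every remaining pointer. A simplified sketch of the loop shape, assuming the usual pointer-iteration macro (the hunk only shows the loop body, so the surrounding loop here is illustrative):

extent_for_each_ptr(e, ptr) {
	struct bch_dev *ca = c->devs[ptr->dev];

	if (ptr->cached && ptr_stale(ca, ptr))
		continue;	/* previously: return - abandoned the whole search */

	if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
		continue;	/* skip just this device, keep looking */

	/* ...otherwise this pointer remains a candidate for *pick... */
}

With continue, a single bad replica no longer prevents reading from the remaining good ones.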
@ -974,7 +974,8 @@ alloc_io:
|
|||||||
(struct disk_reservation) {
|
(struct disk_reservation) {
|
||||||
.nr_replicas = c->opts.data_replicas,
|
.nr_replicas = c->opts.data_replicas,
|
||||||
},
|
},
|
||||||
foreground_write_point(c, inode->ei_last_dirtied),
|
c->fastest_devs,
|
||||||
|
inode->ei_last_dirtied,
|
||||||
POS(inum, 0),
|
POS(inum, 0),
|
||||||
&inode->ei_journal_seq,
|
&inode->ei_journal_seq,
|
||||||
BCH_WRITE_THROTTLE);
|
BCH_WRITE_THROTTLE);
|
||||||
@ -1545,10 +1546,11 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
|
|||||||
dio->iop.is_dio = true;
|
dio->iop.is_dio = true;
|
||||||
dio->iop.new_i_size = U64_MAX;
|
dio->iop.new_i_size = U64_MAX;
|
||||||
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
|
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
|
||||||
foreground_write_point(dio->c, (unsigned long) current),
|
dio->c->fastest_devs,
|
||||||
POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
|
(unsigned long) dio->task,
|
||||||
&inode->ei_journal_seq,
|
POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
|
||||||
flags|BCH_WRITE_THROTTLE);
|
&inode->ei_journal_seq,
|
||||||
|
flags|BCH_WRITE_THROTTLE);
|
||||||
dio->iop.op.index_update_fn = bchfs_write_index_update;
|
dio->iop.op.index_update_fn = bchfs_write_index_update;
|
||||||
|
|
||||||
dio->res.sectors -= bio_sectors(bio);
|
dio->res.sectors -= bio_sectors(bio);
|
||||||
@ -1568,13 +1570,13 @@ static void bch2_dio_write_loop_async(struct closure *cl)
|
|||||||
bch2_dio_write_done(dio);
|
bch2_dio_write_done(dio);
|
||||||
|
|
||||||
if (dio->iter.count && !dio->error) {
|
if (dio->iter.count && !dio->error) {
|
||||||
use_mm(dio->mm);
|
use_mm(dio->task->mm);
|
||||||
pagecache_block_get(&mapping->add_lock);
|
pagecache_block_get(&mapping->add_lock);
|
||||||
|
|
||||||
bch2_do_direct_IO_write(dio);
|
bch2_do_direct_IO_write(dio);
|
||||||
|
|
||||||
pagecache_block_put(&mapping->add_lock);
|
pagecache_block_put(&mapping->add_lock);
|
||||||
unuse_mm(dio->mm);
|
unuse_mm(dio->task->mm);
|
||||||
|
|
||||||
continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
|
continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
|
||||||
} else {
|
} else {
|
||||||
@ -1617,7 +1619,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
|
|||||||
dio->offset = offset;
|
dio->offset = offset;
|
||||||
dio->iovec = NULL;
|
dio->iovec = NULL;
|
||||||
dio->iter = *iter;
|
dio->iter = *iter;
|
||||||
dio->mm = current->mm;
|
dio->task = current;
|
||||||
closure_init(&dio->cl, NULL);
|
closure_init(&dio->cl, NULL);
|
||||||
|
|
||||||
if (offset + iter->count > inode->v.i_size)
|
if (offset + iter->count > inode->v.i_size)
|
||||||
|
@ -74,7 +74,7 @@ struct dio_write {
|
|||||||
struct iovec inline_vecs[UIO_FASTIOV];
|
struct iovec inline_vecs[UIO_FASTIOV];
|
||||||
struct iov_iter iter;
|
struct iov_iter iter;
|
||||||
|
|
||||||
struct mm_struct *mm;
|
struct task_struct *task;
|
||||||
|
|
||||||
/* must be last: */
|
/* must be last: */
|
||||||
struct bchfs_write_op iop;
|
struct bchfs_write_op iop;
|
||||||
|
@ -350,7 +350,7 @@ static void init_append_extent(struct bch_write_op *op,
|
|||||||
bch2_keylist_push(&op->insert_keys);
|
bch2_keylist_push(&op->insert_keys);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
|
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
|
||||||
{
|
{
|
||||||
struct bch_fs *c = op->c;
|
struct bch_fs *c = op->c;
|
||||||
struct bio *orig = &op->wbio.bio;
|
struct bio *orig = &op->wbio.bio;
|
||||||
@ -371,7 +371,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
|
|||||||
/* Need to decompress data? */
|
/* Need to decompress data? */
|
||||||
if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
|
if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
|
||||||
(crc_uncompressed_size(NULL, &op->crc) != op->size ||
|
(crc_uncompressed_size(NULL, &op->crc) != op->size ||
|
||||||
crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
|
crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
|
ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
|
||||||
@ -389,7 +389,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
|
|||||||
op->crc.nonce,
|
op->crc.nonce,
|
||||||
op->crc.csum,
|
op->crc.csum,
|
||||||
op->crc.csum_type,
|
op->crc.csum_type,
|
||||||
ob);
|
wp->ob);
|
||||||
|
|
||||||
bio = orig;
|
bio = orig;
|
||||||
wbio = wbio_init(bio);
|
wbio = wbio_init(bio);
|
||||||
@ -398,7 +398,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
|
|||||||
compression_type != BCH_COMPRESSION_NONE) {
|
compression_type != BCH_COMPRESSION_NONE) {
|
||||||
/* all units here in bytes */
|
/* all units here in bytes */
|
||||||
unsigned total_output = 0, output_available =
|
unsigned total_output = 0, output_available =
|
||||||
min(ob->sectors_free << 9, orig->bi_iter.bi_size);
|
min(wp->sectors_free << 9, orig->bi_iter.bi_size);
|
||||||
unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
|
unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
|
||||||
? op->nonce : 0;
|
? op->nonce : 0;
|
||||||
struct bch_csum csum;
|
struct bch_csum csum;
|
||||||
@ -441,7 +441,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
|
|||||||
init_append_extent(op,
|
init_append_extent(op,
|
||||||
dst_len >> 9, src_len >> 9,
|
dst_len >> 9, src_len >> 9,
|
||||||
fragment_compression_type,
|
fragment_compression_type,
|
||||||
crc_nonce, csum, csum_type, ob);
|
crc_nonce, csum, csum_type, wp->ob);
|
||||||
|
|
||||||
total_output += dst_len;
|
total_output += dst_len;
|
||||||
bio_advance(bio, dst_len);
|
bio_advance(bio, dst_len);
|
||||||
@ -468,14 +468,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
|
|||||||
|
|
||||||
more = orig->bi_iter.bi_size != 0;
|
more = orig->bi_iter.bi_size != 0;
|
||||||
} else {
|
} else {
|
||||||
bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
|
bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
|
||||||
&c->bio_write);
|
&c->bio_write);
|
||||||
wbio = wbio_init(bio);
|
wbio = wbio_init(bio);
|
||||||
wbio->put_bio = bio != orig;
|
wbio->put_bio = bio != orig;
|
||||||
|
|
||||||
init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
|
init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
|
||||||
compression_type, 0,
|
compression_type, 0,
|
||||||
(struct bch_csum) { 0 }, csum_type, ob);
|
(struct bch_csum) { 0 }, csum_type, wp->ob);
|
||||||
|
|
||||||
more = bio != orig;
|
more = bio != orig;
|
||||||
}
|
}
|
||||||
@ -505,7 +505,8 @@ static void __bch2_write(struct closure *cl)
|
|||||||
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
||||||
struct bch_fs *c = op->c;
|
struct bch_fs *c = op->c;
|
||||||
unsigned open_bucket_nr = 0;
|
unsigned open_bucket_nr = 0;
|
||||||
struct open_bucket *b;
|
struct write_point *wp;
|
||||||
|
struct open_bucket *ob;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
@ -519,16 +520,19 @@ static void __bch2_write(struct closure *cl)
|
|||||||
BKEY_EXTENT_U64s_MAX))
|
BKEY_EXTENT_U64s_MAX))
|
||||||
continue_at(cl, bch2_write_index, index_update_wq(op));
|
continue_at(cl, bch2_write_index, index_update_wq(op));
|
||||||
|
|
||||||
b = bch2_alloc_sectors_start(c, op->wp,
|
wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
|
||||||
|
op->devs,
|
||||||
|
op->write_point,
|
||||||
op->nr_replicas,
|
op->nr_replicas,
|
||||||
c->opts.data_replicas_required,
|
c->opts.data_replicas_required,
|
||||||
op->alloc_reserve,
|
op->alloc_reserve,
|
||||||
|
op->flags,
|
||||||
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
|
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
|
||||||
EBUG_ON(!b);
|
EBUG_ON(!wp);
|
||||||
|
|
||||||
if (unlikely(IS_ERR(b))) {
|
if (unlikely(IS_ERR(wp))) {
|
||||||
if (unlikely(PTR_ERR(b) != -EAGAIN)) {
|
if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
|
||||||
ret = PTR_ERR(b);
|
ret = PTR_ERR(wp);
|
||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -561,13 +565,15 @@ static void __bch2_write(struct closure *cl)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
BUG_ON(b - c->open_buckets == 0 ||
|
ob = wp->ob;
|
||||||
b - c->open_buckets > U8_MAX);
|
|
||||||
op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
|
|
||||||
|
|
||||||
ret = bch2_write_extent(op, b);
|
BUG_ON(ob - c->open_buckets == 0 ||
|
||||||
|
ob - c->open_buckets > U8_MAX);
|
||||||
|
op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
|
||||||
|
|
||||||
bch2_alloc_sectors_done(c, op->wp, b);
|
ret = bch2_write_extent(op, wp);
|
||||||
|
|
||||||
|
bch2_alloc_sectors_done(c, wp);
|
||||||
|
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
goto err;
|
goto err;
|
||||||
@ -704,7 +710,9 @@ void bch2_write(struct closure *cl)
|
|||||||
|
|
||||||
void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
||||||
struct disk_reservation res,
|
struct disk_reservation res,
|
||||||
struct write_point *wp, struct bpos pos,
|
struct bch_devs_mask *devs,
|
||||||
|
unsigned long write_point,
|
||||||
|
struct bpos pos,
|
||||||
u64 *journal_seq, unsigned flags)
|
u64 *journal_seq, unsigned flags)
|
||||||
{
|
{
|
||||||
EBUG_ON(res.sectors && !res.nr_replicas);
|
EBUG_ON(res.sectors && !res.nr_replicas);
|
||||||
@ -723,7 +731,8 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
|||||||
op->pos = pos;
|
op->pos = pos;
|
||||||
op->version = ZERO_VERSION;
|
op->version = ZERO_VERSION;
|
||||||
op->res = res;
|
op->res = res;
|
||||||
op->wp = wp;
|
op->devs = devs;
|
||||||
|
op->write_point = write_point;
|
||||||
|
|
||||||
if (journal_seq) {
|
if (journal_seq) {
|
||||||
op->journal_seq_p = journal_seq;
|
op->journal_seq_p = journal_seq;
|
||||||
@ -826,6 +835,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
|
|||||||
* Adjust bio to correspond to _live_ portion of @k -
|
* Adjust bio to correspond to _live_ portion of @k -
|
||||||
* which might be less than what we're actually reading:
|
* which might be less than what we're actually reading:
|
||||||
*/
|
*/
|
||||||
|
bio->bi_iter.bi_size = sectors << 9;
|
||||||
bio_advance(bio, pick->crc.offset << 9);
|
bio_advance(bio, pick->crc.offset << 9);
|
||||||
BUG_ON(bio_sectors(bio) < k.k->size);
|
BUG_ON(bio_sectors(bio) < k.k->size);
|
||||||
bio->bi_iter.bi_size = k.k->size << 9;
|
bio->bi_iter.bi_size = k.k->size << 9;
|
||||||
@ -836,7 +846,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
|
|||||||
*/
|
*/
|
||||||
op->write.op.pos.offset = iter.bi_sector;
|
op->write.op.pos.offset = iter.bi_sector;
|
||||||
}
|
}
|
||||||
bch2_migrate_write_init(c, &op->write, &c->promote_write_point,
|
bch2_migrate_write_init(c, &op->write,
|
||||||
|
c->fastest_devs,
|
||||||
k, NULL,
|
k, NULL,
|
||||||
BCH_WRITE_ALLOC_NOWAIT|
|
BCH_WRITE_ALLOC_NOWAIT|
|
||||||
BCH_WRITE_CACHED);
|
BCH_WRITE_CACHED);
|
||||||
|
@ -22,11 +22,12 @@ enum bch_write_flags {
|
|||||||
BCH_WRITE_FLUSH = (1 << 2),
|
BCH_WRITE_FLUSH = (1 << 2),
|
||||||
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
|
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
|
||||||
BCH_WRITE_THROTTLE = (1 << 4),
|
BCH_WRITE_THROTTLE = (1 << 4),
|
||||||
|
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
|
||||||
|
|
||||||
/* Internal: */
|
/* Internal: */
|
||||||
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 5),
|
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
|
||||||
BCH_WRITE_DONE = (1 << 6),
|
BCH_WRITE_DONE = (1 << 7),
|
||||||
BCH_WRITE_LOOPED = (1 << 7),
|
BCH_WRITE_LOOPED = (1 << 8),
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
||||||
@ -35,15 +36,10 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
|
|||||||
? op->journal_seq_p : &op->journal_seq;
|
? op->journal_seq_p : &op->journal_seq;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct write_point *foreground_write_point(struct bch_fs *c,
|
|
||||||
unsigned long v)
|
|
||||||
{
|
|
||||||
return c->write_points +
|
|
||||||
hash_long(v, ilog2(ARRAY_SIZE(c->write_points)));
|
|
||||||
}
|
|
||||||
|
|
||||||
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
|
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
|
||||||
struct disk_reservation, struct write_point *,
|
struct disk_reservation,
|
||||||
|
struct bch_devs_mask *,
|
||||||
|
unsigned long,
|
||||||
struct bpos, u64 *, unsigned);
|
struct bpos, u64 *, unsigned);
|
||||||
void bch2_write(struct closure *);
|
void bch2_write(struct closure *);
|
||||||
|
|
||||||
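Taken together, the io.h hunks above drop the static foreground_write_point() hashing helper: callers no longer pass a struct write_point * but a device mask plus an opaque unsigned long write-point token, and the allocator resolves the token internally. A hedged caller sketch of the new bch2_write_op_init() shape (the surrounding variables res, inum, offset and journal_seq are assumed to exist in the caller; NULL for devs means any device, and the current task pointer is used as the token, as in the direct-IO path above):

bch2_write_op_init(&op, c, res,
		   NULL,			/* struct bch_devs_mask *devs */
		   (unsigned long) current,	/* opaque write point token */
		   POS(inum, offset >> 9),
		   &journal_seq,
		   BCH_WRITE_THROTTLE);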
|
@ -116,9 +116,10 @@ struct bch_write_op {
|
|||||||
struct bch_extent_crc128 crc;
|
struct bch_extent_crc128 crc;
|
||||||
unsigned size;
|
unsigned size;
|
||||||
|
|
||||||
struct disk_reservation res;
|
struct bch_devs_mask *devs;
|
||||||
|
unsigned long write_point;
|
||||||
|
|
||||||
struct write_point *wp;
|
struct disk_reservation res;
|
||||||
|
|
||||||
union {
|
union {
|
||||||
u8 open_buckets[16];
|
u8 open_buckets[16];
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
static int issue_migration_move(struct bch_dev *ca,
|
static int issue_migration_move(struct bch_dev *ca,
|
||||||
struct moving_context *ctxt,
|
struct moving_context *ctxt,
|
||||||
|
struct bch_devs_mask *devs,
|
||||||
struct bkey_s_c k)
|
struct bkey_s_c k)
|
||||||
{
|
{
|
||||||
struct bch_fs *c = ca->fs;
|
struct bch_fs *c = ca->fs;
|
||||||
@ -33,7 +34,7 @@ static int issue_migration_move(struct bch_dev *ca,
|
|||||||
found:
|
found:
|
||||||
/* XXX: we need to be doing something with the disk reservation */
|
/* XXX: we need to be doing something with the disk reservation */
|
||||||
|
|
||||||
ret = bch2_data_move(c, ctxt, &c->migration_write_point, k, ptr);
|
ret = bch2_data_move(c, ctxt, devs, k, ptr);
|
||||||
if (ret)
|
if (ret)
|
||||||
bch2_disk_reservation_put(c, &res);
|
bch2_disk_reservation_put(c, &res);
|
||||||
return ret;
|
return ret;
|
||||||
@ -110,7 +111,7 @@ int bch2_move_data_off_device(struct bch_dev *ca)
|
|||||||
ca->dev_idx))
|
ca->dev_idx))
|
||||||
goto next;
|
goto next;
|
||||||
|
|
||||||
ret = issue_migration_move(ca, &ctxt, k);
|
ret = issue_migration_move(ca, &ctxt, NULL, k);
|
||||||
if (ret == -ENOMEM) {
|
if (ret == -ENOMEM) {
|
||||||
bch2_btree_iter_unlock(&iter);
|
bch2_btree_iter_unlock(&iter);
|
||||||
|
|
||||||
|
@ -139,7 +139,7 @@ out:
|
|||||||
|
|
||||||
void bch2_migrate_write_init(struct bch_fs *c,
|
void bch2_migrate_write_init(struct bch_fs *c,
|
||||||
struct migrate_write *m,
|
struct migrate_write *m,
|
||||||
struct write_point *wp,
|
struct bch_devs_mask *devs,
|
||||||
struct bkey_s_c k,
|
struct bkey_s_c k,
|
||||||
const struct bch_extent_ptr *move_ptr,
|
const struct bch_extent_ptr *move_ptr,
|
||||||
unsigned flags)
|
unsigned flags)
|
||||||
@ -155,8 +155,10 @@ void bch2_migrate_write_init(struct bch_fs *c,
|
|||||||
(move_ptr && move_ptr->cached))
|
(move_ptr && move_ptr->cached))
|
||||||
flags |= BCH_WRITE_CACHED;
|
flags |= BCH_WRITE_CACHED;
|
||||||
|
|
||||||
bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, wp,
|
bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 },
|
||||||
bkey_start_pos(k.k), NULL, flags);
|
devs, (unsigned long) current,
|
||||||
|
bkey_start_pos(k.k), NULL,
|
||||||
|
flags|BCH_WRITE_ONLY_SPECIFIED_DEVS);
|
||||||
|
|
||||||
if (m->move)
|
if (m->move)
|
||||||
m->op.alloc_reserve = RESERVE_MOVINGGC;
|
m->op.alloc_reserve = RESERVE_MOVINGGC;
|
||||||
@ -249,7 +251,7 @@ static void read_moving_endio(struct bio *bio)
|
|||||||
|
|
||||||
int bch2_data_move(struct bch_fs *c,
|
int bch2_data_move(struct bch_fs *c,
|
||||||
struct moving_context *ctxt,
|
struct moving_context *ctxt,
|
||||||
struct write_point *wp,
|
struct bch_devs_mask *devs,
|
||||||
struct bkey_s_c k,
|
struct bkey_s_c k,
|
||||||
const struct bch_extent_ptr *move_ptr)
|
const struct bch_extent_ptr *move_ptr)
|
||||||
{
|
{
|
||||||
@ -280,7 +282,7 @@ int bch2_data_move(struct bch_fs *c,
|
|||||||
|
|
||||||
migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
|
migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
|
||||||
|
|
||||||
bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
|
bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0);
|
||||||
|
|
||||||
trace_move_read(&io->write.key.k);
|
trace_move_read(&io->write.key.k);
|
||||||
|
|
||||||
|
@ -20,12 +20,9 @@ struct migrate_write {
|
|||||||
struct bch_write_op op;
|
struct bch_write_op op;
|
||||||
};
|
};
|
||||||
|
|
||||||
void bch2_migrate_write_init(struct bch_fs *,
|
void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
|
||||||
struct migrate_write *,
|
struct bch_devs_mask *, struct bkey_s_c,
|
||||||
struct write_point *,
|
const struct bch_extent_ptr *, unsigned);
|
||||||
struct bkey_s_c,
|
|
||||||
const struct bch_extent_ptr *,
|
|
||||||
unsigned);
|
|
||||||
|
|
||||||
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
|
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
|
||||||
|
|
||||||
@@ -69,11 +66,9 @@ struct moving_io {
	struct bio_vec		bi_inline_vecs[0];
 };
 
-int bch2_data_move(struct bch_fs *,
-		   struct moving_context *,
-		   struct write_point *,
-		   struct bkey_s_c,
-		   const struct bch_extent_ptr *);
+int bch2_data_move(struct bch_fs *, struct moving_context *,
+		   struct bch_devs_mask *, struct bkey_s_c,
+		   const struct bch_extent_ptr *);
 
 int bch2_move_ctxt_wait(struct moving_context *);
 void bch2_move_ctxt_wait_for_io(struct moving_context *);
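The two prototypes above drop the write_point argument in favour of a device mask. As a rough caller-side sketch only (inferred from the signatures in this diff; c, ctxt, ca, k, move_ptr and ret stand in for whatever the real caller already has, and are not taken from the commit):

	/* Sketch: build a one-device mask the same way bch2_dev_alloc builds
	 * ca->self elsewhere in this commit, then pass it where a write point
	 * used to go. */
	struct bch_devs_mask devs;

	memset(&devs, 0, sizeof(devs));
	__set_bit(ca->dev_idx, devs.d);		/* .d bitmap, as in ca->self */

	ret = bch2_data_move(c, ctxt, &devs, k, move_ptr);

Passing NULL for the mask presumably means "any device", as in the issue_migration_move() change at the top of this section.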
@@ -14,6 +14,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "movinggc.h"
+#include "super-io.h"
 
 #include <trace/events/bcachefs.h>
 #include <linux/freezer.h>
@@ -72,7 +73,7 @@ static int issue_moving_gc_move(struct bch_dev *ca,
	if (!ptr) /* We raced - bucket's been reused */
		return 0;
 
-	ret = bch2_data_move(c, ctxt, &ca->copygc_write_point, k, ptr);
+	ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
	if (!ret)
		trace_gc_copy(k.k);
	else
@@ -376,7 +376,7 @@ err:
 static void bch2_fs_free(struct bch_fs *c)
 {
	bch2_fs_encryption_exit(c);
-	bch2_fs_btree_exit(c);
+	bch2_fs_btree_cache_exit(c);
	bch2_fs_journal_exit(&c->journal);
	bch2_io_clock_exit(&c->io_clock[WRITE]);
	bch2_io_clock_exit(&c->io_clock[READ]);
@@ -491,7 +491,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
	mutex_init(&c->state_lock);
	mutex_init(&c->sb_lock);
	mutex_init(&c->replicas_gc_lock);
-	mutex_init(&c->btree_cache_lock);
	mutex_init(&c->bucket_lock);
	mutex_init(&c->btree_root_lock);
	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
@@ -507,9 +506,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
	bch2_fs_tiering_init(c);
 
	INIT_LIST_HEAD(&c->list);
-	INIT_LIST_HEAD(&c->btree_cache);
-	INIT_LIST_HEAD(&c->btree_cache_freeable);
-	INIT_LIST_HEAD(&c->btree_cache_freed);
 
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	mutex_init(&c->btree_reserve_cache_lock);
@@ -546,6 +542,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
	c->journal.blocked_time = &c->journal_blocked_time;
	c->journal.flush_seq_time = &c->journal_flush_seq_time;
 
+	bch2_fs_btree_cache_init_early(&c->btree_cache);
+
	mutex_lock(&c->sb_lock);
 
	if (bch2_sb_to_fs(c, sb)) {
@@ -599,7 +597,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
	    bch2_io_clock_init(&c->io_clock[READ]) ||
	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
	    bch2_fs_journal_init(&c->journal) ||
-	    bch2_fs_btree_init(c) ||
+	    bch2_fs_btree_cache_init(c) ||
	    bch2_fs_encryption_init(c) ||
	    bch2_fs_compress_init(c) ||
	    bch2_check_set_has_compressed_data(c, c->opts.compression))
@@ -1107,8 +1105,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
	ca->dev_idx = dev_idx;
	__set_bit(ca->dev_idx, ca->self.d);
 
-	ca->copygc_write_point.type = BCH_DATA_USER;
-
	spin_lock_init(&ca->freelist_lock);
	bch2_dev_moving_gc_init(ca);
 
@@ -1169,8 +1165,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
	for (i = 0; i < RESERVE_NR; i++)
		total_reserve += ca->free[i].size;
 
-	ca->copygc_write_point.group = &ca->self;
-
	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);
 
@@ -209,11 +209,11 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
	size_t ret = 0;
	struct btree *b;
 
-	mutex_lock(&c->btree_cache_lock);
-	list_for_each_entry(b, &c->btree_cache, list)
+	mutex_lock(&c->btree_cache.lock);
+	list_for_each_entry(b, &c->btree_cache.live, list)
		ret += btree_bytes(c);
 
-	mutex_unlock(&c->btree_cache_lock);
+	mutex_unlock(&c->btree_cache.lock);
	return ret;
 }
 
@@ -436,7 +436,7 @@ STORE(__bch2_fs)
 
		sc.gfp_mask = GFP_KERNEL;
		sc.nr_to_scan = strtoul_or_return(buf);
-		c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc);
+		c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
	}
 
	return size;
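The field accesses above (btree_cache.lock, .live, .shrink), together with the list heads and mutex removed from bch2_fs_alloc earlier in this diff, suggest the btree cache state was folded into a single structure. Purely as a speculative sketch of its shape, derived from these call sites rather than from the real header:

	/* Guessed layout; member order and any additional fields are unknown here. */
	struct btree_cache {
		struct mutex		lock;		/* was c->btree_cache_lock */
		struct list_head	live;		/* was c->btree_cache */
		struct list_head	freeable;	/* was c->btree_cache_freeable */
		struct list_head	freed;		/* was c->btree_cache_freed */
		struct shrinker		shrink;		/* was c->btree_cache_shrink */
	};

bch2_fs_btree_cache_init_early(), bch2_fs_btree_cache_init() and bch2_fs_btree_cache_exit() from the hunks above would then set up and tear down this state.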
@@ -54,7 +54,7 @@ static int issue_tiering_move(struct bch_fs *c,
 {
	int ret;
 
-	ret = bch2_data_move(c, ctxt, &tier->wp, k, NULL);
+	ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL);
	if (!ret)
		trace_tiering_copy(k.k);
	else
@@ -241,6 +241,5 @@ void bch2_fs_tiering_init(struct bch_fs *c)
	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
		c->tiers[i].idx = i;
		bch2_pd_controller_init(&c->tiers[i].pd);
-		c->tiers[i].wp.group = &c->tiers[i].devs;
	}
 }