Update bcachefs sources to 99750eab4d bcachefs: Persist stripe blocks_used

Kent Overstreet 2019-01-23 15:49:44 -05:00
parent 1c50d258e3
commit 35fca2f044
37 changed files with 920 additions and 405 deletions


@ -1 +1 @@
bcca1c557b1897ecc3aeb1f89ab91865487d91ab
99750eab4d583132cf61f071082c7cf21f5295c0

include/asm/page.h Normal file


@ -37,6 +37,7 @@ typedef struct {
#define xchg_acquire(p, v) uatomic_xchg(p, v)
#define cmpxchg(p, old, new) uatomic_cmpxchg(p, old, new)
#define cmpxchg_acquire(p, old, new) uatomic_cmpxchg(p, old, new)
#define cmpxchg_release(p, old, new) uatomic_cmpxchg(p, old, new)
#define smp_mb__before_atomic() cmm_smp_mb__before_uatomic_add()
#define smp_mb__after_atomic() cmm_smp_mb__after_uatomic_add()
@ -77,6 +78,16 @@ typedef struct {
__old; \
})
#define cmpxchg_release(p, old, new) \
({ \
typeof(*(p)) __old = (old); \
\
__atomic_compare_exchange_n((p), &__old, new, false, \
__ATOMIC_RELEASE, \
__ATOMIC_RELEASE); \
__old; \
})
#define smp_mb__before_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_mb__after_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_wmb() __atomic_thread_fence(__ATOMIC_SEQ_CST)


@ -1,34 +1,60 @@
#ifndef _LINUX_GENERIC_RADIX_TREE_H
#define _LINUX_GENERIC_RADIX_TREE_H
/*
* Generic radix trees/sparse arrays:
/**
* DOC: Generic radix trees/sparse arrays:
*
* A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
* interior nodes.
* Very simple and minimalistic, supporting arbitrary size entries up to
* PAGE_SIZE.
*
* A genradix is defined with the type it will store, like so:
*
* static GENRADIX(struct foo) foo_genradix;
*
* The main operations are:
*
* - genradix_init(radix) - initialize an empty genradix
*
* - genradix_free(radix) - free all memory owned by the genradix and
* reinitialize it
*
* - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
* NULL if that entry does not exist
*
* - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
* allocating it if necessary
*
* - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
*
* The radix tree allocates one page of entries at a time, so entries may exist
* that were never explicitly allocated - they will be initialized to all
* zeroes.
*
* Internally, a genradix is just a radix tree of pages, and indexing works in
* terms of byte offsets. The wrappers in this header file use sizeof on the
* type the radix contains to calculate a byte offset from the index - see
* __idx_to_offset.
*/
#include <asm/page.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
struct genradix_node;
struct genradix_root;
struct __genradix {
struct genradix_node *root;
size_t depth;
struct genradix_root __rcu *root;
};
/*
* NOTE: currently, sizeof(_type) must be a power of two and not larger than
* PAGE_SIZE:
* NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
*/
#define __GENRADIX_INITIALIZER \
{ \
.tree = { \
.root = NULL, \
.depth = 0, \
} \
}
@ -49,6 +75,12 @@ struct { \
#define DEFINE_GENRADIX(_name, _type) \
GENRADIX(_type) _name = __GENRADIX_INITIALIZER
/**
* genradix_init - initialize a genradix
* @_radix: genradix to initialize
*
* Does not fail
*/
#define genradix_init(_radix) \
do { \
*(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \
@ -56,11 +88,20 @@ do { \
void __genradix_free(struct __genradix *);
/**
* genradix_free: free all memory owned by a genradix
* @_radix: the genradix to free
*
* After freeing, @_radix will be reinitialized and empty
*/
#define genradix_free(_radix) __genradix_free(&(_radix)->tree)
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
BUILD_BUG_ON(obj_size > PAGE_SIZE);
if (__builtin_constant_p(obj_size))
BUILD_BUG_ON(obj_size > PAGE_SIZE);
else
BUG_ON(obj_size > PAGE_SIZE);
if (!is_power_of_2(obj_size)) {
size_t objs_per_page = PAGE_SIZE / obj_size;
@ -79,7 +120,13 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
void *__genradix_ptr(struct __genradix *, size_t);
/* Returns a pointer to element at @_idx */
/**
* genradix_ptr - get a pointer to a genradix entry
* @_radix: genradix to access
* @_idx: index to fetch
*
* Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
*/
#define genradix_ptr(_radix, _idx) \
(__genradix_cast(_radix) \
__genradix_ptr(&(_radix)->tree, \
@ -87,7 +134,15 @@ void *__genradix_ptr(struct __genradix *, size_t);
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
/* Returns a pointer to element at @_idx, allocating it if necessary */
/**
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
* if necessary
* @_radix: genradix to access
* @_idx: index to fetch
* @_gfp: gfp mask
*
* Returns a pointer to entry at @_idx, or NULL on allocation failure
*/
#define genradix_ptr_alloc(_radix, _idx, _gfp) \
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
@ -99,6 +154,11 @@ struct genradix_iter {
size_t pos;
};
/**
* genradix_iter_init - initialize a genradix_iter
* @_radix: genradix that will be iterated over
* @_idx: index to start iterating from
*/
#define genradix_iter_init(_radix, _idx) \
((struct genradix_iter) { \
.pos = (_idx), \
@ -107,6 +167,14 @@ struct genradix_iter {
void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
/**
* genradix_iter_peek - get first entry at or above iterator's current
* position
* @_iter: a genradix_iter
* @_radix: genradix being iterated over
*
* If no more entries exist at or above @_iter's current position, returns NULL
*/
#define genradix_iter_peek(_iter, _radix) \
(__genradix_cast(_radix) \
__genradix_iter_peek(_iter, &(_radix)->tree, \
@ -127,4 +195,37 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
#define genradix_iter_advance(_iter, _radix) \
__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
#define genradix_for_each_from(_radix, _iter, _p, _start) \
for (_iter = genradix_iter_init(_radix, _start); \
(_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
genradix_iter_advance(&_iter, _radix))
/**
* genradix_for_each - iterate over entry in a genradix
* @_radix: genradix to iterate over
* @_iter: a genradix_iter to track current position
* @_p: pointer to genradix entry type
*
* On every iteration, @_p will point to the current entry, and @_iter.pos
* will be the current entry's index.
*/
#define genradix_for_each(_radix, _iter, _p) \
genradix_for_each_from(_radix, _iter, _p, 0)
int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
/**
* genradix_prealloc - preallocate entries in a generic radix tree
* @_radix: genradix to preallocate
* @_nr: number of entries to preallocate
* @_gfp: gfp mask
*
* Returns 0 on success, -ENOMEM on failure
*/
#define genradix_prealloc(_radix, _nr, _gfp) \
__genradix_prealloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _nr + 1),\
_gfp)
#endif /* _LINUX_GENERIC_RADIX_TREE_H */
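
A minimal usage sketch of the genradix API documented above, assuming the wrappers behave as described in the header; struct foo, foo_table and foo_table_demo are made-up names and this code is not part of the commit:

#include <linux/generic-radix-tree.h>

struct foo {
	u64	id;
};

static GENRADIX(struct foo) foo_table;

static size_t foo_table_demo(void)
{
	struct genradix_iter iter;
	struct foo *f;
	size_t nr = 0;

	/* Get a pointer to index 42, allocating backing pages as needed: */
	f = genradix_ptr_alloc(&foo_table, 42, GFP_KERNEL);
	if (!f)
		return 0;
	f->id = 42;

	/* Plain lookup never allocates; NULL if index 1000's page was never allocated: */
	if (genradix_ptr(&foo_table, 1000))
		nr++;

	/* Walk allocated entries; entries that were never written read back as zeroes: */
	genradix_for_each(&foo_table, iter, f)
		if (f->id)
			nr++;

	/* Free all memory owned by the genradix and reinitialize it: */
	genradix_free(&foo_table);
	return nr;
}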


@ -249,6 +249,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
for_each_member_device(ca, c, i)
bch2_dev_usage_from_buckets(c, ca);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
@ -280,35 +283,51 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
#endif
struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
struct bucket *g;
struct bucket_mark m;
struct bucket_mark m, new;
int ret;
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
a->k.p = POS(ca->dev_idx, b);
bch2_btree_iter_set_pos(iter, a->k.p);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
percpu_down_read_preempt_disable(&c->mark_lock);
g = bucket(ca, b);
m = bucket_cmpxchg(g, m, m.dirty = false);
m = READ_ONCE(g->mark);
if (!m.dirty) {
percpu_up_read_preempt_enable(&c->mark_lock);
return 0;
}
__alloc_write_key(a, g, m);
percpu_up_read_preempt_enable(&c->mark_lock);
bch2_btree_iter_cond_resched(iter);
bch2_btree_iter_set_pos(iter, a->k.p);
ret = bch2_btree_insert_at(c, NULL, journal_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret)
return ret;
if (!ret && ca->buckets_written)
new = m;
new.dirty = false;
atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
if (ca->buckets_written)
set_bit(b, ca->buckets_written);
return ret;
return 0;
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
@ -898,10 +917,19 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
closure_wake_up(&c->freelist_wait);
ca->allocator_blocked_full = false;
spin_unlock(&c->freelist_lock);
goto out;
}
if (!ca->allocator_blocked_full) {
ca->allocator_blocked_full = true;
closure_wake_up(&c->freelist_wait);
}
spin_unlock(&c->freelist_lock);
if ((current->flags & PF_KTHREAD) &&
@ -1226,6 +1254,11 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{
closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
}
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
@ -1333,6 +1366,24 @@ static void allocator_start_issue_discards(struct bch_fs *c)
ca->mi.bucket_size, GFP_NOIO, 0);
}
static int resize_free_inc(struct bch_dev *ca)
{
alloc_fifo free_inc;
if (!fifo_full(&ca->free_inc))
return 0;
if (!init_fifo(&free_inc,
ca->free_inc.size * 2,
GFP_KERNEL))
return -ENOMEM;
fifo_move(&free_inc, &ca->free_inc);
swap(free_inc, ca->free_inc);
free_fifo(&free_inc);
return 0;
}
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
@ -1408,6 +1459,12 @@ not_enough:
while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
(bu = next_alloc_bucket(ca)) >= 0) {
ret = resize_free_inc(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
}
bch2_invalidate_one_bucket(c, ca, bu,
&journal_seq);


@ -51,6 +51,7 @@ void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);


@ -106,6 +106,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
false, gc_pos_alloc(c, ob), 0);
ob->valid = false;
ob->type = 0;
spin_unlock(&ob->lock);
percpu_up_read_preempt_enable(&c->mark_lock);
@ -141,6 +142,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
ob = c->open_buckets + c->open_buckets_freelist;
c->open_buckets_freelist = ob->freelist;
atomic_set(&ob->pin, 1);
ob->type = 0;
c->open_buckets_nr_free--;
return ob;
@ -209,9 +211,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
case RESERVE_ALLOC:
return 0;
case RESERVE_BTREE:
return BTREE_NODE_RESERVE / 2;
return BTREE_NODE_OPEN_BUCKET_RESERVE;
default:
return BTREE_NODE_RESERVE;
return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
}
}
@ -837,15 +839,17 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
{
struct write_point *wp;
struct open_bucket *ob;
unsigned nr_effective = 0;
struct open_buckets ptrs = { .nr = 0 };
bool have_cache = false;
unsigned write_points_nr;
int ret = 0, i;
struct open_buckets ptrs;
unsigned nr_effective, write_points_nr;
bool have_cache;
int ret, i;
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
nr_effective = 0;
write_points_nr = c->write_points_nr;
have_cache = false;
wp = writepoint_find(c, write_point.v);


@ -85,6 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
ob->type = wp->type;
atomic_inc(&ob->pin);
ob_push(c, ptrs, ob);
}


@ -55,9 +55,10 @@ struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
bool valid;
bool on_partial_list;
u8 ec_idx;
u8 type;
unsigned valid:1;
unsigned on_partial_list:1;
unsigned sectors_free;
struct bch_extent_ptr ptr;
struct ec_stripe_new *ec;


@ -330,6 +330,8 @@ enum bch_time_stats {
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX
#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
struct btree;
enum gc_phase {
@ -426,7 +428,13 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
/*
* XXX: this should be an enum for allocator state, so as to include
* error state
*/
bool allocator_blocked;
bool allocator_blocked_full;
alloc_heap alloc_heap;
@ -597,6 +605,7 @@ struct bch_fs {
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;


@ -1010,11 +1010,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
nr_key_bits -= 64;
}
if (l_v != r_v)
return l_v < r_v ? -1 : 1;
if (!nr_key_bits)
return 0;
if (!nr_key_bits || l_v != r_v)
break;
l = next_word(l);
r = next_word(r);
@ -1022,6 +1019,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
l_v = *l;
r_v = *r;
}
return (l_v > r_v) - (l_v < r_v);
}
#endif


@ -483,31 +483,6 @@ static void bch2_gc_free(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
/*
* Accumulate percpu counters onto one cpu's copy - only valid when access
* against any percpu counter is guarded against
*/
static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
{
u64 *ret;
int cpu;
preempt_disable();
ret = this_cpu_ptr(p);
preempt_enable();
for_each_possible_cpu(cpu) {
u64 *i = per_cpu_ptr(p, cpu);
if (i != ret) {
acc_u64s(ret, i, nr);
memset(i, 0, nr * sizeof(u64));
}
}
return ret;
}
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
@ -543,9 +518,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
acc_percpu_u64s((void *) ca->usage[0], nr);
bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
acc_percpu_u64s((void *) ca->usage[1], nr);
bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
*dst = *src;
}
@ -554,9 +529,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
acc_percpu_u64s((void *) c->usage[0], nr);
bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
acc_percpu_u64s((void *) c->usage[1], nr);
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
memcpy(&dst->s.gc_start[0],
&src->s.gc_start[0],
@ -582,6 +557,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
dst_iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
dst->dirty = true; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
@ -612,16 +588,18 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
(src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
BUG_ON(src_iter.pos != dst_iter.pos);
copy_stripe_field(alive, "alive");
copy_stripe_field(sectors, "sectors");
copy_stripe_field(algorithm, "algorithm");
copy_stripe_field(nr_blocks, "nr_blocks");
copy_stripe_field(nr_redundant, "nr_redundant");
copy_stripe_field(blocks_nonempty.counter,
copy_stripe_field(blocks_nonempty,
"blocks_nonempty");
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
copy_stripe_field(block_sectors[i].counter,
copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
if (dst->alive)
@ -656,9 +634,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
acc_percpu_u64s((void *) ca->usage[0], nr);
bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
acc_percpu_u64s((void *) ca->usage[1], nr);
bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
@ -678,9 +656,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
acc_percpu_u64s((void *) c->usage[0], nr);
bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
acc_percpu_u64s((void *) c->usage[1], nr);
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
copy_fs_field(s.hidden, "hidden");
copy_fs_field(s.data, "data");


@ -109,7 +109,7 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
ret = gc_pos_cmp(pos, c->gc_pos) < 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;


@ -77,6 +77,7 @@ enum {
__BTREE_INSERT_ATOMIC,
__BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
@ -100,6 +101,8 @@ enum {
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW)
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)


@ -628,7 +628,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
if (unlikely(!percpu_ref_tryget(&c->writes)))
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry:
trans_for_each_iter(trans, i) {
@ -658,7 +659,8 @@ retry:
trans_for_each_iter(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
percpu_ref_put(&c->writes);
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&c->writes);
/* make sure we didn't drop or screw up locks: */
trans_for_each_iter(trans, i) {


@ -151,7 +151,6 @@ retry:
acc_u64s_percpu((u64 *) ret,
(u64 __percpu *) c->usage[0],
sizeof(*ret) / sizeof(u64) + nr);
percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
}
@ -223,13 +222,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
!is_available_bucket(new);
}
void bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
int bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
{
s64 added = fs_usage->s.data + fs_usage->s.reserved;
s64 should_not_have_added;
int ret = 0;
percpu_rwsem_assert_held(&c->mark_lock);
@ -242,6 +242,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
"disk usage increased without a reservation")) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
ret = -1;
}
if (added > 0) {
@ -259,6 +260,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
(u64 *) fs_usage,
sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
}
return ret;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
@ -363,10 +366,7 @@ static inline void update_cached_sectors(struct bch_fs *c,
{
struct bch_replicas_padded r;
r.e.data_type = BCH_DATA_CACHED;
r.e.nr_devs = 1;
r.e.nr_required = 1;
r.e.devs[0] = dev;
bch2_replicas_entry_cached(&r.e, dev);
update_replicas(c, fs_usage, &r.e, sectors);
}
@ -382,7 +382,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = 1;
new.owned_by_allocator = true;
new.dirty = true;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
@ -455,6 +456,7 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
@ -480,13 +482,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
true);
} else {
struct bucket *g;
struct bucket_mark old, new;
struct bucket_mark new;
rcu_read_lock();
g = bucket(ca, b);
old = bucket_cmpxchg(g, new, ({
new.data_type = type;
bucket_cmpxchg(g, new, ({
new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
@ -537,6 +540,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
do {
new.v.counter = old.v.counter = v;
new.dirty = true;
/*
* Check this after reading bucket mark to guard against
* the allocator invalidating a bucket after we've already
@ -591,9 +596,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
int blocks_nonempty_delta;
s64 parity_sectors;
BUG_ON(!sectors);
m = genradix_ptr(&c->stripes[gc], p.idx);
spin_lock(&c->ec_stripes_heap_lock);
if (!m || !m->alive) {
spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
return -1;
@ -609,19 +619,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
parity_sectors = -parity_sectors;
sectors += parity_sectors;
new = atomic_add_return(sectors, &m->block_sectors[p.block]);
old = new - sectors;
old = m->block_sectors[p.block];
m->block_sectors[p.block] += sectors;
new = m->block_sectors[p.block];
blocks_nonempty_delta = (int) !!new - (int) !!old;
if (!blocks_nonempty_delta)
return 0;
if (blocks_nonempty_delta) {
m->blocks_nonempty += blocks_nonempty_delta;
atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
if (!gc)
bch2_stripes_heap_update(c, m, p.idx);
}
BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
m->dirty = true;
if (!gc)
bch2_stripes_heap_update(c, m, p.idx);
spin_unlock(&c->ec_stripes_heap_lock);
update_replicas(c, fs_usage, &m->r.e, sectors);
@ -629,8 +641,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
}
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 sectors,
enum bch_data_type data_type,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags,
bool gc)
@ -701,14 +712,13 @@ static void bucket_set_stripe(struct bch_fs *c,
BUG_ON(ptr_stale(ca, ptr));
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
new.dirty = true;
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
BUG_ON(old.stripe == enabled);
}
}
@ -723,22 +733,19 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
spin_lock(&c->ec_stripes_heap_lock);
if (!m || (!inserting && !m->alive)) {
spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
return -1;
}
if (inserting && m->alive) {
bch_err_ratelimited(c, "error marking stripe %zu: already exists",
idx);
return -1;
}
if (m->alive)
bch2_stripes_heap_del(c, m, idx);
BUG_ON(atomic_read(&m->blocks_nonempty));
for (i = 0; i < EC_STRIPE_MAX; i++)
BUG_ON(atomic_read(&m->block_sectors[i]));
memset(m, 0, sizeof(*m));
if (inserting) {
m->sectors = le16_to_cpu(s.v->sectors);
@ -754,7 +761,6 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
for (i = 0; i < s.v->nr_blocks; i++)
m->r.e.devs[i] = s.v->ptrs[i].dev;
}
/*
* XXX: account for stripes somehow here
@ -763,15 +769,23 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif
if (!gc) {
if (inserting)
/* gc recalculates these fields: */
if (!(flags & BCH_BUCKET_MARK_GC)) {
for (i = 0; i < s.v->nr_blocks; i++) {
m->block_sectors[i] =
stripe_blockcount_get(s.v, i);
m->blocks_nonempty += !!m->block_sectors[i];
}
}
if (!gc)
bch2_stripes_heap_insert(c, m, idx);
else
bch2_stripes_heap_del(c, m, idx);
} else {
m->alive = inserting;
m->alive = true;
}
spin_unlock(&c->ec_stripes_heap_lock);
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
return 0;
}
@ -879,6 +893,8 @@ void bch2_mark_update(struct btree_insert *trans,
struct bch_fs_usage *fs_usage;
struct gc_pos pos = gc_pos_btree_node(b);
struct bkey_packed *_k;
u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
if (!btree_node_type_needs_gc(iter->btree_id))
return;
@ -939,7 +955,37 @@ void bch2_mark_update(struct btree_insert *trans,
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
!warned_disk_usage &&
!xchg(&warned_disk_usage, 1)) {
char buf[200];
pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
pr_err("while inserting");
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
pr_err("%s", buf);
pr_err("overlapping with");
node_iter = iter->l[0].iter;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k;
k = bkey_disassemble(b, _k, &unpacked);
if (btree_node_is_extents(b)
? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
: bkey_cmp(insert->k->k.p, k.k->p))
break;
bch2_bkey_val_to_text(&PBUF(buf), c, k);
pr_err("%s", buf);
bch2_btree_node_iter_advance(&node_iter, b);
}
}
percpu_up_read_preempt_enable(&c->mark_lock);
}


@ -181,6 +181,8 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@ -264,8 +266,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
/* disk reservations: */


@ -402,6 +402,8 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (!src)
return -ENOMEM;
percpu_up_read_preempt_enable(&c->mark_lock);
dst.used = bch2_fs_sectors_used(c, *src);
dst.online_reserved = src->s.online_reserved;


@ -11,6 +11,7 @@
#include "ec.h"
#include "error.h"
#include "io.h"
#include "journal_io.h"
#include "keylist.h"
#include "super-io.h"
#include "util.h"
@ -98,40 +99,6 @@ struct ec_bio {
/* Stripes btree keys: */
static unsigned stripe_csums_per_device(const struct bch_stripe *s)
{
return DIV_ROUND_UP(le16_to_cpu(s->sectors),
1 << s->csum_granularity_bits);
}
static unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
sizeof(struct bch_extent_ptr) * s->nr_blocks +
(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
}
static unsigned stripe_blockcount_offset(const struct bch_stripe *s,
unsigned idx)
{
return stripe_csum_offset(s, s->nr_blocks, 0) +
sizeof(16) * idx;
}
static unsigned stripe_val_u64s(const struct bch_stripe *s)
{
return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
sizeof(u64));
}
static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
{
return (void *) s + stripe_csum_offset(s, dev, csum_idx);
}
const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@ -164,8 +131,9 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
1U << s->csum_granularity_bits);
for (i = 0; i < s->nr_blocks; i++)
pr_buf(out, " %u:%llu", s->ptrs[i].dev,
(u64) s->ptrs[i].offset);
pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
(u64) s->ptrs[i].offset,
stripe_blockcount_get(s, i));
}
static int ptr_matches_stripe(struct bch_fs *c,
@ -609,29 +577,15 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
BUG_ON(h->data[m->heap_idx].idx != idx);
}
static inline unsigned stripe_entry_blocks(struct stripe *m)
{
return atomic_read(&m->blocks_nonempty);
}
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
bool queue_delete;
size_t i;
spin_lock(&c->ec_stripes_heap_lock);
if (!m->alive) {
spin_unlock(&c->ec_stripes_heap_lock);
return;
}
heap_verify_backpointer(c, idx);
h->data[m->heap_idx].blocks_nonempty =
stripe_entry_blocks(m);
h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
i = m->heap_idx;
heap_sift_up(h, i, ec_stripes_heap_cmp,
@ -641,44 +595,35 @@ void bch2_stripes_heap_update(struct bch_fs *c,
heap_verify_backpointer(c, idx);
queue_delete = stripe_idx_to_delete(c) >= 0;
spin_unlock(&c->ec_stripes_heap_lock);
if (queue_delete)
if (stripe_idx_to_delete(c) >= 0)
schedule_work(&c->ec_stripe_delete_work);
}
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
spin_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
m->alive = false;
heap_del(&c->ec_stripes_heap, m->heap_idx,
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
spin_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
spin_lock(&c->ec_stripes_heap_lock);
BUG_ON(heap_full(&c->ec_stripes_heap));
heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
.idx = idx,
.blocks_nonempty = stripe_entry_blocks(m),
.blocks_nonempty = m->blocks_nonempty,
}),
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
m->alive = true;
heap_verify_backpointer(c, idx);
spin_unlock(&c->ec_stripes_heap_lock);
}
/* stripe deletion */
@ -1217,6 +1162,116 @@ unlock:
mutex_unlock(&c->ec_new_stripe_lock);
}
static int __bch2_stripe_write_key(struct bch_fs *c,
struct btree_iter *iter,
struct stripe *m,
size_t idx,
struct bkey_i_stripe *new_key,
unsigned flags)
{
struct bkey_s_c k;
unsigned i;
int ret;
bch2_btree_iter_set_pos(iter, POS(0, idx));
k = bch2_btree_iter_peek_slot(iter);
ret = btree_iter_err(k);
if (ret)
return ret;
if (k.k->type != KEY_TYPE_stripe)
return -EIO;
bkey_reassemble(&new_key->k_i, k);
spin_lock(&c->ec_stripes_heap_lock);
for (i = 0; i < new_key->v.nr_blocks; i++)
stripe_blockcount_set(&new_key->v, i,
m->block_sectors[i]);
m->dirty = false;
spin_unlock(&c->ec_stripes_heap_lock);
return bch2_btree_insert_at(c, NULL, NULL,
BTREE_INSERT_NOFAIL|flags,
BTREE_INSERT_ENTRY(iter, &new_key->k_i));
}
int bch2_stripes_write(struct bch_fs *c, bool *wrote)
{
struct btree_iter iter;
struct genradix_iter giter;
struct bkey_i_stripe *new_key;
struct stripe *m;
int ret = 0;
new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
BUG_ON(!new_key);
bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
genradix_for_each(&c->stripes[0], giter, m) {
if (!m->dirty)
continue;
ret = __bch2_stripe_write_key(c, &iter, m, giter.pos,
new_key, BTREE_INSERT_NOCHECK_RW);
if (ret)
break;
*wrote = true;
}
bch2_btree_iter_unlock(&iter);
kfree(new_key);
return ret;
}
static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
{
struct gc_pos pos = { 0 };
bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
}
int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
{
struct journal_replay *r;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
ret = bch2_fs_ec_start(c);
if (ret)
return ret;
for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) {
bch2_stripe_read_key(c, k);
bch2_btree_iter_cond_resched(&iter);
}
ret = bch2_btree_iter_unlock(&iter);
if (ret)
return ret;
list_for_each_entry(r, journal_replay_list, list) {
struct bkey_i *k, *n;
struct jset_entry *entry;
for_each_jset_key(k, n, entry, &r->j)
if (entry->btree_id == BTREE_ID_EC)
bch2_stripe_read_key(c, bkey_i_to_s_c(k));
}
return 0;
}
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
struct btree_iter iter;


@ -13,6 +13,55 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
.val_to_text = bch2_stripe_to_text, \
}
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
{
return DIV_ROUND_UP(le16_to_cpu(s->sectors),
1 << s->csum_granularity_bits);
}
static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
sizeof(struct bch_extent_ptr) * s->nr_blocks +
(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
}
static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
unsigned idx)
{
return stripe_csum_offset(s, s->nr_blocks, 0) +
sizeof(u16) * idx;
}
static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
unsigned idx)
{
return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
}
static inline void stripe_blockcount_set(struct bch_stripe *s,
unsigned idx, unsigned v)
{
__le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
*p = cpu_to_le16(v);
}
static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
{
return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
sizeof(u64));
}
static inline void *stripe_csum(struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
return (void *) s + stripe_csum_offset(s, dev, csum_idx);
}
struct bch_read_bio;
struct ec_stripe_buf {
@ -100,6 +149,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *, struct list_head *);
int bch2_stripes_write(struct bch_fs *, bool *);
int bch2_ec_mem_alloc(struct bch_fs *, bool);
int bch2_fs_ec_start(struct bch_fs *);


@ -19,9 +19,10 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
u8 alive;
atomic_t blocks_nonempty;
atomic_t block_sectors[EC_STRIPE_MAX];
unsigned alive:1;
unsigned dirty:1;
u8 blocks_nonempty;
u16 block_sectors[EC_STRIPE_MAX];
struct bch_replicas_padded r;
};


@ -1664,12 +1664,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
return ret == BCH_MERGE_MERGE;
}
int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
unsigned nr_replicas)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
int ret = 0;
bool ret = true;
end.offset += size;
@ -1678,8 +1679,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bch2_extent_is_fully_allocated(k)) {
ret = -ENOSPC;
if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
ret = false;
break;
}
}
@ -1688,6 +1689,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
return ret;
}
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
{
unsigned ret = 0;
switch (k.k->type) {
case KEY_TYPE_extent: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
extent_for_each_ptr_decode(e, p, entry)
ret += !p.ptr.cached &&
p.crc.compression_type == BCH_COMPRESSION_NONE;
break;
}
case KEY_TYPE_reservation:
ret = bkey_s_c_to_reservation(k).v->nr_replicas;
break;
}
return ret;
}
/* KEY_TYPE_reservation: */
const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)


@ -571,6 +571,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
BUG_ON(!bch2_bkey_pack_key(dst, src, f));
}
int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
#endif /* _BCACHEFS_EXTENTS_H */


@ -262,18 +262,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
}
}
static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search)
{
size_t i = 0;
int res;
while (i < nr &&
(res = cmp(search, base + i * size, size)))
i = eytzinger0_child(i, res > 0);
return i;
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
void *_search = (search); \
size_t _nr = (nr); \
size_t _size = (size); \
size_t _i = 0; \
int _res; \
\
while (_i < _nr && \
(_res = _cmp(_search, _base + _i * _size, _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),


@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
BUG_ON(btree_iter_err(old));
if (allocating &&
!bch2_extent_is_fully_allocated(old))
!*allocating &&
bch2_bkey_nr_ptrs_allocated(old) <
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
*allocating = true;
delta += (min(new->k.p.offset,
@ -858,9 +860,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
unsigned nr_ptrs = !bch2_extent_is_compressed(k)
? bch2_bkey_nr_dirty_ptrs(k)
: 0;
unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
bio_for_each_segment(bv, bio, iter) {
/* brand new pages, don't need to be locked: */
@ -1759,6 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = dio->iop.inode;
struct bio *bio = &dio->iop.op.wbio.bio;
struct bio_vec *bv;
loff_t offset;
bool sync;
long ret;
int i;
@ -1770,12 +1771,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
__pagecache_block_get(&mapping->add_lock);
/* Write and invalidate pagecache range that we're writing to: */
ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
req->ki_pos + iov_iter_count(&dio->iter) - 1);
offset = req->ki_pos + (dio->iop.op.written << 9);
ret = write_invalidate_inode_pages_range(mapping,
offset,
offset + iov_iter_count(&dio->iter) - 1);
if (unlikely(ret))
goto err;
while (1) {
offset = req->ki_pos + (dio->iop.op.written << 9);
BUG_ON(current->pagecache_lock);
current->pagecache_lock = &mapping->add_lock;
if (kthread)
@ -1792,13 +1797,12 @@ static long bch2_dio_write_loop(struct dio_write *dio)
/* gup might have faulted pages back in: */
ret = write_invalidate_inode_pages_range(mapping,
req->ki_pos + (dio->iop.op.written << 9),
req->ki_pos + iov_iter_count(&dio->iter) - 1);
offset,
offset + bio->bi_iter.bi_size - 1);
if (unlikely(ret))
goto err;
dio->iop.op.pos = POS(inode->v.i_ino,
(req->ki_pos >> 9) + dio->iop.op.written);
dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9);
task_io_account_write(bio->bi_iter.bi_size);
@ -1878,7 +1882,6 @@ static int bch2_direct_IO_write(struct kiocb *req,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct dio_write *dio;
struct bio *bio;
loff_t offset = req->ki_pos;
ssize_t ret;
lockdep_assert_held(&inode->v.i_rwsem);
@ -1886,7 +1889,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
if (unlikely(!iter->count))
return 0;
if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL,
@ -1898,7 +1901,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
dio->mm = current->mm;
dio->loop = false;
dio->sync = is_sync_kiocb(req) ||
offset + iter->count > inode->v.i_size;
req->ki_pos + iter->count > inode->v.i_size;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->iter = *iter;
@ -1915,19 +1918,20 @@ static int bch2_direct_IO_write(struct kiocb *req,
if (unlikely(ret))
goto err;
dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas;
ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
dio->iop.op.opts.data_replicas, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
iter->count >> 9))
if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
req->ki_pos >> 9),
iter->count >> 9,
dio->iop.op.opts.data_replicas))
goto err;
dio->iop.unalloc = true;
}
dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
return bch2_dio_write_loop(dio);
err:
bch2_disk_reservation_put(c, &dio->iop.op.res);


@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
list_for_each_entry(i, list, list) {
struct bch_replicas_padded replicas;
char buf[80];
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
i->devs, false), c,
"superblock not marked as containing replicas (type %u)",
BCH_DATA_JOURNAL))) {
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
"superblock not marked as containing replicas %s",
(bch2_replicas_entry_to_text(&PBUF(buf),
&replicas.e), buf)))) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
return ret;
}
@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
struct journal_buf *w = journal_prev_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
goto err;
}
if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
if (bch2_mark_replicas(c, &replicas.e))
goto err;
spin_lock(&j->lock);


@ -335,7 +335,7 @@ void bch2_journal_reclaim_work(struct work_struct *work)
mutex_unlock(&j->reclaim_lock);
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
@ -387,7 +387,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
@ -412,12 +411,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
journal_seq_pin(j, seq)->devs);
seq++;
spin_unlock(&j->lock);
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
ret = bch2_mark_replicas(c, &replicas.e);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);


@ -4,6 +4,7 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "extents.h"
#include "io.h"
@ -152,6 +153,16 @@ retry:
bch2_btree_iter_unlock(&iter);
}
/* flush relevant btree updates */
while (1) {
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
bch2_journal_meta(&c->journal);
}
ret = 0;
out:
ret = bch2_replicas_gc_end(c, ret);


@ -3,6 +3,7 @@
#include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "disk_groups.h"
#include "inode.h"
@ -763,6 +764,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
while (1) {
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
bch2_journal_meta(&c->journal);
}
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL,


@ -214,12 +214,12 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
err = "cannot allocate memory";
ret = bch2_fs_ec_start(c);
ret = bch2_stripes_read(c, &journal);
if (ret)
goto err;
pr_info("stripes_read done");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";


@ -13,6 +13,16 @@ static inline int u8_cmp(u8 l, u8 r)
return (l > r) - (l < r);
}
static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHES_DEBUG
unsigned i;
for (i = 0; i + 1 < e->nr_devs; i++)
BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}
static void replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
@ -23,19 +33,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
static void replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
{
unsigned i;
@ -60,7 +64,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
pr_buf(out, " ");
first = false;
replicas_entry_to_text(out, e);
bch2_replicas_entry_to_text(out, e);
}
}
@ -100,8 +104,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
static void bkey_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *e)
static void bkey_to_replicas(struct bch_replicas_entry *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
@ -119,11 +123,13 @@ static void bkey_to_replicas(struct bkey_s_c k,
stripe_to_replicas(k, e);
break;
}
replicas_entry_sort(e);
}
static inline void devlist_to_replicas(struct bch_devs_list devs,
enum bch_data_type data_type,
struct bch_replicas_entry *e)
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
unsigned i;
@ -137,6 +143,8 @@ static inline void devlist_to_replicas(struct bch_devs_list devs,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
replicas_entry_sort(e);
}
static struct bch_replicas_cpu
@ -150,6 +158,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
replicas_entry_bytes(new_entry)),
};
BUG_ON(!new_entry->data_type);
verify_replicas_entry_sorted(new_entry);
new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
if (!new.entries)
return new;
@ -175,13 +186,12 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
if (unlikely(entry_size > r->entry_size))
return -1;
replicas_entry_sort(search);
while (entry_size < r->entry_size)
((char *) search)[entry_size++] = 0;
verify_replicas_entry_sorted(search);
#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
memcmp, search);
entry_cmp, search);
#undef entry_cmp
return idx < r->nr ? idx : -1;
}
@ -189,6 +199,8 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
@ -198,12 +210,17 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0;
}
static bool replicas_has_entry(struct bch_fs *c,
struct bch_replicas_entry *search,
bool check_gc_replicas)
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry *search,
bool check_gc_replicas)
{
bool marked;
if (!search->nr_devs)
return true;
verify_replicas_entry_sorted(search);
percpu_down_read_preempt_disable(&c->mark_lock);
marked = __replicas_has_entry(&c->replicas, search) &&
(!check_gc_replicas ||
@ -214,35 +231,31 @@ static bool replicas_has_entry(struct bch_fs *c,
return marked;
}
static void __replicas_table_update(struct bch_fs_usage __percpu *dst,
static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
struct bch_replicas_cpu *dst_r,
struct bch_fs_usage __percpu *src,
struct bch_fs_usage __percpu *src_p,
struct bch_replicas_cpu *src_r)
{
int src_idx, dst_idx, cpu;
unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
struct bch_fs_usage *dst, *src = (void *)
bch2_acc_percpu_u64s((void *) src_p, src_nr);
int src_idx, dst_idx;
preempt_disable();
dst = this_cpu_ptr(dst_p);
preempt_enable();
*dst = *src;
for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
u64 *dst_v, src_v = 0;
for_each_possible_cpu(cpu)
src_v += *per_cpu_ptr(&src->data[src_idx], cpu);
if (!src->data[src_idx])
continue;
dst_idx = __replicas_entry_idx(dst_r,
cpu_replicas_entry(src_r, src_idx));
BUG_ON(dst_idx < 0);
if (dst_idx < 0) {
BUG_ON(src_v);
continue;
}
preempt_disable();
dst_v = this_cpu_ptr(&dst->data[dst_idx]);
BUG_ON(*dst_v);
*dst_v = src_v;
preempt_enable();
dst->data[dst_idx] = src->data[src_idx];
}
}
@ -344,30 +357,32 @@ err:
return ret;
}
static int __bch2_mark_replicas(struct bch_fs *c,
struct bch_replicas_entry *devs)
int bch2_mark_replicas(struct bch_fs *c,
struct bch_replicas_entry *r)
{
return likely(replicas_has_entry(c, devs, true))
return likely(bch2_replicas_marked(c, r, true))
? 0
: bch2_mark_replicas_slowpath(c, devs);
: bch2_mark_replicas_slowpath(c, r);
}
int bch2_mark_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
bool bch2_bkey_replicas_marked(struct bch_fs *c,
struct bkey_s_c k,
bool check_gc_replicas)
{
struct bch_replicas_padded search;
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
if (!devs.nr)
return 0;
for (i = 0; i < cached.nr; i++) {
bch2_replicas_entry_cached(&search.e, cached.devs[i]);
memset(&search, 0, sizeof(search));
if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
return false;
}
BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
bkey_to_replicas(&search.e, k);
devlist_to_replicas(devs, data_type, &search.e);
return __bch2_mark_replicas(c, &search.e);
return bch2_replicas_marked(c, &search.e, check_gc_replicas);
}
int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@ -377,18 +392,17 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
unsigned i;
int ret;
memset(&search, 0, sizeof(search));
for (i = 0; i < cached.nr; i++) {
bch2_replicas_entry_cached(&search.e, cached.devs[i]);
for (i = 0; i < cached.nr; i++)
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i]))))
ret = bch2_mark_replicas(c, &search.e);
if (ret)
return ret;
}
bkey_to_replicas(k, &search.e);
bkey_to_replicas(&search.e, k);
return search.e.nr_devs
? __bch2_mark_replicas(c, &search.e)
: 0;
return bch2_mark_replicas(c, &search.e);
}
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
@ -749,7 +763,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
pr_buf(out, " ");
first = false;
replicas_entry_to_text(out, e);
bch2_replicas_entry_to_text(out, e);
}
}
@ -798,46 +812,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
/* Query replicas: */
bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs,
bool check_gc_replicas)
{
struct bch_replicas_padded search;
if (!devs.nr)
return true;
memset(&search, 0, sizeof(search));
devlist_to_replicas(devs, data_type, &search.e);
return replicas_has_entry(c, &search.e, check_gc_replicas);
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
struct bkey_s_c k,
bool check_gc_replicas)
{
struct bch_replicas_padded search;
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
memset(&search, 0, sizeof(search));
for (i = 0; i < cached.nr; i++)
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i]),
check_gc_replicas))
return false;
bkey_to_replicas(k, &search.e);
return search.e.nr_devs
? replicas_has_entry(c, &search.e, check_gc_replicas)
: true;
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{


@ -4,17 +4,39 @@
#include "eytzinger.h"
#include "replicas_types.h"
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
struct bch_replicas_entry *);
bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list, bool);
void bch2_devlist_to_replicas(struct bch_replicas_entry *,
enum bch_data_type,
struct bch_devs_list);
bool bch2_replicas_marked(struct bch_fs *,
struct bch_replicas_entry *, bool);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
bool bch2_bkey_replicas_marked(struct bch_fs *,
struct bkey_s_c, bool);
int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
unsigned dev)
{
e->data_type = BCH_DATA_CACHED;
e->nr_devs = 1;
e->nr_required = 1;
e->devs[0] = dev;
}
struct replicas_status {
struct {


@ -205,7 +205,9 @@ int bch2_congested(void *data, int bdi_bits)
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
bool wrote;
unsigned i;
int ret;
bch2_rebalance_stop(c);
@ -220,23 +222,42 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
bch2_journal_flush_all_pins(&c->journal);
do {
ret = bch2_alloc_write(c, false, &wrote);
if (ret) {
bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
break;
}
ret = bch2_stripes_write(c, &wrote);
if (ret) {
bch2_fs_inconsistent(c, "error writing out stripes");
break;
}
for_each_member_device(ca, c, i)
bch2_dev_allocator_quiesce(c, ca);
bch2_journal_flush_all_pins(&c->journal);
/*
* We need to explicitly wait on btree interior updates to complete
* before stopping the journal, flushing all journal pins isn't
* sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
* interior updates have to drop their journal pin before they're
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
} while (wrote);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
bch2_journal_flush_all_pins(&c->journal);
/*
* We need to explicitly wait on btree interior updates to complete
* before stopping the journal, flushing all journal pins isn't
* sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
* interior updates have to drop their journal pin before they're
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
bch2_fs_journal_stop(&c->journal);
/* XXX: mark super that alloc info is persistent */
/*
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
@ -420,6 +441,8 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
if (c->journal_reclaim_wq)
destroy_workqueue(c->journal_reclaim_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->wq)
@ -638,6 +661,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
sizeof(struct btree_reserve)) ||
@ -1297,8 +1322,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (data) {
char data_has_str[100];
bch2_string_opt_to_text(&PBUF(data_has_str),
bch2_data_types, data);
bch2_flags_to_text(&PBUF(data_has_str),
bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
ret = -EBUSY;
goto err;


@ -234,17 +234,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
{
struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
unsigned replicas;
unsigned i;
if (!fs_usage)
return -ENOMEM;
pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
for (replicas = 0;
replicas < ARRAY_SIZE(fs_usage->persistent_reserved);
replicas++) {
pr_buf(&out, "%u replicas:\n", replicas + 1);
for (i = 0;
i < ARRAY_SIZE(fs_usage->persistent_reserved);
i++) {
pr_buf(&out, "%u replicas:\n", i + 1);
#if 0
for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
pr_buf(&out, "\t%s:\t\t%llu\n",
@ -254,12 +254,23 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
stats.replicas[replicas].ec_data);
#endif
pr_buf(&out, "\treserved:\t%llu\n",
fs_usage->persistent_reserved[replicas]);
fs_usage->persistent_reserved[i]);
}
pr_buf(&out, "online reserved:\t%llu\n",
fs_usage->s.online_reserved);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
pr_buf(&out, "\t");
bch2_replicas_entry_to_text(&out, e);
pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
}
percpu_up_read_preempt_enable(&c->mark_lock);
kfree(fs_usage);
return out.pos - buf;
@ -797,6 +808,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
unsigned i, nr[BCH_DATA_NR];
memset(nr, 0, sizeof(nr));
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].type]++;
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
@ -823,7 +840,10 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
" copygc threshold: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
"open_buckets_wait: %s\n"
"open_buckets_btree: %u\n"
"open_buckets_user: %u\n"
"btree reserve cache: %u\n",
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
@ -845,8 +865,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
stats.sectors_fragmented,
ca->copygc_threshold,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty");
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
BTREE_NODE_OPEN_BUCKET_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_BTREE],
nr[BCH_DATA_USER],
c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {

View File

@ -133,6 +133,7 @@ void bch2_flags_to_text(struct printbuf *out,
const char * const list[], u64 flags)
{
unsigned bit, nr = 0;
bool first = true;
if (out->pos != out->end)
*out->pos = '\0';
@ -141,7 +142,10 @@ void bch2_flags_to_text(struct printbuf *out,
nr++;
while (flags && (bit = __ffs(flags)) < nr) {
pr_buf(out, "%s,", list[bit]);
pr_buf(out, "%s", list[bit]);
if (!first)
pr_buf(out, ",");
first = false;
flags ^= 1 << bit;
}
}
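With first tracked this way the separator is emitted before every name except the first, so the output no longer ends with a stray comma. The same pattern as a standalone userspace sketch (printf-based output and all names are illustrative, not part of this commit):

	#include <stdio.h>

	/* Print the names of the set bits in @flags as "a,b,c" - no trailing comma. */
	static void flags_to_text(const char * const list[], unsigned nr,
				  unsigned long flags)
	{
		unsigned bit;
		int first = 1;

		for (bit = 0; bit < nr; bit++) {
			if (!(flags & (1UL << bit)))
				continue;
			if (!first)
				printf(",");
			printf("%s", list[bit]);
			first = 0;
		}
	}

	/* e.g. flags_to_text((const char * const []){ "ro", "rw", "degraded" }, 3, 0x5)
	 * prints "ro,degraded" */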
@ -894,3 +898,28 @@ void eytzinger0_find_test(void)
kfree(test_array);
}
#endif
/*
* Accumulate percpu counters onto one cpu's copy - only valid when all other
* access to the percpu counters is excluded (i.e. no concurrent updates)
*/
u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
{
u64 *ret;
int cpu;
preempt_disable();
ret = this_cpu_ptr(p);
preempt_enable();
for_each_possible_cpu(cpu) {
u64 *i = per_cpu_ptr(p, cpu);
if (i != ret) {
acc_u64s(ret, i, nr);
memset(i, 0, nr * sizeof(u64));
}
}
return ret;
}
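A standalone sketch of the same accumulate-and-reset idea, using a plain array of per-CPU copies instead of the kernel percpu API (names and sizes are illustrative):

	#include <string.h>

	#define FAKE_NR_CPUS	4

	/* Sum every CPU's copy into copy 0 and zero the others - the caller must
	 * guarantee nothing else touches the counters while this runs. */
	static unsigned long *acc_fake_percpu(unsigned long counters[FAKE_NR_CPUS][8],
					      unsigned nr)
	{
		unsigned long *acc = counters[0];
		unsigned cpu, i;

		for (cpu = 1; cpu < FAKE_NR_CPUS; cpu++) {
			for (i = 0; i < nr; i++)
				acc[i] += counters[cpu][i];
			memset(counters[cpu], 0, nr * sizeof(unsigned long));
		}

		return acc;
	}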

View File

@ -715,4 +715,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
}
u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
#endif /* _BCACHEFS_UTIL_H */

View File

@ -1,4 +1,5 @@
#include <linux/atomic.h>
#include <linux/export.h>
#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>
@ -16,7 +17,7 @@ struct genradix_node {
};
};
static inline unsigned genradix_depth_shift(unsigned depth)
static inline int genradix_depth_shift(unsigned depth)
{
return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
@ -29,16 +30,34 @@ static inline size_t genradix_depth_size(unsigned depth)
return 1UL << genradix_depth_shift(depth);
}
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
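The root word packs both the node pointer and the tree depth: nodes come from page allocations, so their addresses are page aligned and the low bits are free to carry the depth. A standalone sketch of that packing trick (the mask width here is illustrative):

	#include <stdint.h>

	#define DEPTH_MASK	0x7UL	/* low bits spare in a page-aligned pointer */

	static inline void *pack_root(void *node, unsigned depth)
	{
		return (void *) ((uintptr_t) node | (depth & DEPTH_MASK));
	}

	static inline void *unpack_node(void *packed)
	{
		return (void *) ((uintptr_t) packed & ~DEPTH_MASK);
	}

	static inline unsigned unpack_depth(void *packed)
	{
		return (uintptr_t) packed & DEPTH_MASK;
	}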
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
size_t level = radix->depth;
struct genradix_node *n = radix->root;
struct genradix_root *r = READ_ONCE(radix->root);
struct genradix_node *n = genradix_root_to_node(r);
unsigned level = genradix_root_to_depth(r);
if (offset >= genradix_depth_size(radix->depth))
if (ilog2(offset) >= genradix_depth_shift(level))
return NULL;
while (1) {
@ -64,43 +83,60 @@ EXPORT_SYMBOL(__genradix_ptr);
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
gfp_t gfp_mask)
{
struct genradix_node **n;
size_t level;
struct genradix_root *v = READ_ONCE(radix->root);
struct genradix_node *n, *new_node = NULL;
unsigned level;
/* Increase tree depth if necessary: */
while (offset >= genradix_depth_size(radix->depth)) {
struct genradix_node *new_root =
(void *) __get_free_page(gfp_mask|__GFP_ZERO);
if (!new_root)
return NULL;
new_root->children[0] = radix->root;
radix->root = new_root;
radix->depth++;
}
n = &radix->root;
level = radix->depth;
while (1) {
if (!*n) {
*n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
if (!*n)
struct genradix_root *r = v, *new_root;
n = genradix_root_to_node(r);
level = genradix_root_to_depth(r);
if (n && ilog2(offset) < genradix_depth_shift(level))
break;
if (!new_node) {
new_node = (void *)
__get_free_page(gfp_mask|__GFP_ZERO);
if (!new_node)
return NULL;
}
if (!level)
break;
new_node->children[0] = n;
new_root = ((struct genradix_root *)
((unsigned long) new_node | (n ? level + 1 : 0)));
level--;
n = &(*n)->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
v = new_root;
new_node = NULL;
}
}
return &(*n)->data[offset];
while (level--) {
struct genradix_node **p =
&n->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
n = READ_ONCE(*p);
if (!n) {
if (!new_node) {
new_node = (void *)
__get_free_page(gfp_mask|__GFP_ZERO);
if (!new_node)
return NULL;
}
if (!(n = cmpxchg_release(p, NULL, new_node)))
swap(n, new_node);
}
}
if (new_node)
free_page((unsigned long) new_node);
return &n->data[offset];
}
EXPORT_SYMBOL(__genradix_ptr_alloc);
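The allocation path above is lockless: it allocates a candidate node, tries to install it with a release cmpxchg, and frees the candidate if another thread won the race. The same install-or-discard pattern as a standalone C11 sketch (structure layout and names are illustrative, not the genradix code itself):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct node {
		struct node *_Atomic	children[64];
	};

	/* Return *slot, allocating and installing a new node if it was NULL. */
	static struct node *get_or_install(struct node *_Atomic *slot)
	{
		struct node *n = atomic_load(slot);
		struct node *new_node;

		if (n)
			return n;

		new_node = calloc(1, sizeof(*new_node));
		if (!new_node)
			return NULL;

		n = NULL;
		if (atomic_compare_exchange_strong_explicit(slot, &n, new_node,
							    memory_order_release,
							    memory_order_relaxed))
			return new_node;	/* we installed our node */

		free(new_node);			/* somebody else got there first */
		return n;			/* n was updated to the winner's node */
	}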
@ -108,17 +144,19 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
struct __genradix *radix,
size_t objs_per_page)
{
struct genradix_root *r;
struct genradix_node *n;
size_t level, i;
if (!radix->root)
return NULL;
unsigned level, i;
restart:
if (iter->offset >= genradix_depth_size(radix->depth))
r = READ_ONCE(radix->root);
if (!r)
return NULL;
n = radix->root;
level = radix->depth;
n = genradix_root_to_node(r);
level = genradix_root_to_depth(r);
if (ilog2(iter->offset) >= genradix_depth_shift(level))
return NULL;
while (level) {
level--;
@ -157,11 +195,24 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
free_page((unsigned long) n);
}
int __genradix_prealloc(struct __genradix *radix, size_t size,
gfp_t gfp_mask)
{
size_t offset;
for (offset = 0; offset < size; offset += PAGE_SIZE)
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(__genradix_prealloc);
void __genradix_free(struct __genradix *radix)
{
genradix_free_recurse(radix->root, radix->depth);
struct genradix_root *r = xchg(&radix->root, NULL);
radix->root = NULL;
radix->depth = 0;
genradix_free_recurse(genradix_root_to_node(r),
genradix_root_to_depth(r));
}
EXPORT_SYMBOL(__genradix_free);
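Freeing now detaches the whole tree from the root with an atomic exchange before recursing, so the shared root pointer never points into memory that is being torn down. A standalone C11 sketch of that detach-then-free idea (a two-child tree stands in for the real page-sized nodes):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct tnode {
		struct tnode	*child[2];
	};

	static void free_subtree(struct tnode *n)
	{
		if (!n)
			return;
		free_subtree(n->child[0]);
		free_subtree(n->child[1]);
		free(n);
	}

	/* Atomically take ownership of the whole tree, then free it at leisure. */
	static void tree_free(struct tnode *_Atomic *rootp)
	{
		struct tnode *root = atomic_exchange(rootp, NULL);

		free_subtree(root);
	}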